Skip to content
This repository was archived by the owner on Feb 16, 2023. It is now read-only.

Commit 1e131f0

Browse files
committed
added a very crude and largely untested API endpoint for document merging #335
1 parent 321869e commit 1e131f0

File tree

4 files changed

+155
-62
lines changed

4 files changed

+155
-62
lines changed

src/documents/merge.py

+108-37
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,10 @@
22
import tempfile
33

44
from django.conf import settings
5+
from django_q.tasks import async_task
56

7+
from documents import tasks
8+
from documents.consumer import ConsumerError
69
from documents.models import Document
710

811
from pikepdf import Pdf
@@ -12,55 +15,123 @@ class MergeError(Exception):
1215
pass
1316

1417

class PdfCache:
    """Keeps pikepdf.Pdf handles open, indexed by document primary key.

    A split/merge plan may reference the same document several times;
    caching the opened Pdf avoids re-opening the file for each reference.
    Call close_all() when done to release every handle.
    """

    def __init__(self):
        self.cache = {}

    def open_from_document(self, document: Document):
        """Return an open Pdf for *document*, opening and caching it on first use.

        Raises MergeError when the document has no usable PDF
        representation or the file is missing on disk.
        """
        cached = self.cache.get(document.pk)
        if cached is not None:
            return cached

        # Prefer the original file when it is already a PDF; otherwise fall
        # back to the archived PDF version, if one exists.
        if document.mime_type == 'application/pdf':
            filename = document.source_path
        elif document.has_archive_version:
            filename = document.archive_path
        else:
            raise MergeError()

        if not os.path.exists(filename):
            raise MergeError()

        opened = Pdf.open(filename)
        self.cache[document.pk] = opened
        return opened

    def close_all(self):
        """Close every cached Pdf handle and empty the cache."""
        for pdf in self.cache.values():
            pdf.close()
        self.cache.clear()
def consume_many_files(kwargs_list, delete_document_ids=None):
    """Consume a batch of files, all-or-nothing.

    Runs tasks.consume_file for each kwargs dict in *kwargs_list*. If any
    consumption fails with ConsumerError, every document created so far in
    this batch is deleted and the error is re-raised. On full success, the
    documents listed in *delete_document_ids* (the merge sources) are
    deleted, if given.
    """
    created_ids = []

    try:
        for consume_kwargs in kwargs_list:
            created_ids.append(tasks.consume_file(**consume_kwargs))
    except ConsumerError:
        # Roll back: remove all documents this batch already created.
        for created_id in created_ids:
            Document.objects.get(id=created_id).delete()
        raise
    else:
        # Success path: optionally remove the source documents.
        if delete_document_ids:
            for source_id in delete_document_ids:
                Document.objects.get(id=source_id).delete()
def execute_split_merge_plan(plan, tempdir: str, metadata: str = "redo", delete_source: bool = False, preview: bool = True):
    """Build merged PDFs in *tempdir* according to *plan* and optionally consume them.

    *plan* is a list of target document specs; each spec is a list of
    dicts with a 'document' id and an optional 'pages' list (0-based).
    When *metadata* is "copy_first", title/correspondent/type/tags are
    copied from the first source document of each target. Unless
    *preview* is true, consumption (and, with *delete_source*, deletion
    of the sources) is dispatched asynchronously via consume_many_files.

    Returns the basenames of the generated PDF files.

    Raises MergeError for unknown documents, non-PDF sources, or
    out-of-range page numbers.
    """
    consume_tasks = []
    cache = PdfCache()
    source_documents = set()

    try:
        for target_document_spec in plan:
            # Create a new PDF from the documents in target_document_spec.
            target_pdf: Pdf = Pdf.new()
            version = target_pdf.pdf_version

            for source_document_spec in target_document_spec:
                source_document_id = source_document_spec['document']
                source_documents.add(source_document_id)

                pages = source_document_spec.get('pages')

                try:
                    source_document: Document = Document.objects.get(id=source_document_id)
                except Document.DoesNotExist:
                    raise MergeError()

                source_pdf: Pdf = cache.open_from_document(source_document)
                # The output must be saved with a PDF version no lower than
                # any of its sources.
                version = max(version, source_pdf.pdf_version)

                if pages is not None:
                    for page in pages:
                        # Reject negative indices too: Python's negative
                        # indexing would otherwise silently pick pages from
                        # the end instead of failing.
                        if page < 0 or page >= len(source_pdf.pages):
                            raise MergeError()
                        target_pdf.pages.append(source_pdf.pages[page])
                else:
                    target_pdf.pages.extend(source_pdf.pages)

            # delete=False keeps the file on disk: grabbing .name from an
            # auto-deleting NamedTemporaryFile is racy (the file vanishes
            # as soon as the handle is collected).
            with tempfile.NamedTemporaryFile(suffix="_pdf", dir=tempdir, delete=False) as f:
                target_pdf_filename = f.name
            target_pdf.remove_unreferenced_resources()
            target_pdf.save(target_pdf_filename, min_version=version)
            target_pdf.close()

            consume_task = {"path": target_pdf_filename}

            first_id = target_document_spec[0]["document"]
            first_doc: Document = Document.objects.get(id=first_id)

            consume_task["override_title"] = first_doc.title

            if metadata == "copy_first":
                if first_doc.correspondent:
                    consume_task["override_correspondent_id"] = first_doc.correspondent.id
                if first_doc.document_type:
                    # Fix: was first_doc.document_type.hidden, which stored the
                    # wrong value; the override expects the type's id.
                    consume_task["override_document_type_id"] = first_doc.document_type.id
                if first_doc.tags.count() > 0:
                    # Fix: a related manager is not iterable; .all() is required.
                    consume_task["override_tag_ids"] = [tag.id for tag in first_doc.tags.all()]

            consume_tasks.append(consume_task)
    finally:
        # Always release the cached Pdf handles, even when the plan fails.
        cache.close_all()

    if not preview:
        async_task(
            "documents.merge.consume_many_files",
            kwargs_list=consume_tasks,
            delete_document_ids=list(source_documents) if delete_source else None
        )

    return [os.path.basename(t["path"]) for t in consume_tasks]

src/documents/tasks.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -82,9 +82,7 @@ def consume_file(path,
8282
)
8383

8484
if document:
85-
return "Success. New document id {} created".format(
86-
document.pk
87-
)
85+
return document.pk
8886
else:
8987
raise ConsumerError("Unknown error: Returned document was null, but "
9088
"no error message was given.")

src/documents/views.py

+38-15
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from datetime import datetime
77
from time import mktime
88

9+
import pathvalidate
910
from django.conf import settings
1011
from django.db.models import Count, Max, Case, When, IntegerField
1112
from django.db.models.functions import Lower
@@ -18,6 +19,7 @@
1819
from rest_framework import parsers
1920
from rest_framework.decorators import action
2021
from rest_framework.filters import OrderingFilter, SearchFilter
22+
from rest_framework.generics import GenericAPIView
2123
from rest_framework.mixins import (
2224
DestroyModelMixin,
2325
ListModelMixin,
@@ -673,28 +675,49 @@ def post(self, request, format=None):
673675
return response
674676

675677

class DocumentSplitMergeViewSet(GenericViewSet):
    """API endpoint for previewing and executing document split/merge plans.

    list     -- names of preview PDFs currently in the scratch directory.
    retrieve -- stream one preview PDF by filename.
    create   -- validate and execute a split/merge plan.
    """

    permission_classes = (IsAuthenticated,)
    serializer_class = DocumentSplitMergePlanSerializer

    def __init__(self, **kwargs):
        super(DocumentSplitMergeViewSet, self).__init__(**kwargs)
        # Dedicated scratch subdirectory for generated preview PDFs.
        self.tempdir = os.path.join(settings.SCRATCH_DIR, "paperless-split-merge")
        os.makedirs(self.tempdir, exist_ok=True)

    def get_queryset(self):
        return os.listdir(self.tempdir)

    def retrieve(self, request, pk, *args, **kwargs):
        # Sanitize pk so a crafted name cannot escape the scratch directory.
        safe_name = pathvalidate.sanitize_filename(pk)
        filename = os.path.join(self.tempdir, safe_name)
        if not os.path.isfile(filename):
            raise Http404()

        with open(filename, "rb") as f:
            return HttpResponse(f, content_type="application/pdf")

    def list(self, request, *args, **kwargs):
        return Response(self.get_queryset())

    def create(self, request, *args, **kwargs):
        serializer = self.get_serializer(data=request.data)
        serializer.is_valid(raise_exception=True)

        # Imported locally to avoid an import cycle with documents.merge.
        from .merge import execute_split_merge_plan, MergeError

        validated = serializer.validated_data
        try:
            pdf_files = execute_split_merge_plan(
                plan=validated.get("split_merge_plan"),
                preview=validated.get("preview"),
                delete_source=validated.get("delete_source"),
                metadata=validated.get("metadata"),
                tempdir=self.tempdir
            )
        except MergeError:
            # NOTE(review): currently propagated as a server error; consider
            # translating to an HTTP 400 response.
            raise

        return Response(pdf_files)

src/paperless/urls.py

+8-7
Original file line numberDiff line numberDiff line change
@@ -25,18 +25,23 @@
2525
BulkEditView,
2626
SelectionDataView,
2727
BulkDownloadView,
28-
DocumentMergeView
28+
DocumentSplitMergeViewSet
2929
)
3030
from paperless.views import FaviconView
3131

3232
api_router = DefaultRouter()
33+
34+
api_router.register(r"documents", DocumentViewSet)
3335
api_router.register(r"correspondents", CorrespondentViewSet)
3436
api_router.register(r"document_types", DocumentTypeViewSet)
35-
api_router.register(r"documents", DocumentViewSet)
36-
api_router.register(r"logs", LogViewSet, basename="logs")
3737
api_router.register(r"tags", TagViewSet)
38+
39+
api_router.register(r"logs", LogViewSet, basename="logs")
40+
3841
api_router.register(r"saved_views", SavedViewViewSet)
3942

43+
api_router.register(r"split_merge", DocumentSplitMergeViewSet, basename="split_merge")
44+
4045

4146
urlpatterns = [
4247
re_path(r"^api/", include([
@@ -68,10 +73,6 @@
6873
re_path(r"^documents/bulk_download/", BulkDownloadView.as_view(),
6974
name="bulk_download"),
7075

71-
re_path(r"^documents/merge/",
72-
DocumentMergeView.as_view(),
73-
name="merge"),
74-
7576
path('token/', views.obtain_auth_token)
7677

7778
] + api_router.urls)),

0 commit comments

Comments
 (0)