Mirror of https://github.com/paperless-ngx/paperless-ngx.git, synced 2025-12-19 09:31:18 +00:00
Compare commits
12 Commits
| Author | SHA1 | Date |
|---|---|---|
|  | 2c4e34dd0c |  |
|  | cb308fae7b |  |
|  | 3f03d51b24 |  |
|  | 831db6ab87 |  |
|  | 43fdf634f2 |  |
|  | f07a6b4586 |  |
|  | 2fcf484229 |  |
|  | 8bf4241b16 |  |
|  | 56bd966c02 |  |
|  | 416101d557 |  |
|  | c330cca2c9 |  |
|  | 7e88085377 |  |
@@ -5,6 +5,27 @@
 Changelog
 *********
 
+paperless-ng 1.1.3
+##################
+
+* Added a docker-specific configuration option to adjust the number of
+  worker processes of the web server. See :ref:`configuration-docker`.
+
+* Some more memory usage optimizations.
+
+* Don't show inbox statistics if no inbox tag is defined.
+
+.. note::
+
+    Some packages that paperless depends on are slowly dropping Python 3.6
+    support one after another, including the web server. Supporting Python
+    3.6 means that I cannot update these packages anymore.
+
+    At some point, paperless will drop Python 3.6 support. If using a bare
+    metal installation and you're still on Python 3.6, upgrade to 3.7 or newer.
+
+    If using docker, this does not affect you.
+
 paperless-ng 1.1.2
 ##################
 
@@ -555,3 +555,65 @@ PAPERLESS_GS_BINARY=<path>
 
 PAPERLESS_OPTIPNG_BINARY=<path>
     Defaults to "/usr/bin/optipng".
+
+
+.. _configuration-docker:
+
+Docker-specific options
+#######################
+
+These options don't have any effect in ``paperless.conf``; they adjust the
+behavior of the docker container. Configure these in ``docker-compose.env``.
+
+PAPERLESS_WEBSERVER_WORKERS=<num>
+    The number of worker processes the webserver should spawn. More worker
+    processes usually result in the front end loading data much quicker.
+    However, each worker process also loads the entire application into
+    memory separately, so increasing this value will increase RAM usage.
+
+    Consider configuring this to 1 on low-power devices with a limited
+    amount of RAM.
+
+    Defaults to 2.
+
+USERMAP_UID=<uid>
+    The ID of the paperless user in the container. Set this to your actual
+    user ID on the host system, which you can get by executing
+
+    .. code:: shell-session
+
+        $ id -u
+
+    Paperless will change ownership on its folders to this user, so you need
+    to get this right in order to be able to write to the consumption
+    directory.
+
+    Defaults to 1000.
+
+USERMAP_GID=<gid>
+    The ID of the paperless group in the container. Set this to your actual
+    group ID on the host system, which you can get by executing
+
+    .. code:: shell-session
+
+        $ id -g
+
+    Paperless will change ownership on its folders to this group, so you need
+    to get this right in order to be able to write to the consumption
+    directory.
+
+    Defaults to 1000.
+
+PAPERLESS_OCR_LANGUAGES=<list>
+    Additional OCR languages to install. By default, paperless comes with
+    English, German, Italian, Spanish and French. If your language is not in
+    this list, install additional languages with this configuration option:
+
+    .. code:: bash
+
+        PAPERLESS_OCR_LANGUAGES=tur ces
+
+    To actually use these languages, also set the default OCR language of
+    paperless:
+
+    .. code:: bash
+
+        PAPERLESS_OCR_LANGUAGE=tur
+
+    Defaults to none, which does not install any additional languages.
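
For reference, the web server reads ``PAPERLESS_WEBSERVER_WORKERS`` from the
environment at startup; a minimal sketch of that pattern, mirroring the
``gunicorn.conf.py`` change further down in this diff:

.. code:: python

    import os

    # Number of web server worker processes; falls back to the documented
    # default of 2 when PAPERLESS_WEBSERVER_WORKERS is not set.
    workers = int(os.getenv("PAPERLESS_WEBSERVER_WORKERS", 2))
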
@@ -763,7 +763,8 @@ configuring some options in paperless can help improve performance immensely:
 
 * Stick with SQLite to save some resources.
 * Consider setting ``PAPERLESS_OCR_PAGES`` to 1, so that paperless will only OCR
-  the first page of your documents.
+  the first page of your documents. In most cases, this page contains enough
+  information to be able to find it.
 * ``PAPERLESS_TASK_WORKERS`` and ``PAPERLESS_THREADS_PER_WORKER`` are configured
   to use all cores. The Raspberry Pi models 3 and up have 4 cores, meaning that
   paperless will use 2 workers and 2 threads per worker. This may result in

@@ -776,6 +777,8 @@ configuring some options in paperless can help improve performance immensely:
   file generation for already ocr'ed documents entirely.
 * Set ``PAPERLESS_OPTIMIZE_THUMBNAILS`` to 'false' if you want faster consumption
   times. Thumbnails will be about 20% larger.
+* If using docker, consider setting ``PAPERLESS_WEBSERVER_WORKERS`` to
+  1. This will save some memory.
 
 For details, refer to :ref:`configuration`.
 
@@ -1,5 +1,7 @@
+import os
+
 bind = '0.0.0.0:8000'
-workers = 2
+workers = int(os.getenv("PAPERLESS_WEBSERVER_WORKERS", 2))
 worker_class = 'uvicorn.workers.UvicornWorker'
 timeout = 120
@@ -1,6 +1,6 @@
 <app-widget-frame title="Statistics" i18n-title>
   <ng-container content>
-    <p class="card-text" i18n>Documents in inbox: {{statistics.documents_inbox}}</p>
-    <p class="card-text" i18n>Total documents: {{statistics.documents_total}}</p>
+    <p class="card-text" i18n *ngIf="statistics?.documents_inbox != null">Documents in inbox: {{statistics?.documents_inbox}}</p>
+    <p class="card-text" i18n>Total documents: {{statistics?.documents_total}}</p>
   </ng-container>
 </app-widget-frame>
@@ -169,7 +169,12 @@ export class ConsumerStatusService {
   }
 
   dismiss(status: FileStatus) {
-    let index = this.consumerStatus.findIndex(s => s.filename == status.filename)
+    let index
+    if (status.taskId != null) {
+      index = this.consumerStatus.findIndex(s => s.taskId == status.taskId)
+    } else {
+      index = this.consumerStatus.findIndex(s => s.filename == status.filename)
+    }
 
     if (index > -1) {
       this.consumerStatus.splice(index, 1)
@@ -2,7 +2,7 @@ export const environment = {
   production: true,
   apiBaseUrl: "/api/",
   appTitle: "Paperless-ng",
-  version: "1.1.2",
+  version: "1.1.3",
   webSocketHost: window.location.host,
   webSocketProtocol: (window.location.protocol == "https:" ? "wss:" : "ws:")
 };
@@ -1,10 +1,6 @@
 from django.contrib import admin
-from django.utils.html import format_html, format_html_join
-from django.utils.safestring import mark_safe
-from whoosh.writing import AsyncWriter
 
-from . import index
-from .models import Correspondent, Document, DocumentType, Log, Tag, \
+from .models import Correspondent, Document, DocumentType, Tag, \
     SavedView, SavedViewFilterRule
 
 

@@ -86,17 +82,21 @@ class DocumentAdmin(admin.ModelAdmin):
     created_.short_description = "Created"
 
     def delete_queryset(self, request, queryset):
-        ix = index.open_index()
-        with AsyncWriter(ix) as writer:
+        from documents import index
+
+        with index.open_index_writer() as writer:
             for o in queryset:
                 index.remove_document(writer, o)
 
         super(DocumentAdmin, self).delete_queryset(request, queryset)
 
     def delete_model(self, request, obj):
+        from documents import index
         index.remove_document_from_index(obj)
         super(DocumentAdmin, self).delete_model(request, obj)
 
     def save_model(self, request, obj, form, change):
+        from documents import index
         index.add_or_update_document(obj)
         super(DocumentAdmin, self).save_model(request, obj, form, change)
@@ -2,9 +2,7 @@ import itertools
 
 from django.db.models import Q
 from django_q.tasks import async_task
-from whoosh.writing import AsyncWriter
 
-from documents import index
 from documents.models import Document, Correspondent, DocumentType
 
 

@@ -99,8 +97,9 @@ def modify_tags(doc_ids, add_tags, remove_tags):
 def delete(doc_ids):
     Document.objects.filter(id__in=doc_ids).delete()
 
-    ix = index.open_index()
-    with AsyncWriter(ix) as writer:
+    from documents import index
+
+    with index.open_index_writer() as writer:
         for id in doc_ids:
             index.remove_document_by_id(writer, id)
@@ -95,9 +95,6 @@ class DocumentClassifier(object):
             pickle.dump(self.document_type_classifier, f)
 
     def train(self):
-        from sklearn.feature_extraction.text import CountVectorizer
-        from sklearn.neural_network import MLPClassifier
-        from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
 
         data = list()
         labels_tags = list()

@@ -162,6 +159,10 @@ class DocumentClassifier(object):
                 )
             )
 
+        from sklearn.feature_extraction.text import CountVectorizer
+        from sklearn.neural_network import MLPClassifier
+        from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
+
         # Step 2: vectorize data
         logger.debug("Vectorizing data...")
         self.data_vectorizer = CountVectorizer(
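
The classifier change above is one instance of the deferred-import pattern that
recurs throughout this diff (see also the ``ocrmypdf``/``pdftotext``/``pikepdf``
and ``dateparser`` hunks further down): heavy modules are imported inside the
function that needs them instead of at module load time, so processes that never
call that function do not pay the memory cost. A generic sketch of the idea, with
purely illustrative names:

.. code:: python

    def train_classifier(samples):
        # Deferred import: scikit-learn is loaded only when training actually
        # runs, not when this module is merely imported by a web server worker.
        from sklearn.feature_extraction.text import CountVectorizer

        vectorizer = CountVectorizer()
        return vectorizer.fit_transform(samples)
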
@@ -86,6 +86,22 @@ def open_index(recreate=False):
         return create_in(settings.INDEX_DIR, get_schema())
 
 
+@contextmanager
+def open_index_writer(ix=None, optimize=False):
+    if ix:
+        writer = AsyncWriter(ix)
+    else:
+        writer = AsyncWriter(open_index())
+
+    try:
+        yield writer
+    except Exception as e:
+        logger.exception(str(e))
+        writer.cancel()
+    finally:
+        writer.commit(optimize=optimize)
+
+
 def update_document(writer, doc):
     tags = ",".join([t.name for t in doc.tags.all()])
     writer.update_document(

@@ -110,14 +126,12 @@ def remove_document_by_id(writer, doc_id):
 
 
 def add_or_update_document(document):
-    ix = open_index()
-    with AsyncWriter(ix) as writer:
+    with open_index_writer() as writer:
         update_document(writer, document)
 
 
 def remove_document_from_index(document):
-    ix = open_index()
-    with AsyncWriter(ix) as writer:
+    with open_index_writer() as writer:
         remove_document(writer, document)
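
A minimal usage sketch of the new ``open_index_writer`` context manager, matching
how the callers above use it; it assumes a configured paperless/Django environment
and at least one existing document:

.. code:: python

    from documents import index
    from documents.models import Document

    # Any existing document will do for this sketch.
    doc = Document.objects.first()

    # open_index_writer() wraps an AsyncWriter: it commits on success and
    # cancels the write if an exception is raised, so callers no longer
    # manage the index handle themselves.
    with index.open_index_writer() as writer:
        index.update_document(writer, doc)
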
@@ -6,7 +6,6 @@ import shutil
 import subprocess
 import tempfile
 
-import dateparser
 import magic
 from django.conf import settings
 from django.utils import timezone

@@ -200,6 +199,8 @@ def parse_date(filename, text):
         """
         Call dateparser.parse with a particular date ordering
         """
+        import dateparser
+
         return dateparser.parse(
             ds,
             settings={
@@ -11,7 +11,7 @@ from django.dispatch import receiver
 from django.utils import timezone
 from filelock import FileLock
 
-from .. import index, matching
+from .. import matching
 from ..file_handling import delete_empty_directories, \
     create_source_path_directory, \
     generate_unique_filename

@@ -305,4 +305,6 @@ def set_log_entry(sender, document=None, logging_group=None, **kwargs):
 
 
 def add_to_index(sender, document, **kwargs):
+    from documents import index
+
     index.add_or_update_document(document)
@@ -4,6 +4,7 @@ from django.contrib.admin.sites import AdminSite
 from django.test import TestCase
 from django.utils import timezone
 
+from documents import index
 from documents.admin import DocumentAdmin
 from documents.models import Document
 from documents.tests.utils import DirectoriesMixin

@@ -11,37 +12,52 @@ from documents.tests.utils import DirectoriesMixin
 class TestDocumentAdmin(DirectoriesMixin, TestCase):
 
+    def get_document_from_index(self, doc):
+        ix = index.open_index()
+        with ix.searcher() as searcher:
+            return searcher.document(id=doc.id)
+
     def setUp(self) -> None:
         super(TestDocumentAdmin, self).setUp()
         self.doc_admin = DocumentAdmin(model=Document, admin_site=AdminSite())
 
-    @mock.patch("documents.admin.index.add_or_update_document")
-    def test_save_model(self, m):
+    def test_save_model(self):
         doc = Document.objects.create(title="test")
 
         doc.title = "new title"
         self.doc_admin.save_model(None, doc, None, None)
         self.assertEqual(Document.objects.get(id=doc.id).title, "new title")
-        m.assert_called_once()
+        self.assertEqual(self.get_document_from_index(doc)['title'], "new title")
 
-    @mock.patch("documents.admin.index.remove_document")
-    def test_delete_model(self, m):
+    def test_delete_model(self):
         doc = Document.objects.create(title="test")
-        self.doc_admin.delete_model(None, doc)
-        self.assertRaises(Document.DoesNotExist, Document.objects.get, id=doc.id)
-        m.assert_called_once()
+        index.add_or_update_document(doc)
+        self.assertIsNotNone(self.get_document_from_index(doc))
 
-    @mock.patch("documents.admin.index.remove_document")
-    def test_delete_queryset(self, m):
+        self.doc_admin.delete_model(None, doc)
+
+        self.assertRaises(Document.DoesNotExist, Document.objects.get, id=doc.id)
+        self.assertIsNone(self.get_document_from_index(doc))
+
+    def test_delete_queryset(self):
+        docs = []
         for i in range(42):
-            Document.objects.create(title="Many documents with the same title", checksum=f"{i:02}")
+            doc = Document.objects.create(title="Many documents with the same title", checksum=f"{i:02}")
+            docs.append(doc)
+            index.add_or_update_document(doc)
 
         self.assertEqual(Document.objects.count(), 42)
 
+        for doc in docs:
+            self.assertIsNotNone(self.get_document_from_index(doc))
+
         self.doc_admin.delete_queryset(None, Document.objects.all())
 
-        self.assertEqual(m.call_count, 42)
         self.assertEqual(Document.objects.count(), 0)
 
+        for doc in docs:
+            self.assertIsNone(self.get_document_from_index(doc))
+
     def test_created(self):
         doc = Document.objects.create(title="test", created=timezone.datetime(2020, 4, 12))
         self.assertEqual(self.doc_admin.created_(doc), "2020-04-12")
@@ -442,6 +442,13 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
         self.assertEqual(response.data['documents_total'], 3)
         self.assertEqual(response.data['documents_inbox'], 1)
 
+    def test_statistics_no_inbox_tag(self):
+        Document.objects.create(title="none1", checksum="A")
+
+        response = self.client.get("/api/statistics/")
+        self.assertEqual(response.status_code, 200)
+        self.assertEqual(response.data['documents_inbox'], None)
+
     @mock.patch("documents.views.async_task")
     def test_upload(self, m):
 
@@ -32,7 +32,6 @@ from rest_framework.viewsets import (
     ViewSet
 )
 
-import documents.index as index
 from paperless.db import GnuPG
 from paperless.views import StandardPagination
 from .classifier import load_classifier

@@ -176,10 +175,12 @@ class DocumentViewSet(RetrieveModelMixin,
     def update(self, request, *args, **kwargs):
         response = super(DocumentViewSet, self).update(
             request, *args, **kwargs)
+        from documents import index
         index.add_or_update_document(self.get_object())
         return response
 
     def destroy(self, request, *args, **kwargs):
+        from documents import index
         index.remove_document_from_index(self.get_object())
         return super(DocumentViewSet, self).destroy(request, *args, **kwargs)

@@ -501,10 +502,6 @@ class SearchView(APIView):
 
     permission_classes = (IsAuthenticated,)
 
-    def __init__(self, *args, **kwargs):
-        super(SearchView, self).__init__(*args, **kwargs)
-        self.ix = index.open_index()
-
     def add_infos_to_hit(self, r):
         try:
             doc = Document.objects.get(id=r['id'])

@@ -525,6 +522,7 @@
         }
 
     def get(self, request, format=None):
+        from documents import index
         if 'query' in request.query_params:
             query = request.query_params['query']

@@ -554,8 +552,10 @@
         if page < 1:
             page = 1
 
+        ix = index.open_index()
+
         try:
-            with index.query_page(self.ix, page, query, more_like_id, more_like_content) as (result_page, corrected_query):  # NOQA: E501
+            with index.query_page(ix, page, query, more_like_id, more_like_content) as (result_page, corrected_query):  # NOQA: E501
                 return Response(
                     {'count': len(result_page),
                      'page': result_page.pagenum,

@@ -570,10 +570,6 @@ class SearchAutoCompleteView(APIView):
 
     permission_classes = (IsAuthenticated,)
 
-    def __init__(self, *args, **kwargs):
-        super(SearchAutoCompleteView, self).__init__(*args, **kwargs)
-        self.ix = index.open_index()
-
     def get(self, request, format=None):
         if 'term' in request.query_params:
             term = request.query_params['term']

@@ -587,7 +583,11 @@
         else:
             limit = 10
 
-        return Response(index.autocomplete(self.ix, term, limit))
+        from documents import index
+
+        ix = index.open_index()
+
+        return Response(index.autocomplete(ix, term, limit))
 
 
 class StatisticsView(APIView):

@@ -595,8 +595,14 @@
     permission_classes = (IsAuthenticated,)
 
     def get(self, request, format=None):
-        return Response({
-            'documents_total': Document.objects.all().count(),
-            'documents_inbox': Document.objects.filter(
+        documents_total = Document.objects.all().count()
+        if Tag.objects.filter(is_inbox_tag=True).exists():
+            documents_inbox = Document.objects.filter(
                 tags__is_inbox_tag=True).distinct().count()
+        else:
+            documents_inbox = None
+
+        return Response({
+            'documents_total': documents_total,
+            'documents_inbox': documents_inbox,
         })
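
With the ``StatisticsView`` change above, ``/api/statistics/`` reports ``None``
for the inbox count when no inbox tag is defined, which is what the new
``test_statistics_no_inbox_tag`` test earlier in this diff checks. A response
payload would then look roughly like the following; the total is an
illustrative value, not a fixed number:

.. code:: python

    # Illustrative /api/statistics/ payload when no inbox tag exists.
    example_response = {
        "documents_total": 3,
        "documents_inbox": None,
    }
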
@@ -1 +1 @@
-__version__ = (1, 1, 2)
+__version__ = (1, 1, 3)
@@ -2,12 +2,8 @@ import json
 import os
 import re
 
-import ocrmypdf
-import pdftotext
-import pikepdf
 from PIL import Image
 from django.conf import settings
-from ocrmypdf import InputFileError, EncryptedPdfError
 
 from documents.parsers import DocumentParser, ParseError, \
     make_thumbnail_from_pdf

@@ -22,6 +18,8 @@ class RasterisedDocumentParser(DocumentParser):
     logging_name = "paperless.parsing.tesseract"
 
     def extract_metadata(self, document_path, mime_type):
+        import pikepdf
+
         namespace_pattern = re.compile(r"\{(.*)\}(.*)")
 
         result = []

@@ -91,6 +89,9 @@
         return None
 
     def parse(self, document_path, mime_type, file_name=None):
+        import ocrmypdf
+        from ocrmypdf import InputFileError, EncryptedPdfError
+
         mode = settings.OCR_MODE
 
         text_original = get_text_from_pdf(document_path)

@@ -223,6 +224,7 @@
 
 
 def get_text_from_pdf(pdf_file):
+    import pdftotext
 
     if not os.path.isfile(pdf_file):
         return None
@@ -164,17 +164,12 @@ class TestParser(DirectoriesMixin, TestCase):
 
         self.assertRaises(ParseError, f)
 
-    @mock.patch("paperless_tesseract.parsers.ocrmypdf.ocr")
-    def test_image_calc_a4_dpi(self, m):
+    def test_image_calc_a4_dpi(self):
         parser = RasterisedDocumentParser(None)
 
-        parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png")
+        dpi = parser.calculate_a4_dpi(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"))
 
-        m.assert_called_once()
-
-        args, kwargs = m.call_args
-
-        self.assertEqual(kwargs['image_dpi'], 62)
+        self.assertEqual(dpi, 62)
 
     @mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser.calculate_a4_dpi")
     def test_image_dpi_fail(self, m):