Compare commits


12 Commits

Author SHA1 Message Date
jonaswinkler 2c4e34dd0c changelog 2021-02-15 23:44:48 +01:00
jonaswinkler cb308fae7b only show inbox statistics if inbox tags are defined 2021-02-15 23:14:54 +01:00
jonaswinkler 3f03d51b24 version bump 2021-02-15 16:52:45 +01:00
jonaswinkler 831db6ab87 note regarding Python 3.6 2021-02-15 16:46:06 +01:00
jonaswinkler 43fdf634f2 added a note regarding python 3.6 2021-02-15 16:37:44 +01:00
jonaswinkler f07a6b4586 PAPERLESS_WEBSERVER_WORKERS option 2021-02-15 16:27:35 +01:00
jonaswinkler 2fcf484229 bugfix dismissing wrong status messages 2021-02-15 14:52:47 +01:00
jonaswinkler 8bf4241b16 some search index optimizations 2021-02-15 13:26:36 +01:00
jonaswinkler 56bd966c02 local import of ocrmypdf so that the webserver does not load that 2021-02-15 12:18:10 +01:00
jonaswinkler 416101d557 only import dateparser when required 2021-02-15 11:52:46 +01:00
jonaswinkler c330cca2c9 remove unused imports 2021-02-15 11:26:13 +01:00
jonaswinkler 7e88085377 load sklearn modules only when training data has changed 2021-02-15 11:25:25 +01:00
19 changed files with 201 additions and 65 deletions

View File

@@ -5,6 +5,27 @@
Changelog
*********
paperless-ng 1.1.3
##################
* Added a docker-specific configuration option to adjust the number of
worker processes of the web server. See :ref:`configuration-docker`.
* Some more memory usage optimizations.
* Don't show inbox statistics if no inbox tag is defined.
.. note::
Some packages that paperless depends on are slowly dropping Python 3.6
support one after another, including the web server. Supporting Python
3.6 means that I cannot update these packages anymore.
At some point, paperless will drop Python 3.6 support. If you're running a bare
metal installation and are still on Python 3.6, upgrade to Python 3.7 or newer.
If you're using docker, this does not affect you.
paperless-ng 1.1.2
##################

View File

@@ -555,3 +555,65 @@ PAPERLESS_GS_BINARY=<path>
PAPERLESS_OPTIPNG_BINARY=<path>
Defaults to "/usr/bin/optipng".
.. _configuration-docker:
Docker-specific options
#######################
These options don't have any effect in ``paperless.conf``. They adjust the
behavior of the docker container and are configured in ``docker-compose.env``.
PAPERLESS_WEBSERVER_WORKERS=<num>
The number of worker processes the webserver should spawn. More worker processes
usually allow the front end to load data much more quickly. However, each worker process
also loads the entire application into memory separately, so increasing this value
will increase RAM usage.
Consider setting this to 1 on low-power devices with a limited amount of RAM.
Defaults to 2.
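For example, on a low-power device you might set this to 1 in ``docker-compose.env``:

.. code:: bash

    PAPERLESS_WEBSERVER_WORKERS=1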
USERMAP_UID=<uid>
The ID of the paperless user in the container. Set this to your actual user ID on the
host system, which you can get by executing
.. code:: shell-session
$ id -u
Paperless will change ownership of its folders to this user, so you need to get this right
in order to be able to write to the consumption directory.
Defaults to 1000.
USERMAP_GID=<gid>
The ID of the paperless group in the container. Set this to your actual group ID on the
host system, which you can get by executing
.. code:: shell-session
$ id -g
Paperless will change ownership of its folders to this group, so you need to get this right
in order to be able to write to the consumption directory.
Defaults to 1000.
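As an illustration, if ``id -u`` and ``id -g`` both print ``1000`` on your host (the defaults above), the corresponding lines in ``docker-compose.env`` would be:

.. code:: bash

    USERMAP_UID=1000
    USERMAP_GID=1000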
PAPERLESS_OCR_LANGUAGES=<list>
Additional OCR languages to install. By default, paperless comes with
English, German, Italian, Spanish and French. If your language is not in this list, install
additional languages with this configuration option:
.. code:: bash
PAPERLESS_OCR_LANGUAGES=tur ces
To actually use these languages, also set the default OCR language of paperless:
.. code:: bash
PAPERLESS_OCR_LANGUAGE=tur
Defaults to none, which does not install any additional languages.

View File

@@ -763,7 +763,8 @@ configuring some options in paperless can help improve performance immensely:
* Stick with SQLite to save some resources.
* Consider setting ``PAPERLESS_OCR_PAGES`` to 1, so that paperless will only OCR
the first page of your documents.
the first page of your documents. In most cases, this page contains enough
information to find the document again.
* ``PAPERLESS_TASK_WORKERS`` and ``PAPERLESS_THREADS_PER_WORKER`` are configured
to use all cores. The Raspberry Pi models 3 and up have 4 cores, meaning that
paperless will use 2 workers and 2 threads per worker. This may result in
@@ -776,6 +777,8 @@ configuring some options in paperless can help improve performance immensely:
file generation for already ocr'ed documents entirely.
* Set ``PAPERLESS_OPTIMIZE_THUMBNAILS`` to 'false' if you want faster consumption
times. Thumbnails will be about 20% larger.
* If using docker, consider setting ``PAPERLESS_WEBSERVER_WORKERS`` to
1. This will save some memory.
For details, refer to :ref:`configuration`.
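Putting the suggestions above together, a low-power setup might use something like the following sketch in ``docker-compose.env`` (or ``paperless.conf`` on bare metal). The values are examples only; in particular, the worker and thread counts are assumptions for illustration, not values taken from the documentation:

.. code:: bash

    PAPERLESS_OCR_PAGES=1
    PAPERLESS_TASK_WORKERS=1
    PAPERLESS_THREADS_PER_WORKER=1
    PAPERLESS_OPTIMIZE_THUMBNAILS=false
    PAPERLESS_WEBSERVER_WORKERS=1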

View File

@@ -1,5 +1,7 @@
import os
bind = '0.0.0.0:8000'
workers = 2
workers = int(os.getenv("PAPERLESS_WEBSERVER_WORKERS", 2))
worker_class = 'uvicorn.workers.UvicornWorker'
timeout = 120

View File

@@ -1,6 +1,6 @@
<app-widget-frame title="Statistics" i18n-title>
<ng-container content>
<p class="card-text" i18n>Documents in inbox: {{statistics.documents_inbox}}</p>
<p class="card-text" i18n>Total documents: {{statistics.documents_total}}</p>
<p class="card-text" i18n *ngIf="statistics?.documents_inbox != null">Documents in inbox: {{statistics?.documents_inbox}}</p>
<p class="card-text" i18n>Total documents: {{statistics?.documents_total}}</p>
</ng-container>
</app-widget-frame>

View File

@@ -169,7 +169,12 @@ export class ConsumerStatusService {
}
dismiss(status: FileStatus) {
let index = this.consumerStatus.findIndex(s => s.filename == status.filename)
let index
if (status.taskId != null) {
index = this.consumerStatus.findIndex(s => s.taskId == status.taskId)
} else {
index = this.consumerStatus.findIndex(s => s.filename == status.filename)
}
if (index > -1) {
this.consumerStatus.splice(index, 1)

View File

@@ -2,7 +2,7 @@ export const environment = {
production: true,
apiBaseUrl: "/api/",
appTitle: "Paperless-ng",
version: "1.1.2",
version: "1.1.3",
webSocketHost: window.location.host,
webSocketProtocol: (window.location.protocol == "https:" ? "wss:" : "ws:")
};

View File

@@ -1,10 +1,6 @@
from django.contrib import admin
from django.utils.html import format_html, format_html_join
from django.utils.safestring import mark_safe
from whoosh.writing import AsyncWriter
from . import index
from .models import Correspondent, Document, DocumentType, Log, Tag, \
from .models import Correspondent, Document, DocumentType, Tag, \
SavedView, SavedViewFilterRule
@@ -86,17 +82,21 @@ class DocumentAdmin(admin.ModelAdmin):
created_.short_description = "Created"
def delete_queryset(self, request, queryset):
ix = index.open_index()
with AsyncWriter(ix) as writer:
from documents import index
with index.open_index_writer() as writer:
for o in queryset:
index.remove_document(writer, o)
super(DocumentAdmin, self).delete_queryset(request, queryset)
def delete_model(self, request, obj):
from documents import index
index.remove_document_from_index(obj)
super(DocumentAdmin, self).delete_model(request, obj)
def save_model(self, request, obj, form, change):
from documents import index
index.add_or_update_document(obj)
super(DocumentAdmin, self).save_model(request, obj, form, change)

View File

@@ -2,9 +2,7 @@ import itertools
from django.db.models import Q
from django_q.tasks import async_task
from whoosh.writing import AsyncWriter
from documents import index
from documents.models import Document, Correspondent, DocumentType
@@ -99,8 +97,9 @@ def modify_tags(doc_ids, add_tags, remove_tags):
def delete(doc_ids):
Document.objects.filter(id__in=doc_ids).delete()
ix = index.open_index()
with AsyncWriter(ix) as writer:
from documents import index
with index.open_index_writer() as writer:
for id in doc_ids:
index.remove_document_by_id(writer, id)

View File

@@ -95,9 +95,6 @@ class DocumentClassifier(object):
pickle.dump(self.document_type_classifier, f)
def train(self):
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
data = list()
labels_tags = list()
@@ -162,6 +159,10 @@ class DocumentClassifier(object):
)
)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
# Step 2: vectorize data
logger.debug("Vectorizing data...")
self.data_vectorizer = CountVectorizer(

View File

@@ -86,6 +86,22 @@ def open_index(recreate=False):
return create_in(settings.INDEX_DIR, get_schema())
@contextmanager
def open_index_writer(ix=None, optimize=False):
if ix:
writer = AsyncWriter(ix)
else:
writer = AsyncWriter(open_index())
try:
yield writer
except Exception as e:
logger.exception(str(e))
writer.cancel()
finally:
writer.commit(optimize=optimize)
def update_document(writer, doc):
tags = ",".join([t.name for t in doc.tags.all()])
writer.update_document(
@@ -110,14 +126,12 @@ def remove_document_by_id(writer, doc_id):
def add_or_update_document(document):
ix = open_index()
with AsyncWriter(ix) as writer:
with open_index_writer() as writer:
update_document(writer, document)
def remove_document_from_index(document):
ix = open_index()
with AsyncWriter(ix) as writer:
with open_index_writer() as writer:
remove_document(writer, document)
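For context, a caller of the new helper might look like this minimal sketch, assuming documents.index exposes open_index_writer and update_document as shown above; the hypothetical reindex_documents function is only illustrative. The helper commits the writer when the block exits and cancels it first if an exception was raised.

    from documents import index

    def reindex_documents(documents):
        # One AsyncWriter is opened for the whole batch via the new context
        # manager; commit/cancel handling happens inside open_index_writer.
        with index.open_index_writer() as writer:
            for doc in documents:
                index.update_document(writer, doc)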

View File

@@ -6,7 +6,6 @@ import shutil
import subprocess
import tempfile
import dateparser
import magic
from django.conf import settings
from django.utils import timezone
@@ -200,6 +199,8 @@ def parse_date(filename, text):
"""
Call dateparser.parse with a particular date ordering
"""
import dateparser
return dateparser.parse(
ds,
settings={

View File

@@ -11,7 +11,7 @@ from django.dispatch import receiver
from django.utils import timezone
from filelock import FileLock
from .. import index, matching
from .. import matching
from ..file_handling import delete_empty_directories, \
create_source_path_directory, \
generate_unique_filename
@@ -305,4 +305,6 @@ def set_log_entry(sender, document=None, logging_group=None, **kwargs):
def add_to_index(sender, document, **kwargs):
from documents import index
index.add_or_update_document(document)

View File

@@ -4,6 +4,7 @@ from django.contrib.admin.sites import AdminSite
from django.test import TestCase
from django.utils import timezone
from documents import index
from documents.admin import DocumentAdmin
from documents.models import Document
from documents.tests.utils import DirectoriesMixin
@@ -11,37 +12,52 @@ from documents.tests.utils import DirectoriesMixin
class TestDocumentAdmin(DirectoriesMixin, TestCase):
def get_document_from_index(self, doc):
ix = index.open_index()
with ix.searcher() as searcher:
return searcher.document(id=doc.id)
def setUp(self) -> None:
super(TestDocumentAdmin, self).setUp()
self.doc_admin = DocumentAdmin(model=Document, admin_site=AdminSite())
@mock.patch("documents.admin.index.add_or_update_document")
def test_save_model(self, m):
def test_save_model(self):
doc = Document.objects.create(title="test")
doc.title = "new title"
self.doc_admin.save_model(None, doc, None, None)
self.assertEqual(Document.objects.get(id=doc.id).title, "new title")
m.assert_called_once()
self.assertEqual(self.get_document_from_index(doc)['title'], "new title")
@mock.patch("documents.admin.index.remove_document")
def test_delete_model(self, m):
def test_delete_model(self):
doc = Document.objects.create(title="test")
self.doc_admin.delete_model(None, doc)
self.assertRaises(Document.DoesNotExist, Document.objects.get, id=doc.id)
m.assert_called_once()
index.add_or_update_document(doc)
self.assertIsNotNone(self.get_document_from_index(doc))
@mock.patch("documents.admin.index.remove_document")
def test_delete_queryset(self, m):
self.doc_admin.delete_model(None, doc)
self.assertRaises(Document.DoesNotExist, Document.objects.get, id=doc.id)
self.assertIsNone(self.get_document_from_index(doc))
def test_delete_queryset(self):
docs = []
for i in range(42):
Document.objects.create(title="Many documents with the same title", checksum=f"{i:02}")
doc = Document.objects.create(title="Many documents with the same title", checksum=f"{i:02}")
docs.append(doc)
index.add_or_update_document(doc)
self.assertEqual(Document.objects.count(), 42)
for doc in docs:
self.assertIsNotNone(self.get_document_from_index(doc))
self.doc_admin.delete_queryset(None, Document.objects.all())
self.assertEqual(m.call_count, 42)
self.assertEqual(Document.objects.count(), 0)
for doc in docs:
self.assertIsNone(self.get_document_from_index(doc))
def test_created(self):
doc = Document.objects.create(title="test", created=timezone.datetime(2020, 4, 12))
self.assertEqual(self.doc_admin.created_(doc), "2020-04-12")

View File

@@ -442,6 +442,13 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertEqual(response.data['documents_total'], 3)
self.assertEqual(response.data['documents_inbox'], 1)
def test_statistics_no_inbox_tag(self):
Document.objects.create(title="none1", checksum="A")
response = self.client.get("/api/statistics/")
self.assertEqual(response.status_code, 200)
self.assertEqual(response.data['documents_inbox'], None)
@mock.patch("documents.views.async_task")
def test_upload(self, m):

View File

@@ -32,7 +32,6 @@ from rest_framework.viewsets import (
ViewSet
)
import documents.index as index
from paperless.db import GnuPG
from paperless.views import StandardPagination
from .classifier import load_classifier
@@ -176,10 +175,12 @@ class DocumentViewSet(RetrieveModelMixin,
def update(self, request, *args, **kwargs):
response = super(DocumentViewSet, self).update(
request, *args, **kwargs)
from documents import index
index.add_or_update_document(self.get_object())
return response
def destroy(self, request, *args, **kwargs):
from documents import index
index.remove_document_from_index(self.get_object())
return super(DocumentViewSet, self).destroy(request, *args, **kwargs)
@@ -501,10 +502,6 @@ class SearchView(APIView):
permission_classes = (IsAuthenticated,)
def __init__(self, *args, **kwargs):
super(SearchView, self).__init__(*args, **kwargs)
self.ix = index.open_index()
def add_infos_to_hit(self, r):
try:
doc = Document.objects.get(id=r['id'])
@@ -525,6 +522,7 @@ class SearchView(APIView):
}
def get(self, request, format=None):
from documents import index
if 'query' in request.query_params:
query = request.query_params['query']
@@ -554,8 +552,10 @@ class SearchView(APIView):
if page < 1:
page = 1
ix = index.open_index()
try:
with index.query_page(self.ix, page, query, more_like_id, more_like_content) as (result_page, corrected_query): # NOQA: E501
with index.query_page(ix, page, query, more_like_id, more_like_content) as (result_page, corrected_query): # NOQA: E501
return Response(
{'count': len(result_page),
'page': result_page.pagenum,
@@ -570,10 +570,6 @@ class SearchAutoCompleteView(APIView):
permission_classes = (IsAuthenticated,)
def __init__(self, *args, **kwargs):
super(SearchAutoCompleteView, self).__init__(*args, **kwargs)
self.ix = index.open_index()
def get(self, request, format=None):
if 'term' in request.query_params:
term = request.query_params['term']
@@ -587,7 +583,11 @@ class SearchAutoCompleteView(APIView):
else:
limit = 10
return Response(index.autocomplete(self.ix, term, limit))
from documents import index
ix = index.open_index()
return Response(index.autocomplete(ix, term, limit))
class StatisticsView(APIView):
@@ -595,8 +595,14 @@ class StatisticsView(APIView):
permission_classes = (IsAuthenticated,)
def get(self, request, format=None):
return Response({
'documents_total': Document.objects.all().count(),
'documents_inbox': Document.objects.filter(
documents_total = Document.objects.all().count()
if Tag.objects.filter(is_inbox_tag=True).exists():
documents_inbox = Document.objects.filter(
tags__is_inbox_tag=True).distinct().count()
else:
documents_inbox = None
return Response({
'documents_total': documents_total,
'documents_inbox': documents_inbox,
})
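Based on the view above, a response from /api/statistics/ when no inbox tag is defined might look like this (illustrative values, matching the test added earlier):

    {
        "documents_total": 3,
        "documents_inbox": null
    }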

View File

@@ -1 +1 @@
__version__ = (1, 1, 2)
__version__ = (1, 1, 3)

View File

@@ -2,12 +2,8 @@ import json
import os
import re
import ocrmypdf
import pdftotext
import pikepdf
from PIL import Image
from django.conf import settings
from ocrmypdf import InputFileError, EncryptedPdfError
from documents.parsers import DocumentParser, ParseError, \
make_thumbnail_from_pdf
@@ -22,6 +18,8 @@ class RasterisedDocumentParser(DocumentParser):
logging_name = "paperless.parsing.tesseract"
def extract_metadata(self, document_path, mime_type):
import pikepdf
namespace_pattern = re.compile(r"\{(.*)\}(.*)")
result = []
@@ -91,6 +89,9 @@ class RasterisedDocumentParser(DocumentParser):
return None
def parse(self, document_path, mime_type, file_name=None):
import ocrmypdf
from ocrmypdf import InputFileError, EncryptedPdfError
mode = settings.OCR_MODE
text_original = get_text_from_pdf(document_path)
@@ -223,6 +224,7 @@ def strip_excess_whitespace(text):
def get_text_from_pdf(pdf_file):
import pdftotext
if not os.path.isfile(pdf_file):
return None

View File

@@ -164,17 +164,12 @@ class TestParser(DirectoriesMixin, TestCase):
self.assertRaises(ParseError, f)
@mock.patch("paperless_tesseract.parsers.ocrmypdf.ocr")
def test_image_calc_a4_dpi(self, m):
def test_image_calc_a4_dpi(self):
parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png")
dpi = parser.calculate_a4_dpi(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"))
m.assert_called_once()
args, kwargs = m.call_args
self.assertEqual(kwargs['image_dpi'], 62)
self.assertEqual(dpi, 62)
@mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser.calculate_a4_dpi")
def test_image_dpi_fail(self, m):