Compare commits


12 Commits

Author SHA1 Message Date
jonaswinkler 2c4e34dd0c changelog 2021-02-15 23:44:48 +01:00
jonaswinkler cb308fae7b only show inbox statistics if inbox tags are defined 2021-02-15 23:14:54 +01:00
jonaswinkler 3f03d51b24 version bump 2021-02-15 16:52:45 +01:00
jonaswinkler 831db6ab87 note regarding Python 3.6 2021-02-15 16:46:06 +01:00
jonaswinkler 43fdf634f2 added a note regarding python 3.6 2021-02-15 16:37:44 +01:00
jonaswinkler f07a6b4586 PAPERLESS_WEBSERVER_WORKERS option 2021-02-15 16:27:35 +01:00
jonaswinkler 2fcf484229 bugfix dismissing wrong status messages 2021-02-15 14:52:47 +01:00
jonaswinkler 8bf4241b16 some search index optimizations 2021-02-15 13:26:36 +01:00
jonaswinkler 56bd966c02 local import of ocrmypdf so that the webserver does not load that 2021-02-15 12:18:10 +01:00
jonaswinkler 416101d557 only import dateparser when required 2021-02-15 11:52:46 +01:00
jonaswinkler c330cca2c9 remove unused imports 2021-02-15 11:26:13 +01:00
jonaswinkler 7e88085377 load sklearn modules only when training data has changed 2021-02-15 11:25:25 +01:00
19 changed files with 201 additions and 65 deletions

View File

@@ -5,6 +5,27 @@
Changelog
*********
paperless-ng 1.1.3
##################
* Added a docker-specific configuration option to adjust the number of
worker processes of the web server. See :ref:`configuration-docker`.
* Some more memory usage optimizations.
* Don't show inbox statistics if no inbox tag is defined.
.. note::
Some packages that paperless depends on are slowly dropping Python 3.6
support one after another, including the web server. Supporting Python
3.6 means that I cannot update these packages anymore.
At some point, paperless will drop Python 3.6 support. If you're running a bare
metal installation and are still on Python 3.6, upgrade to Python 3.7 or newer.
If you're using docker, this does not affect you.
paperless-ng 1.1.2
##################

View File

@@ -555,3 +555,65 @@ PAPERLESS_GS_BINARY=<path>
PAPERLESS_OPTIPNG_BINARY=<path>
Defaults to "/usr/bin/optipng".
.. _configuration-docker:
Docker-specific options
#######################
These options don't have any effect in ``paperless.conf``. They adjust the
behavior of the docker container and are configured in ``docker-compose.env``.
PAPERLESS_WEBSERVER_WORKERS=<num>
The number of worker processes the webserver should spawn. More worker processes
usually allow the front end to load data much more quickly. However, each worker process
also loads the entire application into memory separately, so increasing this value
will increase RAM usage.
Consider setting this to 1 on low-power devices with a limited amount of RAM.
Defaults to 2.
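For example, on a low-power device you might set this to 1 in ``docker-compose.env``:

.. code:: bash

    PAPERLESS_WEBSERVER_WORKERS=1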
USERMAP_UID=<uid>
The ID of the paperless user in the container. Set this to your actual user ID on the
host system, which you can get by executing
.. code:: shell-session
$ id -u
Paperless will change ownership of its folders to this user, so you need to get this right
in order to be able to write to the consumption directory.
Defaults to 1000.
USERMAP_GID=<gid>
The ID of the paperless group in the container. Set this to your actual group ID on the
host system, which you can get by executing
.. code:: shell-session
$ id -g
Paperless will change ownership of its folders to this group, so you need to get this right
in order to be able to write to the consumption directory.
Defaults to 1000.
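As an illustration, if ``id -u`` and ``id -g`` both print ``1000`` on your host (the defaults above), the corresponding lines in ``docker-compose.env`` would be:

.. code:: bash

    USERMAP_UID=1000
    USERMAP_GID=1000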
PAPERLESS_OCR_LANGUAGES=<list>
Additional OCR languages to install. By default, paperless comes with
English, German, Italian, Spanish and French. If your language is not in this list, install
additional languages with this configuration option:
.. code:: bash
PAPERLESS_OCR_LANGUAGES=tur ces
To actually use these languages, also set the default OCR language of paperless:
.. code:: bash
PAPERLESS_OCR_LANGUAGE=tur
Defaults to none, which does not install any additional languages.

View File

@@ -763,7 +763,8 @@ configuring some options in paperless can help improve performance immensely:
* Stick with SQLite to save some resources.
* Consider setting ``PAPERLESS_OCR_PAGES`` to 1, so that paperless will only OCR
the first page of your documents.
the first page of your documents. In most cases, this page contains enough
information to find the document again.
* ``PAPERLESS_TASK_WORKERS`` and ``PAPERLESS_THREADS_PER_WORKER`` are configured
to use all cores. The Raspberry Pi models 3 and up have 4 cores, meaning that
paperless will use 2 workers and 2 threads per worker. This may result in
@@ -776,6 +777,8 @@ configuring some options in paperless can help improve performance immensely:
file generation for already ocr'ed documents entirely.
* Set ``PAPERLESS_OPTIMIZE_THUMBNAILS`` to 'false' if you want faster consumption
times. Thumbnails will be about 20% larger.
* If using docker, consider setting ``PAPERLESS_WEBSERVER_WORKERS`` to
1. This will save some memory.
For details, refer to :ref:`configuration`.
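Putting the suggestions above together, a low-power setup might use something like the following sketch in ``docker-compose.env`` (or ``paperless.conf`` on bare metal). The values are examples only; in particular, the worker and thread counts are assumptions for illustration, not values taken from the documentation:

.. code:: bash

    PAPERLESS_OCR_PAGES=1
    PAPERLESS_TASK_WORKERS=1
    PAPERLESS_THREADS_PER_WORKER=1
    PAPERLESS_OPTIMIZE_THUMBNAILS=false
    PAPERLESS_WEBSERVER_WORKERS=1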

View File

@@ -1,5 +1,7 @@
import os
bind = '0.0.0.0:8000'
workers = 2
workers = int(os.getenv("PAPERLESS_WEBSERVER_WORKERS", 2))
worker_class = 'uvicorn.workers.UvicornWorker'
timeout = 120

View File

@@ -1,6 +1,6 @@
<app-widget-frame title="Statistics" i18n-title>
<ng-container content>
<p class="card-text" i18n>Documents in inbox: {{statistics.documents_inbox}}</p>
<p class="card-text" i18n>Total documents: {{statistics.documents_total}}</p>
<p class="card-text" i18n *ngIf="statistics?.documents_inbox != null">Documents in inbox: {{statistics?.documents_inbox}}</p>
<p class="card-text" i18n>Total documents: {{statistics?.documents_total}}</p>
</ng-container>
</app-widget-frame>

View File

@@ -169,7 +169,12 @@ export class ConsumerStatusService {
}
dismiss(status: FileStatus) {
let index = this.consumerStatus.findIndex(s => s.filename == status.filename)
let index
if (status.taskId != null) {
index = this.consumerStatus.findIndex(s => s.taskId == status.taskId)
} else {
index = this.consumerStatus.findIndex(s => s.filename == status.filename)
}
if (index > -1) {
this.consumerStatus.splice(index, 1)

View File

@@ -2,7 +2,7 @@ export const environment = {
production: true,
apiBaseUrl: "/api/",
appTitle: "Paperless-ng",
version: "1.1.2",
version: "1.1.3",
webSocketHost: window.location.host,
webSocketProtocol: (window.location.protocol == "https:" ? "wss:" : "ws:")
};

View File

@@ -1,10 +1,6 @@
from django.contrib import admin
from django.utils.html import format_html, format_html_join
from django.utils.safestring import mark_safe
from whoosh.writing import AsyncWriter
from . import index
from .models import Correspondent, Document, DocumentType, Log, Tag, \
from .models import Correspondent, Document, DocumentType, Tag, \
SavedView, SavedViewFilterRule
@@ -86,17 +82,21 @@ class DocumentAdmin(admin.ModelAdmin):
created_.short_description = "Created"
def delete_queryset(self, request, queryset):
ix = index.open_index()
with AsyncWriter(ix) as writer:
from documents import index
with index.open_index_writer() as writer:
for o in queryset:
index.remove_document(writer, o)
super(DocumentAdmin, self).delete_queryset(request, queryset)
def delete_model(self, request, obj):
from documents import index
index.remove_document_from_index(obj)
super(DocumentAdmin, self).delete_model(request, obj)
def save_model(self, request, obj, form, change):
from documents import index
index.add_or_update_document(obj)
super(DocumentAdmin, self).save_model(request, obj, form, change)

View File

@@ -2,9 +2,7 @@ import itertools
from django.db.models import Q
from django_q.tasks import async_task
from whoosh.writing import AsyncWriter
from documents import index
from documents.models import Document, Correspondent, DocumentType
@@ -99,8 +97,9 @@ def modify_tags(doc_ids, add_tags, remove_tags):
def delete(doc_ids):
Document.objects.filter(id__in=doc_ids).delete()
ix = index.open_index()
with AsyncWriter(ix) as writer:
from documents import index
with index.open_index_writer() as writer:
for id in doc_ids:
index.remove_document_by_id(writer, id)

View File

@@ -95,9 +95,6 @@ class DocumentClassifier(object):
pickle.dump(self.document_type_classifier, f)
def train(self):
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
data = list()
labels_tags = list()
@@ -162,6 +159,10 @@ class DocumentClassifier(object):
)
)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
# Step 2: vectorize data
logger.debug("Vectorizing data...")
self.data_vectorizer = CountVectorizer(

View File

@@ -86,6 +86,22 @@ def open_index(recreate=False):
return create_in(settings.INDEX_DIR, get_schema())
@contextmanager
def open_index_writer(ix=None, optimize=False):
if ix:
writer = AsyncWriter(ix)
else:
writer = AsyncWriter(open_index())
try:
yield writer
except Exception as e:
logger.exception(str(e))
writer.cancel()
finally:
writer.commit(optimize=optimize)
def update_document(writer, doc):
tags = ",".join([t.name for t in doc.tags.all()])
writer.update_document(
@@ -110,14 +126,12 @@ def remove_document_by_id(writer, doc_id):
def add_or_update_document(document):
ix = open_index()
with AsyncWriter(ix) as writer:
with open_index_writer() as writer:
update_document(writer, document)
def remove_document_from_index(document):
ix = open_index()
with AsyncWriter(ix) as writer:
with open_index_writer() as writer:
remove_document(writer, document)
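For context, a caller of the new helper might look like this minimal sketch, assuming documents.index exposes open_index_writer and update_document as shown above; the hypothetical reindex_documents function is only illustrative. The helper commits the writer when the block exits and cancels it first if an exception was raised.

    from documents import index

    def reindex_documents(documents):
        # One AsyncWriter is opened for the whole batch via the new context
        # manager; commit/cancel handling happens inside open_index_writer.
        with index.open_index_writer() as writer:
            for doc in documents:
                index.update_document(writer, doc)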

View File

@@ -6,7 +6,6 @@ import shutil
import subprocess
import tempfile
import dateparser
import magic
from django.conf import settings
from django.utils import timezone
@@ -200,6 +199,8 @@ def parse_date(filename, text):
"""
Call dateparser.parse with a particular date ordering
"""
import dateparser
return dateparser.parse(
ds,
settings={

View File

@@ -11,7 +11,7 @@ from django.dispatch import receiver
from django.utils import timezone
from filelock import FileLock
from .. import index, matching
from .. import matching
from ..file_handling import delete_empty_directories, \
create_source_path_directory, \
generate_unique_filename
@@ -305,4 +305,6 @@ def set_log_entry(sender, document=None, logging_group=None, **kwargs):
def add_to_index(sender, document, **kwargs):
from documents import index
index.add_or_update_document(document)

View File

@@ -4,6 +4,7 @@ from django.contrib.admin.sites import AdminSite
from django.test import TestCase
from django.utils import timezone
from documents import index
from documents.admin import DocumentAdmin
from documents.models import Document
from documents.tests.utils import DirectoriesMixin
@@ -11,37 +12,52 @@ from documents.tests.utils import DirectoriesMixin
class TestDocumentAdmin(DirectoriesMixin, TestCase):
def get_document_from_index(self, doc):
ix = index.open_index()
with ix.searcher() as searcher:
return searcher.document(id=doc.id)
def setUp(self) -> None:
super(TestDocumentAdmin, self).setUp()
self.doc_admin = DocumentAdmin(model=Document, admin_site=AdminSite())
@mock.patch("documents.admin.index.add_or_update_document")
def test_save_model(self, m):
def test_save_model(self):
doc = Document.objects.create(title="test")
doc.title = "new title"
self.doc_admin.save_model(None, doc, None, None)
self.assertEqual(Document.objects.get(id=doc.id).title, "new title")
m.assert_called_once()
self.assertEqual(self.get_document_from_index(doc)['title'], "new title")
@mock.patch("documents.admin.index.remove_document")
def test_delete_model(self, m):
def test_delete_model(self):
doc = Document.objects.create(title="test")
self.doc_admin.delete_model(None, doc)
self.assertRaises(Document.DoesNotExist, Document.objects.get, id=doc.id)
m.assert_called_once()
index.add_or_update_document(doc)
self.assertIsNotNone(self.get_document_from_index(doc))
@mock.patch("documents.admin.index.remove_document")
def test_delete_queryset(self, m):
self.doc_admin.delete_model(None, doc)
self.assertRaises(Document.DoesNotExist, Document.objects.get, id=doc.id)
self.assertIsNone(self.get_document_from_index(doc))
def test_delete_queryset(self):
docs = []
for i in range(42):
Document.objects.create(title="Many documents with the same title", checksum=f"{i:02}")
doc = Document.objects.create(title="Many documents with the same title", checksum=f"{i:02}")
docs.append(doc)
index.add_or_update_document(doc)
self.assertEqual(Document.objects.count(), 42)
for doc in docs:
self.assertIsNotNone(self.get_document_from_index(doc))
self.doc_admin.delete_queryset(None, Document.objects.all())
self.assertEqual(m.call_count, 42)
self.assertEqual(Document.objects.count(), 0)
for doc in docs:
self.assertIsNone(self.get_document_from_index(doc))
def test_created(self):
doc = Document.objects.create(title="test", created=timezone.datetime(2020, 4, 12))
self.assertEqual(self.doc_admin.created_(doc), "2020-04-12")

View File

@@ -442,6 +442,13 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertEqual(response.data['documents_total'], 3)
self.assertEqual(response.data['documents_inbox'], 1)
def test_statistics_no_inbox_tag(self):
Document.objects.create(title="none1", checksum="A")
response = self.client.get("/api/statistics/")
self.assertEqual(response.status_code, 200)
self.assertEqual(response.data['documents_inbox'], None)
@mock.patch("documents.views.async_task")
def test_upload(self, m):

View File

@@ -32,7 +32,6 @@ from rest_framework.viewsets import (
ViewSet
)
import documents.index as index
from paperless.db import GnuPG
from paperless.views import StandardPagination
from .classifier import load_classifier
@@ -176,10 +175,12 @@ class DocumentViewSet(RetrieveModelMixin,
def update(self, request, *args, **kwargs):
response = super(DocumentViewSet, self).update(
request, *args, **kwargs)
from documents import index
index.add_or_update_document(self.get_object())
return response
def destroy(self, request, *args, **kwargs):
from documents import index
index.remove_document_from_index(self.get_object())
return super(DocumentViewSet, self).destroy(request, *args, **kwargs)
@@ -501,10 +502,6 @@ class SearchView(APIView):
permission_classes = (IsAuthenticated,)
def __init__(self, *args, **kwargs):
super(SearchView, self).__init__(*args, **kwargs)
self.ix = index.open_index()
def add_infos_to_hit(self, r):
try:
doc = Document.objects.get(id=r['id'])
@@ -525,6 +522,7 @@ class SearchView(APIView):
}
def get(self, request, format=None):
from documents import index
if 'query' in request.query_params:
query = request.query_params['query']
@@ -554,8 +552,10 @@ class SearchView(APIView):
if page < 1:
page = 1
ix = index.open_index()
try:
with index.query_page(self.ix, page, query, more_like_id, more_like_content) as (result_page, corrected_query): # NOQA: E501
with index.query_page(ix, page, query, more_like_id, more_like_content) as (result_page, corrected_query): # NOQA: E501
return Response(
{'count': len(result_page),
'page': result_page.pagenum,
@@ -570,10 +570,6 @@ class SearchAutoCompleteView(APIView):
permission_classes = (IsAuthenticated,)
def __init__(self, *args, **kwargs):
super(SearchAutoCompleteView, self).__init__(*args, **kwargs)
self.ix = index.open_index()
def get(self, request, format=None):
if 'term' in request.query_params:
term = request.query_params['term']
@@ -587,7 +583,11 @@ class SearchAutoCompleteView(APIView):
else:
limit = 10
return Response(index.autocomplete(self.ix, term, limit))
from documents import index
ix = index.open_index()
return Response(index.autocomplete(ix, term, limit))
class StatisticsView(APIView):
@@ -595,8 +595,14 @@ class StatisticsView(APIView):
permission_classes = (IsAuthenticated,)
def get(self, request, format=None):
return Response({
'documents_total': Document.objects.all().count(),
'documents_inbox': Document.objects.filter(
documents_total = Document.objects.all().count()
if Tag.objects.filter(is_inbox_tag=True).exists():
documents_inbox = Document.objects.filter(
tags__is_inbox_tag=True).distinct().count()
else:
documents_inbox = None
return Response({
'documents_total': documents_total,
'documents_inbox': documents_inbox,
})
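Based on the view above, a response from /api/statistics/ when no inbox tag is defined might look like this (illustrative values, matching the test added earlier):

    {
        "documents_total": 3,
        "documents_inbox": null
    }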

View File

@@ -1 +1 @@
__version__ = (1, 1, 2)
__version__ = (1, 1, 3)

View File

@@ -2,12 +2,8 @@ import json
import os
import re
import ocrmypdf
import pdftotext
import pikepdf
from PIL import Image
from django.conf import settings
from ocrmypdf import InputFileError, EncryptedPdfError
from documents.parsers import DocumentParser, ParseError, \
make_thumbnail_from_pdf
@@ -22,6 +18,8 @@ class RasterisedDocumentParser(DocumentParser):
logging_name = "paperless.parsing.tesseract"
def extract_metadata(self, document_path, mime_type):
import pikepdf
namespace_pattern = re.compile(r"\{(.*)\}(.*)")
result = []
@@ -91,6 +89,9 @@ class RasterisedDocumentParser(DocumentParser):
return None
def parse(self, document_path, mime_type, file_name=None):
import ocrmypdf
from ocrmypdf import InputFileError, EncryptedPdfError
mode = settings.OCR_MODE
text_original = get_text_from_pdf(document_path)
@@ -223,6 +224,7 @@ def strip_excess_whitespace(text):
def get_text_from_pdf(pdf_file):
import pdftotext
if not os.path.isfile(pdf_file):
return None

View File

@@ -164,17 +164,12 @@ class TestParser(DirectoriesMixin, TestCase):
self.assertRaises(ParseError, f)
@mock.patch("paperless_tesseract.parsers.ocrmypdf.ocr")
def test_image_calc_a4_dpi(self, m):
def test_image_calc_a4_dpi(self):
parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png")
dpi = parser.calculate_a4_dpi(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"))
m.assert_called_once()
args, kwargs = m.call_args
self.assertEqual(kwargs['image_dpi'], 62)
self.assertEqual(dpi, 62)
@mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser.calculate_a4_dpi")
def test_image_dpi_fail(self, m):