mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-12-23 19:41:22 +00:00
Compare commits
42 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1f68223752 | ||
|
|
d6b626ec50 | ||
|
|
926016f1d8 | ||
|
|
8ea1cc24a7 | ||
|
|
08adb2f540 | ||
|
|
4333ee01bc | ||
|
|
46db0edd5d | ||
|
|
10d017ebe4 | ||
|
|
76a13dadb8 | ||
|
|
72750b9243 | ||
|
|
fba58f3bdd | ||
|
|
6662ca3467 | ||
|
|
6f1ed89e26 | ||
|
|
7e99e42924 | ||
|
|
5d01410dc0 | ||
|
|
9a47dba494 | ||
|
|
6edee78145 | ||
|
|
30eac99a61 | ||
|
|
ea6d040809 | ||
|
|
8e9d5caa37 | ||
|
|
122aa2b9f1 | ||
|
|
fb1da4834c | ||
|
|
96c7222269 | ||
|
|
1737e27b34 | ||
|
|
39f198138a | ||
|
|
c74bb84c83 | ||
|
|
07d06d9aee | ||
|
|
9cef689106 | ||
|
|
11a9c756b3 | ||
|
|
11accaff7f | ||
|
|
77aee832e4 | ||
|
|
4bde14368c | ||
|
|
151d85f2be | ||
|
|
e7c23cfb92 | ||
|
|
3f9ea7b971 | ||
|
|
998c3ef51b | ||
|
|
c85b6b425d | ||
|
|
273916973c | ||
|
|
96c517d65c | ||
|
|
e70ad3d493 | ||
|
|
516bc48a33 | ||
|
|
7f97716ae9 |
@@ -17,7 +17,7 @@ ENV PAPERLESS_EXPORT_DIR=/export \
|
||||
|
||||
# Install dependencies
|
||||
RUN apk --no-cache --update add \
|
||||
python3 gnupg libmagic bash \
|
||||
python3 gnupg libmagic bash shadow \
|
||||
sudo poppler tesseract-ocr imagemagick ghostscript unpaper && \
|
||||
apk --no-cache add --virtual .build-dependencies \
|
||||
python3-dev poppler-dev gcc g++ musl-dev zlib-dev jpeg-dev && \
|
||||
@@ -30,8 +30,6 @@ RUN apk --no-cache --update add \
|
||||
apk del .build-dependencies && \
|
||||
# Create the consumption directory
|
||||
mkdir -p $PAPERLESS_CONSUMPTION_DIR && \
|
||||
# Migrate database
|
||||
./src/manage.py migrate && \
|
||||
# Create user
|
||||
addgroup -g 1000 paperless && \
|
||||
adduser -D -u 1000 -G paperless -h /usr/src/paperless paperless && \
|
||||
|
||||
@@ -1,12 +1,19 @@
|
||||
version: '2'
|
||||
version: '2.1'
|
||||
|
||||
services:
|
||||
webserver:
|
||||
build: ./
|
||||
# uncomment the following line to start automatically on system boot
|
||||
# restart: always
|
||||
ports:
|
||||
# You can adapt the port you want Paperless to listen on by
|
||||
# modifying the part before the `:`.
|
||||
- "8000:8000"
|
||||
healthcheck:
|
||||
test: ["CMD", "curl" , "-f", "http://localhost:8000"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 5
|
||||
volumes:
|
||||
- data:/usr/src/paperless/data
|
||||
- media:/usr/src/paperless/media
|
||||
@@ -21,6 +28,11 @@ services:
|
||||
|
||||
consumer:
|
||||
build: ./
|
||||
# uncomment the following line to start automatically on system boot
|
||||
# restart: always
|
||||
depends_on:
|
||||
webserver:
|
||||
condition: service_healthy
|
||||
volumes:
|
||||
- data:/usr/src/paperless/data
|
||||
- media:/usr/src/paperless/media
|
||||
|
||||
@@ -1,6 +1,29 @@
|
||||
Changelog
|
||||
#########
|
||||
|
||||
1.3.0 (Unreleased)
|
||||
==================
|
||||
|
||||
* You can now run Paperless without a login, though you'll still have to create
|
||||
at least one user. This is thanks to a pull-request from `matthewmoto`_:
|
||||
`#295`_. Note that logins are still required by default, and that you need
|
||||
to disable them by setting ``PAPERLESS_DISABLE_LOGIN="true"`` in your
|
||||
environment or in ``/etc/paperless.conf``.
|
||||
* Fix for `#303`_ where sketchily-formatted documents could cause the consumer
|
||||
to break and insert half-records into the database breaking all sorts of
|
||||
things. We now capture the return codes of both ``convert`` and ``unpaper``
|
||||
and fail-out nicely.
|
||||
* Fix for additional date types thanks to input from `Isaac`_ and code from
|
||||
`BastianPoe`_ (`#301`_).
|
||||
* Fix for running migrations in the Docker container (`#299`_). Thanks to
|
||||
`Georgi Todorov`_ for the fix (`#300`_) and to `Pit`_ for the review.
|
||||
* Fix for Docker cases where the issuing user is not UID 1000. This was a
|
||||
collaborative fix between `Jeffrey Portman`_ and `Pit`_ in `#311`_ and
|
||||
`#312`_ to fix `#306`_.
|
||||
* Patch the historical migrations to support MySQL's um, *interesting* way of
|
||||
handling indexes (`#308`_). Thanks to `Simon Taddiken`_ for reporting the
|
||||
problem and helping me find where to fix it.
|
||||
|
||||
1.2.0
|
||||
=====
|
||||
|
||||
@@ -329,6 +352,11 @@ Changelog
|
||||
.. _Dan Panzarella: https://github.com/pzl
|
||||
.. _addadi: https://github.com/addadi
|
||||
.. _BastianPoe: https://github.com/BastianPoe
|
||||
.. _matthewmoto: https://github.com/matthewmoto
|
||||
.. _Isaac: https://github.com/isaacsando
|
||||
.. _Georgi Todorov: https://github.com/TeraHz
|
||||
.. _Jeffrey Portman: https://github.com/ChromoX
|
||||
.. _Simon Taddiken: https://github.com/skuzzle
|
||||
|
||||
.. _#20: https://github.com/danielquinn/paperless/issues/20
|
||||
.. _#44: https://github.com/danielquinn/paperless/issues/44
|
||||
@@ -380,6 +408,15 @@ Changelog
|
||||
.. _#256: https://github.com/danielquinn/paperless/pull/256
|
||||
.. _#285: https://github.com/danielquinn/paperless/pull/285
|
||||
.. _#291: https://github.com/danielquinn/paperless/pull/291
|
||||
.. _#295: https://github.com/danielquinn/paperless/pull/295
|
||||
.. _#299: https://github.com/danielquinn/paperless/issues/299
|
||||
.. _#300: https://github.com/danielquinn/paperless/pull/300
|
||||
.. _#301: https://github.com/danielquinn/paperless/issues/301
|
||||
.. _#303: https://github.com/danielquinn/paperless/issues/303
|
||||
.. _#306: https://github.com/danielquinn/paperless/issues/306
|
||||
.. _#308: https://github.com/danielquinn/paperless/issues/308
|
||||
.. _#311: https://github.com/danielquinn/paperless/pull/311
|
||||
.. _#312: https://github.com/danielquinn/paperless/pull/312
|
||||
|
||||
.. _pipenv: https://docs.pipenv.org/
|
||||
.. _a new home on Docker Hub: https://hub.docker.com/r/danielquinn/paperless/
|
||||
.. _a new home on Docker Hub: https://hub.docker.com/r/danielquinn/paperless/
|
||||
|
||||
@@ -50,9 +50,19 @@ this:
|
||||
|
||||
1. Consumer finds a file in the consumption directory.
|
||||
2. It asks all the available parsers: *"Hey, can you handle this file?"*
|
||||
3. The first parser that says yes gets to handle the file. The order in which
|
||||
the parsers are asked is handled by sorting ``INSTALLED_APPS`` in
|
||||
``settings.py``.
|
||||
3. Each parser responds with either ``None`` meaning they can't handle the
|
||||
file, or a dictionary in the following format:
|
||||
|
||||
.. code:: python
|
||||
|
||||
{
|
||||
"parser": <the class name>,
|
||||
"weight": <an integer>
|
||||
}
|
||||
|
||||
The consumer compares the ``weight`` values from all respondents and uses the
|
||||
class with the highest value to consume the document. The default parser,
|
||||
``RasterisedDocumentParser`` has a weight of ``0``.
|
||||
|
||||
|
||||
.. _extending-parsers-appspy:
|
||||
@@ -61,7 +71,7 @@ apps.py
|
||||
.......
|
||||
|
||||
This is a standard Django file, but you'll need to add some code to it to
|
||||
register your parser as being able to handle particular files.
|
||||
connect your parser to the ``document_consumer_declaration`` signal.
|
||||
|
||||
|
||||
.. _extending-parsers-finally:
|
||||
@@ -79,14 +89,12 @@ the list like this:
|
||||
INSTALLED_APPS = [
|
||||
...
|
||||
"my_module.apps.MyModuleConfig",
|
||||
"paperless_tesseract.apps.PaperlessTesseractConfig",
|
||||
...
|
||||
]
|
||||
|
||||
Note that we're placing our module *above* ``PaperlessTesseractConfig``. This
|
||||
is to ensure that if your module wants to handle any files typically handled by
|
||||
the default module, yours will win instead. If there's no conflict between
|
||||
what your module does and the default, then order doesn't matter.
|
||||
Order doesn't matter, but generally it's a good idea to place your module lower
|
||||
in the list so that you don't end up accidentally overriding project defaults
|
||||
somewhere.
|
||||
|
||||
|
||||
.. _extending-parsers-example:
|
||||
|
||||
@@ -94,7 +94,7 @@ You may want to take a look at the ``paperless.conf.example`` file to see if
|
||||
there's anything new in there compared to what you've got in ``/etc``.
|
||||
|
||||
If you are :ref:`using Docker <setup-installation-docker>` the update process
|
||||
requires only one additional step:
|
||||
is similar:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
@@ -102,7 +102,6 @@ requires only one additional step:
|
||||
$ git pull
|
||||
$ docker build -t paperless .
|
||||
$ docker-compose up -d
|
||||
$ docker-compose run --rm webserver migrate
|
||||
|
||||
If ``git pull`` doesn't report any changes, there is no need to continue with
|
||||
the remaining steps.
|
||||
|
||||
@@ -86,6 +86,11 @@ PAPERLESS_PASSPHRASE="secret"
|
||||
# https://docs.djangoproject.com/en/1.11/ref/settings/#force-script-name
|
||||
#PAPERLESS_FORCE_SCRIPT_NAME=""
|
||||
|
||||
# If you are using alternative authentication means or are just using paperless
|
||||
# as a single user on a small private network, this option allows you to disable
|
||||
# user authentication if you set it to "true"
|
||||
#PAPERLESS_DISABLE_LOGIN="false"
|
||||
|
||||
###############################################################################
|
||||
#### Software Tweaks ####
|
||||
###############################################################################
|
||||
|
||||
@@ -4,13 +4,13 @@ set -e
|
||||
# Source: https://github.com/sameersbn/docker-gitlab/
|
||||
map_uidgid() {
|
||||
USERMAP_ORIG_UID=$(id -u paperless)
|
||||
USERMAP_ORIG_UID=$(id -g paperless)
|
||||
USERMAP_GID=${USERMAP_GID:-${USERMAP_UID:-$USERMAP_ORIG_GID}}
|
||||
USERMAP_UID=${USERMAP_UID:-$USERMAP_ORIG_UID}
|
||||
if [[ ${USERMAP_UID} != "${USERMAP_ORIG_UID}" || ${USERMAP_GID} != "${USERMAP_ORIG_GID}" ]]; then
|
||||
echo "Mapping UID and GID for paperless:paperless to $USERMAP_UID:$USERMAP_GID"
|
||||
addgroup -g "${USERMAP_GID}" paperless
|
||||
sed -i -e "s|:${USERMAP_ORIG_UID}:${USERMAP_GID}:|:${USERMAP_UID}:${USERMAP_GID}:|" /etc/passwd
|
||||
USERMAP_ORIG_GID=$(id -g paperless)
|
||||
USERMAP_NEW_UID=${USERMAP_UID:-$USERMAP_ORIG_UID}
|
||||
USERMAP_NEW_GID=${USERMAP_GID:-${USERMAP_ORIG_GID:-$USERMAP_NEW_UID}}
|
||||
if [[ ${USERMAP_NEW_UID} != "${USERMAP_ORIG_UID}" || ${USERMAP_NEW_GID} != "${USERMAP_ORIG_GID}" ]]; then
|
||||
echo "Mapping UID and GID for paperless:paperless to $USERMAP_NEW_UID:$USERMAP_NEW_GID"
|
||||
usermod -u "${USERMAP_NEW_UID}" paperless
|
||||
groupmod -g "${USERMAP_NEW_GID}" paperless
|
||||
fi
|
||||
}
|
||||
|
||||
@@ -42,9 +42,24 @@ set_permissions() {
|
||||
chown -Rh paperless:paperless /usr/src/paperless
|
||||
}
|
||||
|
||||
migrations() {
|
||||
# A simple lock file in case other containers use this startup
|
||||
LOCKFILE="/usr/src/paperless/data/db.sqlite3.migration"
|
||||
|
||||
set -o noclobber
|
||||
# check for and create lock file in one command
|
||||
(> ${LOCKFILE}) &> /dev/null
|
||||
if [ $? -eq 0 ]
|
||||
then
|
||||
sudo -HEu paperless "/usr/src/paperless/src/manage.py" "migrate"
|
||||
rm ${LOCKFILE}
|
||||
fi
|
||||
}
|
||||
|
||||
initialize() {
|
||||
map_uidgid
|
||||
set_permissions
|
||||
migrations
|
||||
}
|
||||
|
||||
install_languages() {
|
||||
@@ -64,7 +79,7 @@ install_languages() {
|
||||
if [ "$lang" == "eng" ]; then
|
||||
continue
|
||||
fi
|
||||
|
||||
|
||||
if apk info -e "$pkg" > /dev/null 2>&1; then
|
||||
continue
|
||||
fi
|
||||
|
||||
@@ -22,7 +22,7 @@ class ConsumerError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class Consumer(object):
|
||||
class Consumer:
|
||||
"""
|
||||
Loop over every file found in CONSUMPTION_DIR and:
|
||||
1. Convert it to a greyscale pnm
|
||||
@@ -117,10 +117,10 @@ class Consumer(object):
|
||||
)
|
||||
|
||||
parsed_document = parser_class(doc)
|
||||
thumbnail = parsed_document.get_thumbnail()
|
||||
date = parsed_document.get_date()
|
||||
|
||||
try:
|
||||
thumbnail = parsed_document.get_thumbnail()
|
||||
date = parsed_document.get_date()
|
||||
document = self._store(
|
||||
parsed_document.get_text(),
|
||||
doc,
|
||||
|
||||
82
src/documents/management/commands/document_correspondents.py
Normal file
82
src/documents/management/commands/document_correspondents.py
Normal file
@@ -0,0 +1,82 @@
|
||||
import sys
|
||||
|
||||
from django.core.management.base import BaseCommand
|
||||
|
||||
from documents.models import Correspondent, Document
|
||||
|
||||
from ...mixins import Renderable
|
||||
|
||||
|
||||
class Command(Renderable, BaseCommand):
|
||||
|
||||
help = """
|
||||
Using the current set of correspondent rules, apply said rules to all
|
||||
documents in the database, effectively allowing you to back-tag all
|
||||
previously indexed documents with correspondent created (or modified)
|
||||
after their initial import.
|
||||
""".replace(" ", "")
|
||||
|
||||
TOO_MANY_CONTINUE = (
|
||||
"Detected {} potential correspondents for {}, so we've opted for {}")
|
||||
TOO_MANY_SKIP = (
|
||||
"Detected {} potential correspondents for {}, so we're skipping it")
|
||||
CHANGE_MESSAGE = (
|
||||
'Document {}: "{}" was given the correspondent id {}: "{}"')
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
self.verbosity = 0
|
||||
BaseCommand.__init__(self, *args, **kwargs)
|
||||
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument(
|
||||
"--use-first",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="By default this command won't try to assign a correspondent "
|
||||
"if more than one matches the document. Use this flag if "
|
||||
"you'd rather it just pick the first one it finds."
|
||||
)
|
||||
|
||||
def handle(self, *args, **options):
|
||||
|
||||
self.verbosity = options["verbosity"]
|
||||
|
||||
for document in Document.objects.filter(correspondent__isnull=True):
|
||||
|
||||
potential_correspondents = list(
|
||||
Correspondent.match_all(document.content))
|
||||
|
||||
if not potential_correspondents:
|
||||
continue
|
||||
|
||||
potential_count = len(potential_correspondents)
|
||||
correspondent = potential_correspondents[0]
|
||||
|
||||
if potential_count > 1:
|
||||
if not options["use_first"]:
|
||||
print(
|
||||
self.TOO_MANY_SKIP.format(potential_count, document),
|
||||
file=sys.stderr
|
||||
)
|
||||
continue
|
||||
print(
|
||||
self.TOO_MANY_CONTINUE.format(
|
||||
potential_count,
|
||||
document,
|
||||
correspondent
|
||||
),
|
||||
file=sys.stderr
|
||||
)
|
||||
|
||||
document.correspondent = correspondent
|
||||
document.save(update_fields=("correspondent",))
|
||||
|
||||
print(
|
||||
self.CHANGE_MESSAGE.format(
|
||||
document.pk,
|
||||
document.title,
|
||||
correspondent.pk,
|
||||
correspondent.name
|
||||
),
|
||||
file=sys.stderr
|
||||
)
|
||||
@@ -3,6 +3,7 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.db import migrations, models
|
||||
from django.conf import settings
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
@@ -15,6 +16,6 @@ class Migration(migrations.Migration):
|
||||
migrations.AlterField(
|
||||
model_name='document',
|
||||
name='content',
|
||||
field=models.TextField(blank=True, db_index=True, help_text='The raw, text-only data of the document. This field is primarily used for searching.'),
|
||||
field=models.TextField(blank=True, db_index=("mysql" not in settings.DATABASES["default"]["ENGINE"]), help_text='The raw, text-only data of the document. This field is primarily used for searching.'),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -30,15 +30,8 @@ from .serialisers import (
|
||||
|
||||
|
||||
class IndexView(TemplateView):
|
||||
|
||||
template_name = "documents/index.html"
|
||||
|
||||
def get_context_data(self, **kwargs):
|
||||
print(kwargs)
|
||||
print(self.request.GET)
|
||||
print(self.request.POST)
|
||||
return TemplateView.get_context_data(self, **kwargs)
|
||||
|
||||
|
||||
class FetchView(SessionOrBasicAuthMixin, DetailView):
|
||||
|
||||
|
||||
14
src/paperless/middleware.py
Normal file
14
src/paperless/middleware.py
Normal file
@@ -0,0 +1,14 @@
|
||||
from django.utils.deprecation import MiddlewareMixin
|
||||
from .models import User
|
||||
|
||||
|
||||
class Middleware (MiddlewareMixin):
|
||||
"""
|
||||
This is a dummy authentication middleware class that creates what
|
||||
is roughly an Anonymous authenticated user so we can disable login
|
||||
and not interfere with existing user ID's. It's only used if
|
||||
login is disabled in paperless.conf (default is to require login)
|
||||
"""
|
||||
|
||||
def process_request(self, request):
|
||||
request.user = User()
|
||||
26
src/paperless/models.py
Normal file
26
src/paperless/models.py
Normal file
@@ -0,0 +1,26 @@
|
||||
class User:
|
||||
"""
|
||||
This is a dummy django User used with our middleware to disable
|
||||
login authentication if that is configured in paperless.conf
|
||||
"""
|
||||
is_superuser = True
|
||||
is_active = True
|
||||
is_staff = True
|
||||
is_authenticated = True
|
||||
|
||||
# Must be -1 to avoid colliding with real user ID's (which start at 1)
|
||||
id = -1
|
||||
|
||||
@property
|
||||
def pk(self):
|
||||
return self.id
|
||||
|
||||
|
||||
"""
|
||||
NOTE: These are here as a hack instead of being in the User definition
|
||||
above due to the way pycodestyle handles lambdas.
|
||||
See https://github.com/PyCQA/pycodestyle/issues/379 for more.
|
||||
"""
|
||||
|
||||
User.has_module_perms = lambda *_: True
|
||||
User.has_perm = lambda *_: True
|
||||
@@ -77,6 +77,8 @@ INSTALLED_APPS = [
|
||||
if os.getenv("PAPERLESS_INSTALLED_APPS"):
|
||||
INSTALLED_APPS += os.getenv("PAPERLESS_INSTALLED_APPS").split(",")
|
||||
|
||||
|
||||
|
||||
MIDDLEWARE_CLASSES = [
|
||||
'django.middleware.security.SecurityMiddleware',
|
||||
'django.contrib.sessions.middleware.SessionMiddleware',
|
||||
@@ -88,6 +90,12 @@ MIDDLEWARE_CLASSES = [
|
||||
'django.middleware.clickjacking.XFrameOptionsMiddleware',
|
||||
]
|
||||
|
||||
# If login is disabled, we just use our "bypass" authentication middleware
|
||||
if bool(os.getenv("PAPERLESS_DISABLE_LOGIN", "false").lower() in ("yes", "y", "1", "t", "true")):
|
||||
_index = MIDDLEWARE_CLASSES.index('django.contrib.auth.middleware.AuthenticationMiddleware')
|
||||
MIDDLEWARE_CLASSES[_index] = 'paperless.middleware.Middleware'
|
||||
MIDDLEWARE_CLASSES.remove('django.contrib.auth.middleware.SessionAuthenticationMiddleware')
|
||||
|
||||
ROOT_URLCONF = 'paperless.urls'
|
||||
|
||||
TEMPLATES = [
|
||||
|
||||
@@ -1 +1 @@
|
||||
__version__ = (1, 2, 0)
|
||||
__version__ = (1, 3, 0)
|
||||
|
||||
@@ -3,18 +3,19 @@ import os
|
||||
import re
|
||||
import subprocess
|
||||
from multiprocessing.pool import Pool
|
||||
import dateparser
|
||||
import pdftotext
|
||||
|
||||
import dateparser
|
||||
import langdetect
|
||||
import pyocr
|
||||
from django.conf import settings
|
||||
from documents.parsers import DocumentParser, ParseError
|
||||
from PIL import Image
|
||||
from pyocr.libtesseract.tesseract_raw import \
|
||||
TesseractError as OtherTesseractError
|
||||
from pyocr.tesseract import TesseractError
|
||||
|
||||
import pdftotext
|
||||
from documents.parsers import DocumentParser, ParseError
|
||||
|
||||
from .languages import ISO639
|
||||
|
||||
|
||||
@@ -35,7 +36,10 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
DATE_ORDER = settings.DATE_ORDER
|
||||
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
||||
OCR_ALWAYS = settings.OCR_ALWAYS
|
||||
TEXT_CACHE = None
|
||||
|
||||
def __init__(self, path):
|
||||
super().__init__(path)
|
||||
self._text = None
|
||||
|
||||
def get_thumbnail(self):
|
||||
"""
|
||||
@@ -52,31 +56,29 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
return os.path.join(self.tempdir, "convert-0000.png")
|
||||
|
||||
def _is_ocred(self):
|
||||
|
||||
# Extract text from PDF using pdftotext
|
||||
text = get_text_from_pdf(self.document_path)
|
||||
|
||||
# We assume that a PDF with more than 50 characters contains text
|
||||
# (so no OCR required)
|
||||
if len(text) > 50:
|
||||
return True
|
||||
|
||||
return False
|
||||
return len(text) > 50
|
||||
|
||||
def get_text(self):
|
||||
if self.TEXT_CACHE is not None:
|
||||
return self.TEXT_CACHE
|
||||
|
||||
if self._text is not None:
|
||||
return self._text
|
||||
|
||||
if not self.OCR_ALWAYS and self._is_ocred():
|
||||
self.log("info", "Skipping OCR, using Text from PDF")
|
||||
self.TEXT_CACHE = get_text_from_pdf(self.document_path)
|
||||
return self.TEXT_CACHE
|
||||
self._text = get_text_from_pdf(self.document_path)
|
||||
return self._text
|
||||
|
||||
images = self._get_greyscale()
|
||||
|
||||
try:
|
||||
|
||||
self.TEXT_CACHE = self._get_ocr(images)
|
||||
return self.TEXT_CACHE
|
||||
self._text = self._get_ocr(images)
|
||||
return self._text
|
||||
except OCRError as e:
|
||||
raise ParseError(e)
|
||||
|
||||
@@ -200,7 +202,13 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
return text
|
||||
|
||||
def get_date(self):
|
||||
text = self.get_text()
|
||||
date = None
|
||||
datestring = None
|
||||
|
||||
try:
|
||||
text = self.get_text()
|
||||
except ParseError as e:
|
||||
return None
|
||||
|
||||
# This regular expression will try to find dates in the document at
|
||||
# hand and will match the following formats:
|
||||
@@ -208,19 +216,38 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
# - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - MONTH ZZZZ
|
||||
m = re.search(
|
||||
# - MONTH ZZZZ, with ZZZZ being 4 digits
|
||||
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
|
||||
pattern = re.compile(
|
||||
r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
|
||||
r'\b([0-9]{1,2}\. [^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
|
||||
r'\b([^ ]{3,9} [0-9]{4})\b', text)
|
||||
r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
|
||||
r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
|
||||
r'\b([^\W\d_]{3,9} [0-9]{4})\b')
|
||||
|
||||
if m is None:
|
||||
return None
|
||||
# Iterate through all regex matches and try to parse the date
|
||||
for m in re.finditer(pattern, text):
|
||||
datestring = m.group(0)
|
||||
|
||||
return dateparser.parse(m.group(0),
|
||||
settings={'DATE_ORDER': self.DATE_ORDER,
|
||||
'PREFER_DAY_OF_MONTH': 'first',
|
||||
'RETURN_AS_TIMEZONE_AWARE': True})
|
||||
try:
|
||||
date = dateparser.parse(
|
||||
datestring,
|
||||
settings={'DATE_ORDER': self.DATE_ORDER,
|
||||
'PREFER_DAY_OF_MONTH': 'first',
|
||||
'RETURN_AS_TIMEZONE_AWARE': True})
|
||||
except TypeError:
|
||||
# Skip all matches that do not parse to a proper date
|
||||
continue
|
||||
|
||||
if date is not None:
|
||||
break
|
||||
|
||||
if date is not None:
|
||||
self.log("info", "Detected document date " + date.strftime("%x") +
|
||||
" based on string " + datestring)
|
||||
else:
|
||||
self.log("info", "Unable to detect date for document")
|
||||
|
||||
return date
|
||||
|
||||
|
||||
def run_convert(*args):
|
||||
@@ -231,13 +258,15 @@ def run_convert(*args):
|
||||
if settings.CONVERT_TMPDIR:
|
||||
environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR
|
||||
|
||||
subprocess.Popen(args, env=environment).wait()
|
||||
if not subprocess.Popen(args, env=environment).wait() == 0:
|
||||
raise ParseError("Convert failed at {}".format(args))
|
||||
|
||||
|
||||
def run_unpaper(args):
|
||||
unpaper, pnm = args
|
||||
subprocess.Popen(
|
||||
(unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm"))).wait()
|
||||
command_args = unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm")
|
||||
if not subprocess.Popen(command_args).wait() == 0:
|
||||
raise ParseError("Unpaper failed at {}".format(command_args))
|
||||
|
||||
|
||||
def strip_excess_whitespace(text):
|
||||
@@ -262,6 +291,7 @@ def image_to_string(args):
|
||||
|
||||
|
||||
def get_text_from_pdf(pdf_file):
|
||||
|
||||
with open(pdf_file, "rb") as f:
|
||||
try:
|
||||
pdf = pdftotext.PDF(f)
|
||||
|
||||
BIN
src/paperless_tesseract/tests/samples/tests_date_8.pdf
Normal file
BIN
src/paperless_tesseract/tests/samples/tests_date_8.pdf
Normal file
Binary file not shown.
BIN
src/paperless_tesseract/tests/samples/tests_date_9.pdf
Normal file
BIN
src/paperless_tesseract/tests/samples/tests_date_9.pdf
Normal file
Binary file not shown.
@@ -25,14 +25,145 @@ class TestDate(TestCase):
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SCRATCH
|
||||
)
|
||||
def test_date_format_1(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document._text = "lorem ipsum 130218 lorem ipsum"
|
||||
self.assertEqual(document.get_date(), None)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SAMPLE_FILES
|
||||
)
|
||||
def test_date_format_2(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document._text = "lorem ipsum 2018 lorem ipsum"
|
||||
self.assertEqual(document.get_date(), None)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SAMPLE_FILES
|
||||
)
|
||||
def test_date_format_3(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document._text = "lorem ipsum 20180213 lorem ipsum"
|
||||
self.assertEqual(document.get_date(), None)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SAMPLE_FILES
|
||||
)
|
||||
def test_date_format_4(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document._text = "lorem ipsum 13.02.2018 lorem ipsum"
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.tzutc())
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SAMPLE_FILES
|
||||
)
|
||||
def test_date_format_5(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document._text = (
|
||||
"lorem ipsum 130218, 2018, 20180213 and 13.02.2018 lorem ipsum")
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.tzutc())
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SAMPLE_FILES
|
||||
)
|
||||
def test_date_format_6(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document._text = (
|
||||
"lorem ipsum\n"
|
||||
"Wohnort\n"
|
||||
"3100\n"
|
||||
"IBAN\n"
|
||||
"AT87 4534\n"
|
||||
"1234\n"
|
||||
"1234 5678\n"
|
||||
"BIC\n"
|
||||
"lorem ipsum"
|
||||
)
|
||||
self.assertEqual(document.get_date(), None)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SAMPLE_FILES
|
||||
)
|
||||
def test_date_format_7(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document._text = (
|
||||
"lorem ipsum\n"
|
||||
"März 2019\n"
|
||||
"lorem ipsum"
|
||||
)
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2019, 3, 1, 0, 0, tzinfo=tz.tzutc())
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SAMPLE_FILES
|
||||
)
|
||||
def test_date_format_8(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document._text = ("lorem ipsum\n"
|
||||
"Wohnort\n"
|
||||
"3100\n"
|
||||
"IBAN\n"
|
||||
"AT87 4534\n"
|
||||
"1234\n"
|
||||
"1234 5678\n"
|
||||
"BIC\n"
|
||||
"lorem ipsum\n"
|
||||
"März 2020")
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2020, 3, 1, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SAMPLE_FILES
|
||||
)
|
||||
def test_date_format_9(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document._text = ("lorem ipsum\n"
|
||||
"27. Nullmonth 2020\n"
|
||||
"März 2020\n"
|
||||
"lorem ipsum")
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2020, 3, 1, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SAMPLE_FILES
|
||||
)
|
||||
def test_get_text_1_pdf(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_1.pdf")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), True)
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2018, 4, 1, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2018, 4, 1, 0, 0, tzinfo=tz.tzutc())
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
@@ -43,9 +174,10 @@ class TestDate(TestCase):
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), False)
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2018, 4, 1, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2018, 4, 1, 0, 0, tzinfo=tz.tzutc())
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
@@ -56,9 +188,10 @@ class TestDate(TestCase):
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), True)
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2013, 2, 1, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2013, 2, 1, 0, 0, tzinfo=tz.tzutc())
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
@@ -69,9 +202,10 @@ class TestDate(TestCase):
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), False)
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2013, 2, 1, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2013, 2, 1, 0, 0, tzinfo=tz.tzutc())
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
@@ -82,9 +216,10 @@ class TestDate(TestCase):
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), True)
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2018, 10, 5, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2018, 10, 5, 0, 0, tzinfo=tz.tzutc())
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
@@ -95,9 +230,10 @@ class TestDate(TestCase):
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), False)
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2018, 10, 5, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2018, 10, 5, 0, 0, tzinfo=tz.tzutc())
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
@@ -108,9 +244,10 @@ class TestDate(TestCase):
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), True)
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2018, 10, 5, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2018, 10, 5, 0, 0, tzinfo=tz.tzutc())
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
@@ -121,9 +258,10 @@ class TestDate(TestCase):
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), False)
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2018, 10, 5, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2018, 10, 5, 0, 0, tzinfo=tz.tzutc())
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
@@ -134,9 +272,10 @@ class TestDate(TestCase):
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), True)
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2018, 12, 17, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2018, 12, 17, 0, 0, tzinfo=tz.tzutc())
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
@@ -147,9 +286,10 @@ class TestDate(TestCase):
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), False)
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2018, 12, 17, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2018, 12, 17, 0, 0, tzinfo=tz.tzutc())
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
@@ -161,9 +301,10 @@ class TestDate(TestCase):
|
||||
document.get_text()
|
||||
document.DATE_ORDER = "MDY"
|
||||
self.assertEqual(document._is_ocred(), True)
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2018, 12, 17, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2018, 12, 17, 0, 0, tzinfo=tz.tzutc())
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
@@ -175,9 +316,10 @@ class TestDate(TestCase):
|
||||
document.get_text()
|
||||
document.DATE_ORDER = "MDY"
|
||||
self.assertEqual(document._is_ocred(), False)
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2018, 12, 17, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2018, 12, 17, 0, 0, tzinfo=tz.tzutc())
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
@@ -210,6 +352,35 @@ class TestDate(TestCase):
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), True)
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2018, 4, 1, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2018, 4, 1, 0, 0, tzinfo=tz.tzutc())
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SAMPLE_FILES
|
||||
)
|
||||
def test_get_text_8_pdf(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_8.pdf")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), True)
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2017, 12, 31, 0, 0, tzinfo=tz.tzutc())
|
||||
)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SAMPLE_FILES
|
||||
)
|
||||
def test_get_text_9_pdf(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_9.pdf")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), True)
|
||||
self.assertEqual(
|
||||
document.get_date(),
|
||||
datetime.datetime(2017, 12, 31, 0, 0, tzinfo=tz.tzutc())
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user