Compare commits

..

42 Commits
1.2.0 ... 1.3.0

Author SHA1 Message Date
Daniel Quinn
1f68223752 Sort new features to the top 2018-02-25 11:57:19 +00:00
Daniel Quinn
d6b626ec50 Update changelog for #308 fix 2018-02-25 11:56:11 +00:00
Daniel Quinn
926016f1d8 Merge pull request #309 from danielquinn/mysql-hack
Add another db_index hack for MySQL.

Fixes #308.
2018-02-25 12:53:38 +01:00
Daniel Quinn
8ea1cc24a7 Update for 1.3.0 2018-02-25 11:47:39 +00:00
Daniel Quinn
08adb2f540 Merge pull request #312 from pitkley/docker-fix-groupmod
Fix modifying the GID under alpine
2018-02-25 12:43:22 +01:00
Pit Kleyersburg
4333ee01bc Fix modifying the GID under alpine
Fixes #306.
2018-02-25 12:36:51 +01:00
Daniel Quinn
46db0edd5d Merge pull request #300 from TeraHz/master
Handle Django migrations at startup
2018-02-25 12:13:09 +01:00
Georgi Todorov
10d017ebe4 expanding tabs in yaml file 2018-02-24 20:36:35 -05:00
Daniel Quinn
76a13dadb8 Add another db_index hack for MySQL 2018-02-20 11:17:11 +00:00
Daniel Quinn
72750b9243 Merge pull request #307 from BastianPoe/bugfix/add_more_unit_test_for_date_detection
Bugfix/add more unit test for date detection
2018-02-20 12:03:41 +01:00
Wolf-Bastian Pöttner
fba58f3bdd Increase testcoverage by testing two more date detection cases 2018-02-19 21:36:48 +01:00
Daniel Quinn
6662ca3467 Fix formatting 2018-02-18 18:00:34 +00:00
Daniel Quinn
6f1ed89e26 Fix tests to use _text instead of TEXT_CACHE 2018-02-18 18:00:22 +00:00
Daniel Quinn
7e99e42924 Update changelog with #301 2018-02-18 16:26:40 +00:00
Daniel Quinn
5d01410dc0 Merge pull request #302 from BastianPoe/bugfix/extend_regex_to_find_more_dates
Extends the regex to find dates in documents as reported by @isaacsando
2018-02-18 17:23:49 +01:00
Daniel Quinn
9a47dba494 Add notes about optional disabling of logins 2018-02-18 16:19:29 +00:00
Daniel Quinn
6edee78145 Merge pull request #295 from matthewmoto/disable_login
Disable login with a creative hack
2018-02-18 17:15:51 +01:00
Daniel Quinn
30eac99a61 Update changelog with #303 2018-02-18 16:06:10 +00:00
Daniel Quinn
ea6d040809 Monitor return codes of calls to convert and unpaper
...and handle the failures nicely.  Addresses #303.
2018-02-18 16:02:27 +00:00
Daniel Quinn
8e9d5caa37 Rename .TEXT_CACHE to .text
Properties should use snake_case, and only constants should be ALL_CAPS.
This change also makes use of the convention of "private" properties
being prefixed with `_`.
2018-02-18 16:00:43 +00:00
Daniel Quinn
122aa2b9f1 Make isort happy 2018-02-18 16:00:03 +00:00
Daniel Quinn
fb1da4834c Style and removal of Python 2.7 stuff 2018-02-18 15:55:55 +00:00
Wolf-Bastian Pöttner
96c7222269 Improved regular expression to only match for (unicode) characters in month names + parsed one regex match after another until one gave a parsable date 2018-02-14 21:41:04 +01:00
Wolf-Bastian Pöttner
1737e27b34 Add more (fast-running) unit tests 2018-02-14 21:41:01 +01:00
Wolf-Bastian Pöttner
39f198138a Extended exception handling 2018-02-12 22:43:16 +01:00
Wolf-Bastian Pöttner
c74bb84c83 Added log output for date detected in document 2018-02-12 22:41:19 +01:00
Wolf-Bastian Pöttner
07d06d9aee Extends the regex to find dates in documents as reported by @isaacsando 2018-02-12 22:41:15 +01:00
Daniel Quinn
9cef689106 Merge branch 'rhaamo-add_retagger_correspondents' 2018-02-08 20:03:51 +00:00
Daniel Quinn
11a9c756b3 Updated for style and to add a --use-first option 2018-02-08 20:03:29 +00:00
Dashie
11accaff7f Fix line length 2018-02-08 19:34:48 +00:00
Dashie
77aee832e4 Add manager command to re-tag documents without correspondent 2018-02-08 19:34:48 +00:00
Matt
4bde14368c Rejiggering for more pycodestyle issues... 2018-02-08 09:01:10 -05:00
Matt
151d85f2be Moving auto-auth logic to more Django-flavored locations and correcting some readability/stylistic considerations requested by the upstream maintainer 2018-02-08 08:46:33 -05:00
Matt
e7c23cfb92 Making paperless.conf DISABLE_LOGIN default explicitly to "false" for clarity 2018-02-06 10:03:50 -05:00
Georgi Todorov
3f9ea7b971 Add a dependency link between two containers and change the way Django migrations are done. Provide example for auto-starting the containers 2018-02-05 18:23:35 -05:00
Matt
998c3ef51b Merge branch 'master' of https://github.com/danielquinn/paperless into disable_login 2018-02-05 11:48:15 -05:00
Daniel Quinn
c85b6b425d Fix docs to better reflect reality of extensions 2018-02-04 13:33:08 +00:00
Daniel Quinn
273916973c Remove superfluous code 2018-02-04 13:27:26 +00:00
Matt
96c517d65c Rejiggering code style to make upstream pycodestyle checks happy 2018-02-02 09:50:43 -05:00
Matt
e70ad3d493 Fixing formatting to be compatible with upstream repo for login disabling patch 2018-02-01 17:32:08 -05:00
Matt Meno
516bc48a33 Updating changes to allow for disabling login 2018-02-01 17:28:19 -05:00
Matt
7f97716ae9 Testing auth disabling mods 2018-02-01 17:28:04 -05:00
19 changed files with 503 additions and 104 deletions

View File

@@ -17,7 +17,7 @@ ENV PAPERLESS_EXPORT_DIR=/export \
# Install dependencies
RUN apk --no-cache --update add \
python3 gnupg libmagic bash \
python3 gnupg libmagic bash shadow \
sudo poppler tesseract-ocr imagemagick ghostscript unpaper && \
apk --no-cache add --virtual .build-dependencies \
python3-dev poppler-dev gcc g++ musl-dev zlib-dev jpeg-dev && \
@@ -30,8 +30,6 @@ RUN apk --no-cache --update add \
apk del .build-dependencies && \
# Create the consumption directory
mkdir -p $PAPERLESS_CONSUMPTION_DIR && \
# Migrate database
./src/manage.py migrate && \
# Create user
addgroup -g 1000 paperless && \
adduser -D -u 1000 -G paperless -h /usr/src/paperless paperless && \

View File

@@ -1,12 +1,19 @@
version: '2'
version: '2.1'
services:
webserver:
build: ./
# uncomment the following line to start automatically on system boot
# restart: always
ports:
# You can adapt the port you want Paperless to listen on by
# modifying the part before the `:`.
- "8000:8000"
healthcheck:
test: ["CMD", "curl" , "-f", "http://localhost:8000"]
interval: 30s
timeout: 10s
retries: 5
volumes:
- data:/usr/src/paperless/data
- media:/usr/src/paperless/media
@@ -21,6 +28,11 @@ services:
consumer:
build: ./
# uncomment the following line to start automatically on system boot
# restart: always
depends_on:
webserver:
condition: service_healthy
volumes:
- data:/usr/src/paperless/data
- media:/usr/src/paperless/media

View File

@@ -1,6 +1,29 @@
Changelog
#########
1.3.0 (Unreleased)
==================
* You can now run Paperless without a login, though you'll still have to create
at least one user. This is thanks to a pull-request from `matthewmoto`_:
`#295`_. Note that logins are still required by default, and that you need
to disable them by setting ``PAPERLESS_DISABLE_LOGIN="true"`` in your
environment or in ``/etc/paperless.conf``.
* Fix for `#303`_ where sketchily-formatted documents could cause the consumer
to break and insert half-records into the database breaking all sorts of
things. We now capture the return codes of both ``convert`` and ``unpaper``
and fail-out nicely.
* Fix for additional date types thanks to input from `Isaac`_ and code from
`BastianPoe`_ (`#301`_).
* Fix for running migrations in the Docker container (`#299`_). Thanks to
`Georgi Todorov`_ for the fix (`#300`_) and to `Pit`_ for the review.
* Fix for Docker cases where the issuing user is not UID 1000. This was a
collaborative fix between `Jeffrey Portman`_ and `Pit`_ in `#311`_ and
`#312`_ to fix `#306`_.
* Patch the historical migrations to support MySQL's um, *interesting* way of
handing indexes (`#308`_). Thanks to `Simon Taddiken`_ for reporting the
problem and helping me find where to fix it.
1.2.0
=====
@@ -329,6 +352,11 @@ Changelog
.. _Dan Panzarella: https://github.com/pzl
.. _addadi: https://github.com/addadi
.. _BastianPoe: https://github.com/BastianPoe
.. _matthewmoto: https://github.com/BastianPoe
.. _Isaac: https://github.com/isaacsando
.. _Georgi Todorov: https://github.com/TeraHz
.. _Jeffrey Portman: https://github.com/ChromoX
.. _Simon Taddiken: https://github.com/skuzzle
.. _#20: https://github.com/danielquinn/paperless/issues/20
.. _#44: https://github.com/danielquinn/paperless/issues/44
@@ -380,6 +408,15 @@ Changelog
.. _#256: https://github.com/danielquinn/paperless/pull/256
.. _#285: https://github.com/danielquinn/paperless/pull/285
.. _#291: https://github.com/danielquinn/paperless/pull/291
.. _#295: https://github.com/danielquinn/paperless/pull/295
.. _#299: https://github.com/danielquinn/paperless/issues/299
.. _#300: https://github.com/danielquinn/paperless/pull/300
.. _#301: https://github.com/danielquinn/paperless/issues/301
.. _#303: https://github.com/danielquinn/paperless/issues/303
.. _#306: https://github.com/danielquinn/paperless/issues/306
.. _#308: https://github.com/danielquinn/paperless/issues/308
.. _#311: https://github.com/danielquinn/paperless/pull/311
.. _#312: https://github.com/danielquinn/paperless/pull/312
.. _pipenv: https://docs.pipenv.org/
.. _a new home on Docker Hub: https://hub.docker.com/r/danielquinn/paperless/
.. _a new home on Docker Hub: https://hub.docker.com/r/danielquinn/paperless/

View File

@@ -50,9 +50,19 @@ this:
1. Consumer finds a file in the consumption directory.
2. It asks all the available parsers: *"Hey, can you handle this file?"*
3. The first parser that says yes gets to handle the file. The order in which
the parsers are asked is handled by sorting ``INSTALLED_APPS`` in
``settings.py``.
3. Each parser responds with either ``None`` meaning they can't handle the
file, or a dictionary in the following format:
.. code:: python
{
"parser": <the class name>,
"weight": <an integer>
}
The consumer compares the ``weight`` values from all respondents and uses the
class with the highest value to consume the document. The default parser,
``RasterisedDocumentParser`` has a weight of ``0``.
.. _extending-parsers-appspy:
@@ -61,7 +71,7 @@ apps.py
.......
This is a standard Django file, but you'll need to add some code to it to
register your parser as being able to handle particular files.
connect your parser to the ``document_consumer_declaration`` signal.
.. _extending-parsers-finally:
@@ -79,14 +89,12 @@ the list like this:
INSTALLED_APPS = [
...
"my_module.apps.MyModuleConfig",
"paperless_tesseract.apps.PaperlessTesseractConfig",
...
]
Note that we're placing our module *above* ``PaperlessTesseractConfig``. This
is to ensure that if your module wants to handle any files typically handled by
the default module, yours will win instead. If there's no conflict between
what your module does and the default, then order doesn't matter.
Order doesn't matter, but generally it's a good idea to place your module lower
in the list so that you don't end up accidentally overriding project defaults
somewhere.
.. _extending-parsers-example:

View File

@@ -94,7 +94,7 @@ You may want to take a look at the ``paperless.conf.example`` file to see if
there's anything new in there compared to what you've got int ``/etc``.
If you are :ref:`using Docker <setup-installation-docker>` the update process
requires only one additional step:
is similar:
.. code-block:: shell-session
@@ -102,7 +102,6 @@ requires only one additional step:
$ git pull
$ docker build -t paperless .
$ docker-compose up -d
$ docker-compose run --rm webserver migrate
If ``git pull`` doesn't report any changes, there is no need to continue with
the remaining steps.

View File

@@ -86,6 +86,11 @@ PAPERLESS_PASSPHRASE="secret"
# https://docs.djangoproject.com/en/1.11/ref/settings/#force-script-name
#PAPERLESS_FORCE_SCRIPT_NAME=""
# If you are using alternative authentication means or are just using paperless
# as a single user on a small private network, this option allows you to disable
# user authentication if you set it to "true"
#PAPERLESS_DISABLE_LOGIN="false"
###############################################################################
#### Software Tweaks ####
###############################################################################

View File

@@ -4,13 +4,13 @@ set -e
# Source: https://github.com/sameersbn/docker-gitlab/
map_uidgid() {
USERMAP_ORIG_UID=$(id -u paperless)
USERMAP_ORIG_UID=$(id -g paperless)
USERMAP_GID=${USERMAP_GID:-${USERMAP_UID:-$USERMAP_ORIG_GID}}
USERMAP_UID=${USERMAP_UID:-$USERMAP_ORIG_UID}
if [[ ${USERMAP_UID} != "${USERMAP_ORIG_UID}" || ${USERMAP_GID} != "${USERMAP_ORIG_GID}" ]]; then
echo "Mapping UID and GID for paperless:paperless to $USERMAP_UID:$USERMAP_GID"
addgroup -g "${USERMAP_GID}" paperless
sed -i -e "s|:${USERMAP_ORIG_UID}:${USERMAP_GID}:|:${USERMAP_UID}:${USERMAP_GID}:|" /etc/passwd
USERMAP_ORIG_GID=$(id -g paperless)
USERMAP_NEW_UID=${USERMAP_UID:-$USERMAP_ORIG_UID}
USERMAP_NEW_GID=${USERMAP_GID:-${USERMAP_ORIG_GID:-$USERMAP_NEW_UID}}
if [[ ${USERMAP_NEW_UID} != "${USERMAP_ORIG_UID}" || ${USERMAP_NEW_GID} != "${USERMAP_ORIG_GID}" ]]; then
echo "Mapping UID and GID for paperless:paperless to $USERMAP_NEW_UID:$USERMAP_NEW_GID"
usermod -u "${USERMAP_NEW_UID}" paperless
groupmod -g "${USERMAP_NEW_GID}" paperless
fi
}
@@ -42,9 +42,24 @@ set_permissions() {
chown -Rh paperless:paperless /usr/src/paperless
}
migrations() {
# A simple lock file in case other containers use this startup
LOCKFILE="/usr/src/paperless/data/db.sqlite3.migration"
set -o noclobber
# check for and create lock file in one command
(> ${LOCKFILE}) &> /dev/null
if [ $? -eq 0 ]
then
sudo -HEu paperless "/usr/src/paperless/src/manage.py" "migrate"
rm ${LOCKFILE}
fi
}
initialize() {
map_uidgid
set_permissions
migrations
}
install_languages() {
@@ -64,7 +79,7 @@ install_languages() {
if [ "$lang" == "eng" ]; then
continue
fi
if apk info -e "$pkg" > /dev/null 2>&1; then
continue
fi

View File

@@ -22,7 +22,7 @@ class ConsumerError(Exception):
pass
class Consumer(object):
class Consumer:
"""
Loop over every file found in CONSUMPTION_DIR and:
1. Convert it to a greyscale pnm
@@ -117,10 +117,10 @@ class Consumer(object):
)
parsed_document = parser_class(doc)
thumbnail = parsed_document.get_thumbnail()
date = parsed_document.get_date()
try:
thumbnail = parsed_document.get_thumbnail()
date = parsed_document.get_date()
document = self._store(
parsed_document.get_text(),
doc,

View File

@@ -0,0 +1,82 @@
import sys
from django.core.management.base import BaseCommand
from documents.models import Correspondent, Document
from ...mixins import Renderable
class Command(Renderable, BaseCommand):
help = """
Using the current set of correspondent rules, apply said rules to all
documents in the database, effectively allowing you to back-tag all
previously indexed documents with correspondent created (or modified)
after their initial import.
""".replace(" ", "")
TOO_MANY_CONTINUE = (
"Detected {} potential correspondents for {}, so we've opted for {}")
TOO_MANY_SKIP = (
"Detected {} potential correspondents for {}, so we're skipping it")
CHANGE_MESSAGE = (
'Document {}: "{}" was given the correspondent id {}: "{}"')
def __init__(self, *args, **kwargs):
self.verbosity = 0
BaseCommand.__init__(self, *args, **kwargs)
def add_arguments(self, parser):
parser.add_argument(
"--use-first",
default=False,
action="store_true",
help="By default this command won't try to assign a correspondent "
"if more than one matches the document. Use this flag if "
"you'd rather it just pick the first one it finds."
)
def handle(self, *args, **options):
self.verbosity = options["verbosity"]
for document in Document.objects.filter(correspondent__isnull=True):
potential_correspondents = list(
Correspondent.match_all(document.content))
if not potential_correspondents:
continue
potential_count = len(potential_correspondents)
correspondent = potential_correspondents[0]
if potential_count > 1:
if not options["use_first"]:
print(
self.TOO_MANY_SKIP.format(potential_count, document),
file=sys.stderr
)
continue
print(
self.TOO_MANY_CONTINUE.format(
potential_count,
document,
correspondent
),
file=sys.stderr
)
document.correspondent = correspondent
document.save(update_fields=("correspondent",))
print(
self.CHANGE_MESSAGE.format(
document.pk,
document.title,
correspondent.pk,
correspondent.name
),
file=sys.stderr
)

View File

@@ -3,6 +3,7 @@
from __future__ import unicode_literals
from django.db import migrations, models
from django.conf import settings
class Migration(migrations.Migration):
@@ -15,6 +16,6 @@ class Migration(migrations.Migration):
migrations.AlterField(
model_name='document',
name='content',
field=models.TextField(blank=True, db_index=True, help_text='The raw, text-only data of the document. This field is primarily used for searching.'),
field=models.TextField(blank=True, db_index=("mysql" not in settings.DATABASES["default"]["ENGINE"]), help_text='The raw, text-only data of the document. This field is primarily used for searching.'),
),
]

View File

@@ -30,15 +30,8 @@ from .serialisers import (
class IndexView(TemplateView):
template_name = "documents/index.html"
def get_context_data(self, **kwargs):
print(kwargs)
print(self.request.GET)
print(self.request.POST)
return TemplateView.get_context_data(self, **kwargs)
class FetchView(SessionOrBasicAuthMixin, DetailView):

View File

@@ -0,0 +1,14 @@
from django.utils.deprecation import MiddlewareMixin
from .models import User
class Middleware (MiddlewareMixin):
"""
This is a dummy authentication middleware class that creates what
is roughly an Anonymous authenticated user so we can disable login
and not interfere with existing user ID's. It's only used if
login is disabled in paperless.conf (default is to require login)
"""
def process_request(self, request):
request.user = User()

26
src/paperless/models.py Normal file
View File

@@ -0,0 +1,26 @@
class User:
"""
This is a dummy django User used with our middleware to disable
login authentication if that is configured in paperless.conf
"""
is_superuser = True
is_active = True
is_staff = True
is_authenticated = True
# Must be -1 to avoid colliding with real user ID's (which start at 1)
id = -1
@property
def pk(self):
return self.id
"""
NOTE: These are here as a hack instead of being in the User definition
above due to the way pycodestyle handles lamdbdas.
See https://github.com/PyCQA/pycodestyle/issues/379 for more.
"""
User.has_module_perms = lambda *_: True
User.has_perm = lambda *_: True

View File

@@ -77,6 +77,8 @@ INSTALLED_APPS = [
if os.getenv("PAPERLESS_INSTALLED_APPS"):
INSTALLED_APPS += os.getenv("PAPERLESS_INSTALLED_APPS").split(",")
MIDDLEWARE_CLASSES = [
'django.middleware.security.SecurityMiddleware',
'django.contrib.sessions.middleware.SessionMiddleware',
@@ -88,6 +90,12 @@ MIDDLEWARE_CLASSES = [
'django.middleware.clickjacking.XFrameOptionsMiddleware',
]
#If AUTH is disabled, we just use our "bypass" authentication middleware
if bool(os.getenv("PAPERLESS_DISABLE_LOGIN", "false").lower() in ("yes", "y", "1", "t", "true")):
_index = MIDDLEWARE_CLASSES.index('django.contrib.auth.middleware.AuthenticationMiddleware')
MIDDLEWARE_CLASSES[_index] = 'paperless.middleware.Middleware'
MIDDLEWARE_CLASSES.remove('django.contrib.auth.middleware.SessionAuthenticationMiddleware')
ROOT_URLCONF = 'paperless.urls'
TEMPLATES = [

View File

@@ -1 +1 @@
__version__ = (1, 2, 0)
__version__ = (1, 3, 0)

View File

@@ -3,18 +3,19 @@ import os
import re
import subprocess
from multiprocessing.pool import Pool
import dateparser
import pdftotext
import dateparser
import langdetect
import pyocr
from django.conf import settings
from documents.parsers import DocumentParser, ParseError
from PIL import Image
from pyocr.libtesseract.tesseract_raw import \
TesseractError as OtherTesseractError
from pyocr.tesseract import TesseractError
import pdftotext
from documents.parsers import DocumentParser, ParseError
from .languages import ISO639
@@ -35,7 +36,10 @@ class RasterisedDocumentParser(DocumentParser):
DATE_ORDER = settings.DATE_ORDER
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
OCR_ALWAYS = settings.OCR_ALWAYS
TEXT_CACHE = None
def __init__(self, path):
super().__init__(path)
self._text = None
def get_thumbnail(self):
"""
@@ -52,31 +56,29 @@ class RasterisedDocumentParser(DocumentParser):
return os.path.join(self.tempdir, "convert-0000.png")
def _is_ocred(self):
# Extract text from PDF using pdftotext
text = get_text_from_pdf(self.document_path)
# We assume, that a PDF with at least 50 characters contains text
# (so no OCR required)
if len(text) > 50:
return True
return False
return len(text) > 50
def get_text(self):
if self.TEXT_CACHE is not None:
return self.TEXT_CACHE
if self._text is not None:
return self._text
if not self.OCR_ALWAYS and self._is_ocred():
self.log("info", "Skipping OCR, using Text from PDF")
self.TEXT_CACHE = get_text_from_pdf(self.document_path)
return self.TEXT_CACHE
self._text = get_text_from_pdf(self.document_path)
return self._text
images = self._get_greyscale()
try:
self.TEXT_CACHE = self._get_ocr(images)
return self.TEXT_CACHE
self._text = self._get_ocr(images)
return self._text
except OCRError as e:
raise ParseError(e)
@@ -200,7 +202,13 @@ class RasterisedDocumentParser(DocumentParser):
return text
def get_date(self):
text = self.get_text()
date = None
datestring = None
try:
text = self.get_text()
except ParseError as e:
return None
# This regular expression will try to find dates in the document at
# hand and will match the following formats:
@@ -208,19 +216,38 @@ class RasterisedDocumentParser(DocumentParser):
# - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
# - MONTH ZZZZ
m = re.search(
# - MONTH ZZZZ, with ZZZZ being 4 digits
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
pattern = re.compile(
r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
r'\b([0-9]{1,2}\. [^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
r'\b([^ ]{3,9} [0-9]{4})\b', text)
r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
r'\b([^\W\d_]{3,9} [0-9]{4})\b')
if m is None:
return None
# Iterate through all regex matches and try to parse the date
for m in re.finditer(pattern, text):
datestring = m.group(0)
return dateparser.parse(m.group(0),
settings={'DATE_ORDER': self.DATE_ORDER,
'PREFER_DAY_OF_MONTH': 'first',
'RETURN_AS_TIMEZONE_AWARE': True})
try:
date = dateparser.parse(
datestring,
settings={'DATE_ORDER': self.DATE_ORDER,
'PREFER_DAY_OF_MONTH': 'first',
'RETURN_AS_TIMEZONE_AWARE': True})
except TypeError:
# Skip all matches that do not parse to a proper date
continue
if date is not None:
break
if date is not None:
self.log("info", "Detected document date " + date.strftime("%x") +
" based on string " + datestring)
else:
self.log("info", "Unable to detect date for document")
return date
def run_convert(*args):
@@ -231,13 +258,15 @@ def run_convert(*args):
if settings.CONVERT_TMPDIR:
environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR
subprocess.Popen(args, env=environment).wait()
if not subprocess.Popen(args, env=environment).wait() == 0:
raise ParseError("Convert failed at {}".format(args))
def run_unpaper(args):
unpaper, pnm = args
subprocess.Popen(
(unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm"))).wait()
command_args = unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm")
if not subprocess.Popen(command_args).wait() == 0:
raise ParseError("Unpaper failed at {}".format(command_args))
def strip_excess_whitespace(text):
@@ -262,6 +291,7 @@ def image_to_string(args):
def get_text_from_pdf(pdf_file):
with open(pdf_file, "rb") as f:
try:
pdf = pdftotext.PDF(f)

Binary file not shown.

Binary file not shown.

View File

@@ -25,14 +25,145 @@ class TestDate(TestCase):
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_date_format_1(self):
input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file)
document._text = "lorem ipsum 130218 lorem ipsum"
self.assertEqual(document.get_date(), None)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
def test_date_format_2(self):
input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file)
document._text = "lorem ipsum 2018 lorem ipsum"
self.assertEqual(document.get_date(), None)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
def test_date_format_3(self):
input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file)
document._text = "lorem ipsum 20180213 lorem ipsum"
self.assertEqual(document.get_date(), None)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
def test_date_format_4(self):
input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file)
document._text = "lorem ipsum 13.02.2018 lorem ipsum"
self.assertEqual(
document.get_date(),
datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.tzutc())
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
def test_date_format_5(self):
input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file)
document._text = (
"lorem ipsum 130218, 2018, 20180213 and 13.02.2018 lorem ipsum")
self.assertEqual(
document.get_date(),
datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.tzutc())
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
def test_date_format_6(self):
input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file)
document._text = (
"lorem ipsum\n"
"Wohnort\n"
"3100\n"
"IBAN\n"
"AT87 4534\n"
"1234\n"
"1234 5678\n"
"BIC\n"
"lorem ipsum"
)
self.assertEqual(document.get_date(), None)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
def test_date_format_7(self):
input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file)
document._text = (
"lorem ipsum\n"
"März 2019\n"
"lorem ipsum"
)
self.assertEqual(
document.get_date(),
datetime.datetime(2019, 3, 1, 0, 0, tzinfo=tz.tzutc())
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
def test_date_format_8(self):
input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file)
document._text = ("lorem ipsum\n"
"Wohnort\n"
"3100\n"
"IBAN\n"
"AT87 4534\n"
"1234\n"
"1234 5678\n"
"BIC\n"
"lorem ipsum\n"
"März 2020")
self.assertEqual(document.get_date(),
datetime.datetime(2020, 3, 1, 0, 0,
tzinfo=tz.tzutc()))
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
def test_date_format_9(self):
input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file)
document._text = ("lorem ipsum\n"
"27. Nullmonth 2020\n"
"März 2020\n"
"lorem ipsum")
self.assertEqual(document.get_date(),
datetime.datetime(2020, 3, 1, 0, 0,
tzinfo=tz.tzutc()))
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
def test_get_text_1_pdf(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_1.pdf")
document = RasterisedDocumentParser(input_file)
document.get_text()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(document.get_date(),
datetime.datetime(2018, 4, 1, 0, 0,
tzinfo=tz.tzutc()))
self.assertEqual(
document.get_date(),
datetime.datetime(2018, 4, 1, 0, 0, tzinfo=tz.tzutc())
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
@@ -43,9 +174,10 @@ class TestDate(TestCase):
document = RasterisedDocumentParser(input_file)
document.get_text()
self.assertEqual(document._is_ocred(), False)
self.assertEqual(document.get_date(),
datetime.datetime(2018, 4, 1, 0, 0,
tzinfo=tz.tzutc()))
self.assertEqual(
document.get_date(),
datetime.datetime(2018, 4, 1, 0, 0, tzinfo=tz.tzutc())
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
@@ -56,9 +188,10 @@ class TestDate(TestCase):
document = RasterisedDocumentParser(input_file)
document.get_text()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(document.get_date(),
datetime.datetime(2013, 2, 1, 0, 0,
tzinfo=tz.tzutc()))
self.assertEqual(
document.get_date(),
datetime.datetime(2013, 2, 1, 0, 0, tzinfo=tz.tzutc())
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
@@ -69,9 +202,10 @@ class TestDate(TestCase):
document = RasterisedDocumentParser(input_file)
document.get_text()
self.assertEqual(document._is_ocred(), False)
self.assertEqual(document.get_date(),
datetime.datetime(2013, 2, 1, 0, 0,
tzinfo=tz.tzutc()))
self.assertEqual(
document.get_date(),
datetime.datetime(2013, 2, 1, 0, 0, tzinfo=tz.tzutc())
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
@@ -82,9 +216,10 @@ class TestDate(TestCase):
document = RasterisedDocumentParser(input_file)
document.get_text()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(document.get_date(),
datetime.datetime(2018, 10, 5, 0, 0,
tzinfo=tz.tzutc()))
self.assertEqual(
document.get_date(),
datetime.datetime(2018, 10, 5, 0, 0, tzinfo=tz.tzutc())
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
@@ -95,9 +230,10 @@ class TestDate(TestCase):
document = RasterisedDocumentParser(input_file)
document.get_text()
self.assertEqual(document._is_ocred(), False)
self.assertEqual(document.get_date(),
datetime.datetime(2018, 10, 5, 0, 0,
tzinfo=tz.tzutc()))
self.assertEqual(
document.get_date(),
datetime.datetime(2018, 10, 5, 0, 0, tzinfo=tz.tzutc())
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
@@ -108,9 +244,10 @@ class TestDate(TestCase):
document = RasterisedDocumentParser(input_file)
document.get_text()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(document.get_date(),
datetime.datetime(2018, 10, 5, 0, 0,
tzinfo=tz.tzutc()))
self.assertEqual(
document.get_date(),
datetime.datetime(2018, 10, 5, 0, 0, tzinfo=tz.tzutc())
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
@@ -121,9 +258,10 @@ class TestDate(TestCase):
document = RasterisedDocumentParser(input_file)
document.get_text()
self.assertEqual(document._is_ocred(), False)
self.assertEqual(document.get_date(),
datetime.datetime(2018, 10, 5, 0, 0,
tzinfo=tz.tzutc()))
self.assertEqual(
document.get_date(),
datetime.datetime(2018, 10, 5, 0, 0, tzinfo=tz.tzutc())
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
@@ -134,9 +272,10 @@ class TestDate(TestCase):
document = RasterisedDocumentParser(input_file)
document.get_text()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(document.get_date(),
datetime.datetime(2018, 12, 17, 0, 0,
tzinfo=tz.tzutc()))
self.assertEqual(
document.get_date(),
datetime.datetime(2018, 12, 17, 0, 0, tzinfo=tz.tzutc())
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
@@ -147,9 +286,10 @@ class TestDate(TestCase):
document = RasterisedDocumentParser(input_file)
document.get_text()
self.assertEqual(document._is_ocred(), False)
self.assertEqual(document.get_date(),
datetime.datetime(2018, 12, 17, 0, 0,
tzinfo=tz.tzutc()))
self.assertEqual(
document.get_date(),
datetime.datetime(2018, 12, 17, 0, 0, tzinfo=tz.tzutc())
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
@@ -161,9 +301,10 @@ class TestDate(TestCase):
document.get_text()
document.DATE_ORDER = "MDY"
self.assertEqual(document._is_ocred(), True)
self.assertEqual(document.get_date(),
datetime.datetime(2018, 12, 17, 0, 0,
tzinfo=tz.tzutc()))
self.assertEqual(
document.get_date(),
datetime.datetime(2018, 12, 17, 0, 0, tzinfo=tz.tzutc())
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
@@ -175,9 +316,10 @@ class TestDate(TestCase):
document.get_text()
document.DATE_ORDER = "MDY"
self.assertEqual(document._is_ocred(), False)
self.assertEqual(document.get_date(),
datetime.datetime(2018, 12, 17, 0, 0,
tzinfo=tz.tzutc()))
self.assertEqual(
document.get_date(),
datetime.datetime(2018, 12, 17, 0, 0, tzinfo=tz.tzutc())
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
@@ -210,6 +352,35 @@ class TestDate(TestCase):
document = RasterisedDocumentParser(input_file)
document.get_text()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(document.get_date(),
datetime.datetime(2018, 4, 1, 0, 0,
tzinfo=tz.tzutc()))
self.assertEqual(
document.get_date(),
datetime.datetime(2018, 4, 1, 0, 0, tzinfo=tz.tzutc())
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
def test_get_text_8_pdf(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_8.pdf")
document = RasterisedDocumentParser(input_file)
document.get_text()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(
document.get_date(),
datetime.datetime(2017, 12, 31, 0, 0, tzinfo=tz.tzutc())
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
def test_get_text_9_pdf(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_9.pdf")
document = RasterisedDocumentParser(input_file)
document.get_text()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(
document.get_date(),
datetime.datetime(2017, 12, 31, 0, 0, tzinfo=tz.tzutc())
)