import logging
import multiprocessing
import os
import shutil
import uuid

import ocrmypdf
from django.conf import settings
from django.core.management import call_command
from django.core.management.base import BaseCommand
from ocrmypdf import Verbosity
from whoosh.writing import AsyncWriter

from documents.models import Document
from ... import index
from ...mixins import Renderable
from ...parsers import get_parser_class_for_mime_type


def handle_document(document):
    """Re-parse a single document and store its archived version.

    The parser registered for the document's MIME type is run on the
    document's source file. If the parser produced an archive file, it is
    copied to ``document.archive_path``; otherwise a warning is logged. If
    the parser extracted any text, it replaces ``document.content`` and the
    document is saved.

    This function is used as the worker callable of a multiprocessing pool,
    so it has to stay a picklable module-level function.

    Args:
        document: The ``Document`` instance to process.

    Raises:
        Whatever the parser or the file copy raises; the parser's
        ``cleanup()`` is still invoked in that case.
    """
    mime_type = document.mime_type
    parser_class = get_parser_class_for_mime_type(mime_type)
    parser = parser_class(logging_group=uuid.uuid4())

    try:
        parser.parse(document.source_path, mime_type)

        archive_path = parser.get_archive_path()
        if archive_path:
            shutil.copy(archive_path, document.archive_path)
        else:
            logging.getLogger(__name__).warning(
                f"Parser {parser} did not produce an archived document "
                f"for {document.file_name}"
            )

        text = parser.get_text()
        if text:
            document.content = text
            document.save()
    finally:
        # Always release the parser's resources, even when parsing or
        # copying failed; the exception still propagates to the caller.
        parser.cleanup()


class Command(Renderable, BaseCommand):
    """Management command that (re)creates archived versions of documents."""

    help = (
        "Runs documents through their parser again to create the archived "
        "version of each document, then updates the stored content and the "
        "search index. By default only documents without an existing "
        "archived version are processed; use --overwrite to process all "
        "documents."
    )

    def __init__(self, *args, **kwargs):
        self.verbosity = 0
        BaseCommand.__init__(self, *args, **kwargs)

    def add_arguments(self, parser):
        """Register the ``-f``/``--overwrite`` command line flag."""
        parser.add_argument(
            "-f", "--overwrite",
            default=False,
            action="store_true",
            help="Recreates the archived document for documents that already "
                 "have an archived version."
        )

    def handle(self, *args, **options):
        """Create archived versions in parallel, then refresh the index.

        Documents are selected when ``--overwrite`` is given or when no file
        exists at their ``archive_path``. The selected documents are handed
        to ``handle_document`` in a pool of ``settings.TASK_WORKERS``
        processes, and afterwards each of them is written to the search
        index.
        """
        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)

        overwrite = options["overwrite"]

        with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool:
            # Build a real list (not a one-shot ``filter`` iterator) so the
            # same documents can be iterated again for indexing below. As in
            # the original code, the query is only evaluated after the pool
            # has been created.
            documents_to_process = [
                d for d in Document.objects.all()
                if overwrite or not os.path.exists(d.archive_path)
            ]
            # ``imap`` is lazy; wrapping it in ``list`` waits for every
            # worker and re-raises the first worker exception, if any.
            list(pool.imap(handle_document, documents_to_process))

        ix = index.open_index()
        with AsyncWriter(ix) as writer:
            for d in documents_to_process:
                # The workers saved the new content from separate processes,
                # so the in-memory instance here is stale; reload it before
                # handing it to the index.
                d.refresh_from_db()
                index.update_document(writer, d)