diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 2c1cf025b..270289879 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -46,6 +46,7 @@ from documents.signals.handlers import run_workflows from documents.templating.workflows import parse_w_workflow_placeholders from documents.utils import copy_basic_file_stats from documents.utils import copy_file_with_basic_stats +from documents.utils import normalize_nfc from documents.utils import run_subprocess from paperless_mail.parsers import MailDocumentParser @@ -111,7 +112,12 @@ class ConsumerPluginMixin: self.renew_logging_group() - self.filename = self.metadata.filename or self.input_doc.original_file.name + self.metadata.filename = normalize_nfc(self.metadata.filename) + self.metadata.title = normalize_nfc(self.metadata.title) + + self.filename = normalize_nfc( + self.metadata.filename or self.input_doc.original_file.name, + ) def _send_progress( self, @@ -652,6 +658,8 @@ class ConsumerPlugin( f"Error occurred parsing title override '{self.metadata.title}', falling back to original. Exception: {e}", ) + title = normalize_nfc(title) + file_for_checksum = ( self.unmodified_original if self.unmodified_original is not None diff --git a/src/documents/file_handling.py b/src/documents/file_handling.py index 48cd57311..c4d6a0822 100644 --- a/src/documents/file_handling.py +++ b/src/documents/file_handling.py @@ -6,6 +6,7 @@ from django.conf import settings from documents.models import Document from documents.templating.filepath import validate_filepath_template_and_render from documents.templating.utils import convert_format_str_to_template_format +from documents.utils import normalize_nfc def create_source_path_directory(source_path: Path) -> None: @@ -55,11 +56,11 @@ def generate_unique_filename(doc, *, archive_filename=False) -> Path: """ if archive_filename: old_filename: Path | None = ( - Path(doc.archive_filename) if doc.archive_filename else None + Path(normalize_nfc(doc.archive_filename)) if doc.archive_filename else None ) root = settings.ARCHIVE_DIR else: - old_filename = Path(doc.filename) if doc.filename else None + old_filename = Path(normalize_nfc(doc.filename)) if doc.filename else None root = settings.ORIGINALS_DIR # If generating archive filenames, try to make a name that is similar to @@ -91,7 +92,7 @@ def generate_unique_filename(doc, *, archive_filename=False) -> Path: ) if new_filename == old_filename: # still the same as before. - return new_filename + return Path(normalize_nfc(str(new_filename))) if (root / new_filename).exists(): counter += 1 @@ -119,7 +120,7 @@ def format_filename(document: Document, template_str: str) -> str | None: "none", ) # backward compatibility - return rendered_filename + return normalize_nfc(rendered_filename) def generate_filename( @@ -174,4 +175,4 @@ def generate_filename( if append_gpg and doc.storage_type == doc.STORAGE_TYPE_GPG: full_path = full_path.with_suffix(full_path.suffix + ".gpg") - return full_path + return Path(normalize_nfc(str(full_path))) diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 63d6f8f5b..416eb2142 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -290,6 +290,23 @@ class TestConsumer( self._assert_first_last_send_progress() + def test_override_filename_normalized(self): + filename = self.get_test_file() + override_filename = "Inhaltsu\u0308bersicht.pdf" + + with self.get_consumer( + filename, + DocumentMetadataOverrides(filename=override_filename), + ) as consumer: + consumer.run() + + document = Document.objects.first() + + self.assertIsNotNone(document) + self.assertEqual(document.original_filename, "Inhaltsübersicht.pdf") + self.assertEqual(document.title, "Inhaltsübersicht") + self._assert_first_last_send_progress() + def testOverrideTitle(self): with self.get_consumer( self.get_test_file(), @@ -304,6 +321,25 @@ class TestConsumer( self.assertEqual(document.title, "Override Title") self._assert_first_last_send_progress() + @override_settings(FILENAME_FORMAT="{{ title }}") + def test_filename_format_normalized(self): + filename = self.get_test_file() + title = "Inhaltsu\u0308bersicht Faszination" + + with self.get_consumer( + filename, + DocumentMetadataOverrides(title=title), + ) as consumer: + consumer.run() + + document = Document.objects.first() + + self.assertIsNotNone(document) + self.assertEqual(document.title, "Inhaltsübersicht Faszination") + self.assertEqual(document.filename, "Inhaltsübersicht Faszination.pdf") + self.assertIsFile(document.source_path) + self._assert_first_last_send_progress() + def testOverrideCorrespondent(self): c = Correspondent.objects.create(name="test") diff --git a/src/documents/utils.py b/src/documents/utils.py index 2b6a60749..d21c63781 100644 --- a/src/documents/utils.py +++ b/src/documents/utils.py @@ -1,5 +1,7 @@ import logging import shutil +import unicodedata +from os import PathLike from os import utime from pathlib import Path from subprocess import CompletedProcess @@ -16,6 +18,14 @@ def _coerce_to_path( return Path(source).resolve(), Path(dest).resolve() +def normalize_nfc(value: str | PathLike[str] | None) -> str | None: + """Return NFC-normalized string for filesystem-safe comparisons.""" + + if value is None: + return None + return unicodedata.normalize("NFC", str(value)) + + def copy_basic_file_stats(source: Path | str, dest: Path | str) -> None: """ Copies only the m_time and a_time attributes from source to destination.