From b972acee7afd97dd8bb1e2af195813e2e3cc9e19 Mon Sep 17 00:00:00 2001 From: TrisNol Date: Sat, 14 Oct 2023 18:22:41 +0200 Subject: [PATCH] fix(data-extraction): Parse date from Gesellschaftsvertrag entry --- .../unternehmensregister/transform.py | 31 +++++++++++++++++-- .../unternehmensregister/transform_test.py | 19 +++++++++++- 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py index 5b78278..21bc43e 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py @@ -367,6 +367,29 @@ def map_business_purpose(data: dict) -> str | None: return None +def extract_date_from_string(value: str) -> str | None: + """Extract a date in ISO format from the given string if possible. + + Args: + value (str): Input text + + Returns: + str | None: Date in ISO format, None if not found + """ + date_regex = [ # type: ignore + {"regex": r"\d{1,2}\.\d{1,2}\.\d{2,4}", "mapper": transform_date_to_iso}, + {"regex": r"\d{4}-\d{1,2}-\d{1,2}", "mapper": None}, + ] + for regex in date_regex: + result = re.findall(regex["regex"], value) # type: ignore + if len(result) == 1: + relevant_data = result[0] + if regex["mapper"] is not None: # type: ignore + return regex["mapper"](relevant_data) # type: ignore + return relevant_data + return None + + def map_founding_date(data: dict) -> str | None: """Extracts the founding date from a given Unternehmensregister export. 
@pytest.mark.parametrize(
    ("value", "expected_result"),
    [
        ("", None),
        ("Tag der ersten Eintragung: 01.05.2004", "2004-05-01"),
        ("Gesellschaftsvertrag vom 06.04.2016 Hallo Welt", "2016-04-06"),
        ("Str. des Tests vom 1999-04-05", "1999-04-05"),
        ("Once upon a midnight dreary while I pondered weak and weary...", None),
    ],
)
def test_extract_date_from_string(value: str, expected_result: str | None) -> None:
    """Check extract_date_from_string against texts with and without a date.

    Args:
        value (str): Text handed to the extraction function
        expected_result (str | None): Expected ISO date, None when the text
            contains no date
    """
    result = transform.extract_date_from_string(value)
    assert result == expected_result