From b972acee7afd97dd8bb1e2af195813e2e3cc9e19 Mon Sep 17 00:00:00 2001 From: TrisNol Date: Sat, 14 Oct 2023 18:22:41 +0200 Subject: [PATCH 1/5] fix(data-extraction): Parse date from Gesellschaftsvertrag entry --- .../unternehmensregister/transform.py | 31 +++++++++++++++++-- .../unternehmensregister/transform_test.py | 19 +++++++++++- 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py index 5b78278..21bc43e 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py @@ -367,6 +367,29 @@ def map_business_purpose(data: dict) -> str | None: return None +def extract_date_from_string(value: str) -> str | None: + """Extract a date in ISO format from the given string if possible. + + Args: + value (str): Input text + + Returns: + str | None: Date in ISO format, None if not found + """ + date_regex = [ # type: ignore + {"regex": r"\d{1,2}\.\d{1,2}\.\d{2,4}", "mapper": transform_date_to_iso}, + {"regex": r"\d{4}-\d{1,2}-\d{1,2}", "mapper": None}, + ] + for regex in date_regex: + result = re.findall(regex["regex"], value) # type: ignore + if len(result) == 1: + relevant_data = result[0] + if regex["mapper"] is not None: # type: ignore + return regex["mapper"](relevant_data) # type: ignore + return relevant_data + return None + + def map_founding_date(data: dict) -> str | None: """Extracts the founding date from a given Unternehmensregister export. @@ -392,9 +415,11 @@ def map_founding_date(data: dict) -> str | None: "Gruendungsmetadaten" in data["XJustiz_Daten"]["Fachdaten_Register"]["Basisdaten_Register"] ): - return data["XJustiz_Daten"]["Fachdaten_Register"]["Basisdaten_Register"][ - "Gruendungsmetadaten" - ]["Gruendungsdatum"] + return extract_date_from_string( + data["XJustiz_Daten"]["Fachdaten_Register"]["Basisdaten_Register"][ + "Gruendungsmetadaten" + ]["Gruendungsdatum"] + ) # No reliable answer return None diff --git a/tests/utils/data_extraction/unternehmensregister/transform_test.py b/tests/utils/data_extraction/unternehmensregister/transform_test.py index 7d602eb..3e32453 100644 --- a/tests/utils/data_extraction/unternehmensregister/transform_test.py +++ b/tests/utils/data_extraction/unternehmensregister/transform_test.py @@ -601,6 +601,21 @@ def test_map_business_purpose_no_result() -> None: assert result is None +@pytest.mark.parametrize( + ("value", "expected_result"), + [ + ("", None), + ("Tag der ersten Eintragung: 01.05.2004", "2004-05-01"), + ("Gesellschaftsvertrag vom 06.04.2016 Hallo Welt", "2016-04-06"), + ("Str. des Tests vom 1999-04-05", "1999-04-05"), + ("Once upon a midnight dreary while I pondered weak and weary...", None), + ], +) +def test_extract_date_from_string(value: str, expected_result: str) -> None: + result = transform.extract_date_from_string(value) + assert result == expected_result + + def test_map_founding_date_from_tag_der_ersten_eintragung() -> None: data = { "some entry": "Tag der ersten Eintragung: 01.05.2004", @@ -626,7 +641,9 @@ def test_map_founding_date_from_gruendungsdatum() -> None: "XJustiz_Daten": { "Fachdaten_Register": { "Basisdaten_Register": { - "Gruendungsmetadaten": {"Gruendungsdatum": "1998-01-01"} + "Gruendungsmetadaten": { + "Gruendungsdatum": "Gesellschaftsvertrag vom 1998-01-01" + } } } } From 39c13ac74aee320c668e6ab74ee2650437bbdf7d Mon Sep 17 00:00:00 2001 From: Tristan Nolde Date: Sun, 15 Oct 2023 11:51:11 +0200 Subject: [PATCH 2/5] Update tests/utils/data_extraction/unternehmensregister/transform_test.py Co-authored-by: Philipp Horstenkamp --- .../data_extraction/unternehmensregister/transform_test.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/utils/data_extraction/unternehmensregister/transform_test.py b/tests/utils/data_extraction/unternehmensregister/transform_test.py index 3e32453..17da71e 100644 --- a/tests/utils/data_extraction/unternehmensregister/transform_test.py +++ b/tests/utils/data_extraction/unternehmensregister/transform_test.py @@ -606,6 +606,9 @@ def test_map_business_purpose_no_result() -> None: [ ("", None), ("Tag der ersten Eintragung: 01.05.2004", "2004-05-01"), + ("Tag der ersten Eintragung: 1.05.2004", "2004-05-01"), + ("Tag der ersten Eintragung: 1.5.2004", "2004-05-01"), + ("Tag der ersten Eintragung: 01.5.2004", "2004-05-01"), ("Gesellschaftsvertrag vom 06.04.2016 Hallo Welt", "2016-04-06"), ("Str. des Tests vom 1999-04-05", "1999-04-05"), ("Once upon a midnight dreary while I pondered weak and weary...", None), From d34a0ffeac300ab1d2412ea7041c8249acbe8c8e Mon Sep 17 00:00:00 2001 From: Tristan Nolde Date: Sun, 15 Oct 2023 11:55:58 +0200 Subject: [PATCH 3/5] Update src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py Co-authored-by: Philipp Horstenkamp --- .../utils/data_extraction/unternehmensregister/transform.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py index 21bc43e..c32fa56 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py @@ -378,7 +378,8 @@ def extract_date_from_string(value: str) -> str | None: """ date_regex = [ # type: ignore {"regex": r"\d{1,2}\.\d{1,2}\.\d{2,4}", "mapper": transform_date_to_iso}, - {"regex": r"\d{4}-\d{1,2}-\d{1,2}", "mapper": None}, + {"regex": r"(20|19)\d{2}-\d{1,2}-\d{1,2}", "mapper": None}, + ] for regex in date_regex: result = re.findall(regex["regex"], value) # type: ignore From 15ace5382d790b76a0c8155d50a8cb5cc46e572b Mon Sep 17 00:00:00 2001 From: Tristan Nolde Date: Sun, 15 Oct 2023 11:56:06 +0200 Subject: [PATCH 4/5] Update src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py Co-authored-by: Philipp Horstenkamp --- .../utils/data_extraction/unternehmensregister/transform.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py index c32fa56..926fe42 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py @@ -377,7 +377,8 @@ def extract_date_from_string(value: str) -> str | None: str | None: Date in ISO format, None if not found """ date_regex = [ # type: ignore - {"regex": r"\d{1,2}\.\d{1,2}\.\d{2,4}", "mapper": transform_date_to_iso}, + {"regex": r"\d{1,2}\.\d{1,2}\.(19|20)?\d{2}", "mapper": transform_date_to_iso}, + {"regex": r"(20|19)\d{2}-\d{1,2}-\d{1,2}", "mapper": None}, ] From eba5235dff56880efddafa34fabe27f2539ed22f Mon Sep 17 00:00:00 2001 From: TrisNol Date: Sun, 15 Oct 2023 12:05:25 +0200 Subject: [PATCH 5/5] refactor: Implement PR feedback --- .../unternehmensregister/transform.py | 16 +++++++++------- .../unternehmensregister/transform_test.py | 4 ++++ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py index 926fe42..468fdee 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py @@ -377,19 +377,21 @@ def extract_date_from_string(value: str) -> str | None: str | None: Date in ISO format, None if not found """ date_regex = [ # type: ignore - {"regex": r"\d{1,2}\.\d{1,2}\.(19|20)?\d{2}", "mapper": transform_date_to_iso}, - - {"regex": r"(20|19)\d{2}-\d{1,2}-\d{1,2}", "mapper": None}, - + {"regex": r"\d{1,2}\.\d{1,2}\.\d{4}", "mapper": transform_date_to_iso}, + {"regex": r"\d{4}-\d{1,2}-\d{1,2}", "mapper": None}, ] + results = [] for regex in date_regex: result = re.findall(regex["regex"], value) # type: ignore if len(result) == 1: relevant_data = result[0] if regex["mapper"] is not None: # type: ignore - return regex["mapper"](relevant_data) # type: ignore - return relevant_data - return None + results.append(regex["mapper"](relevant_data)) # type: ignore + else: + results.append(relevant_data) + if len(results) != 1: + return None + return results[0] def map_founding_date(data: dict) -> str | None: diff --git a/tests/utils/data_extraction/unternehmensregister/transform_test.py b/tests/utils/data_extraction/unternehmensregister/transform_test.py index 17da71e..a312572 100644 --- a/tests/utils/data_extraction/unternehmensregister/transform_test.py +++ b/tests/utils/data_extraction/unternehmensregister/transform_test.py @@ -612,6 +612,10 @@ def test_map_business_purpose_no_result() -> None: ("Gesellschaftsvertrag vom 06.04.2016 Hallo Welt", "2016-04-06"), ("Str. des Tests vom 1999-04-05", "1999-04-05"), ("Once upon a midnight dreary while I pondered weak and weary...", None), + ( + "This company was first founded in 2016-06-10 and then again on 1.5.2004", + None, + ), ], ) def test_extract_date_from_string(value: str, expected_result: str) -> None: