From 2050b49fde947a98264ed5c0d799f658593abcfa Mon Sep 17 00:00:00 2001 From: TrisNol Date: Mon, 25 Sep 2023 18:37:39 +0200 Subject: [PATCH] fix(data-extraction): Resolve issue in different Bundesanzeiger formats --- .../utils/data_extraction/bundesanzeiger.py | 6 ++++-- tests/utils/data_extraction/bundesanzeiger_test.py | 8 ++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py b/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py index fd5f1ab..f69e78f 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py @@ -62,9 +62,11 @@ class Bundesanzeiger: pd.DataFrame: Filtered and pruned DataFrame """ df_reports["type"] = df_reports.name.apply(lambda name: name.split(" ")[0]) - df_reports = df_reports.loc[df_reports.type == "Jahresabschluss"] + df_reports = df_reports.loc[ + (df_reports.type == "Jahresabschluss") | (df_reports.type == "Jahres-") + ] df_reports["jahr"] = df_reports.name.apply( - lambda name: name.split(" ")[-1].split(".")[-1] + lambda name: re.findall(r"\d{2}\.\d{2}.\d{4}", name)[0].split(".")[-1] ) return df_reports.drop(["name", "report", "type"], axis=1) diff --git a/tests/utils/data_extraction/bundesanzeiger_test.py b/tests/utils/data_extraction/bundesanzeiger_test.py index 8829bbd..94af1ca 100644 --- a/tests/utils/data_extraction/bundesanzeiger_test.py +++ b/tests/utils/data_extraction/bundesanzeiger_test.py @@ -70,7 +70,11 @@ def test_extracct_financial_results() -> None: def test_filter_reports() -> None: test_data = [ {"name": "Bedienungsanleitung", "report": "", "raw_report": ""}, - {"name": "Jahresabschluss 1998", "report": "", "raw_report": ""}, + { + "name": "Jahresabschluss vom 01.01.1998 bis zum 31.12.1998", + "report": "", + "raw_report": "", + }, ] test_df = pd.DataFrame(test_data) ba = Bundesanzeiger() @@ -91,7 +95,7 @@ def test_get_information(mock_bundesanzeiger: Mock) -> None: "raw_report": "", }, "2": { - "name": "Jahresabschluss 1998", + "name": "Jahresabschluss 01.01.1998", "report": "", "company": "PRJ 23 Transparenzregister GmbH", "raw_report": "",