From 2050b49fde947a98264ed5c0d799f658593abcfa Mon Sep 17 00:00:00 2001 From: TrisNol Date: Mon, 25 Sep 2023 18:37:39 +0200 Subject: [PATCH 1/3] fix(data-extraction): Resolve issue in different Bundesanzeiger formats --- .../utils/data_extraction/bundesanzeiger.py | 6 ++++-- tests/utils/data_extraction/bundesanzeiger_test.py | 8 ++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py b/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py index fd5f1ab..f69e78f 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py @@ -62,9 +62,11 @@ class Bundesanzeiger: pd.DataFrame: Filtered and pruned DataFrame """ df_reports["type"] = df_reports.name.apply(lambda name: name.split(" ")[0]) - df_reports = df_reports.loc[df_reports.type == "Jahresabschluss"] + df_reports = df_reports.loc[ + (df_reports.type == "Jahresabschluss") | (df_reports.type == "Jahres-") + ] df_reports["jahr"] = df_reports.name.apply( - lambda name: name.split(" ")[-1].split(".")[-1] + lambda name: re.findall(r"\d{2}\.\d{2}.\d{4}", name)[0].split(".")[-1] ) return df_reports.drop(["name", "report", "type"], axis=1) diff --git a/tests/utils/data_extraction/bundesanzeiger_test.py b/tests/utils/data_extraction/bundesanzeiger_test.py index 8829bbd..94af1ca 100644 --- a/tests/utils/data_extraction/bundesanzeiger_test.py +++ b/tests/utils/data_extraction/bundesanzeiger_test.py @@ -70,7 +70,11 @@ def test_extracct_financial_results() -> None: def test_filter_reports() -> None: test_data = [ {"name": "Bedienungsanleitung", "report": "", "raw_report": ""}, - {"name": "Jahresabschluss 1998", "report": "", "raw_report": ""}, + { + "name": "Jahresabschluss vom 01.01.1998 bis zum 31.12.1998", + "report": "", + "raw_report": "", + }, ] test_df = pd.DataFrame(test_data) ba = Bundesanzeiger() @@ -91,7 
+95,7 @@ def test_get_information(mock_bundesanzeiger: Mock) -> None: "raw_report": "", }, "2": { - "name": "Jahresabschluss 1998", + "name": "Jahresabschluss 01.01.1998", "report": "", "company": "PRJ 23 Transparenzregister GmbH", "raw_report": "", From 7b5cf16e4908b9b82e33e1c12ff54610771d9134 Mon Sep 17 00:00:00 2001 From: TrisNol Date: Mon, 25 Sep 2023 19:33:23 +0200 Subject: [PATCH 2/3] feat: Add simple wrapper to update particular financial entries --- .../utils/data_extraction/bundesanzeiger.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py b/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py index f69e78f..104e48b 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py @@ -61,12 +61,15 @@ class Bundesanzeiger: Returns: pd.DataFrame: Filtered and pruned DataFrame """ + date_regex = r"\d{2}\.\d{2}.\d{4}" + df_reports["type"] = df_reports.name.apply(lambda name: name.split(" ")[0]) df_reports = df_reports.loc[ - (df_reports.type == "Jahresabschluss") | (df_reports.type == "Jahres-") + ((df_reports.type == "Jahresabschluss") | (df_reports.type == "Jahres-")) + & df_reports.name.str.contains(date_regex, regex=True) ] df_reports["jahr"] = df_reports.name.apply( - lambda name: re.findall(r"\d{2}\.\d{2}.\d{4}", name)[0].split(".")[-1] + lambda name: re.findall(date_regex, name)[0].split(".")[-1] ) return df_reports.drop(["name", "report", "type"], axis=1) From 77711d8a2f2feba66bfb569755909ef7e645bf23 Mon Sep 17 00:00:00 2001 From: TrisNol Date: Mon, 25 Sep 2023 19:34:10 +0200 Subject: [PATCH 3/3] feat: Add simple wrapper to update particular financial entries --- .../apps/fix_company_financials.py | 23 +++++++++++++++++++ tests/apps/fix_company_financials_test.py | 6 +++++ 2 files changed, 29 insertions(+) create mode 100644
src/aki_prj23_transparenzregister/apps/fix_company_financials.py create mode 100644 tests/apps/fix_company_financials_test.py diff --git a/src/aki_prj23_transparenzregister/apps/fix_company_financials.py b/src/aki_prj23_transparenzregister/apps/fix_company_financials.py new file mode 100644 index 0000000..0fb2aed --- /dev/null +++ b/src/aki_prj23_transparenzregister/apps/fix_company_financials.py @@ -0,0 +1,23 @@ +"""Fix financial data of particular companies identified by their ID.""" +from aki_prj23_transparenzregister.apps.enrich_company_financials import work +from aki_prj23_transparenzregister.config.config_providers import JsonFileConfigProvider +from aki_prj23_transparenzregister.utils.mongo.company_mongo_service import ( + CompanyMongoService, +) +from aki_prj23_transparenzregister.utils.mongo.connector import MongoConnector + +if __name__ == "__main__": + config_provider = JsonFileConfigProvider("./secrets.json") + + mongo_connector = MongoConnector(config_provider.get_mongo_connection_string()) + company_service = CompanyMongoService(mongo_connector) + + entries = [ + "649f16a4e198338c3b442ab1", + "649f16a5e198338c3b442b0a", + "649f16a5e198338c3b442ac6", + ] + + companies = [company_service.get_by_object_id(entry) for entry in entries] + for company in companies: + work(company, company_service) diff --git a/tests/apps/fix_company_financials_test.py b/tests/apps/fix_company_financials_test.py new file mode 100644 index 0000000..3684269 --- /dev/null +++ b/tests/apps/fix_company_financials_test.py @@ -0,0 +1,6 @@ +"""test aki_prj23_transparenzregister.apps.fix_company_financials.""" +from aki_prj23_transparenzregister.apps import fix_company_financials + + +def test_main() -> None: + assert fix_company_financials