diff --git a/src/aki_prj23_transparenzregister/apps/fix_company_financials.py b/src/aki_prj23_transparenzregister/apps/fix_company_financials.py
new file mode 100644
index 0000000..0fb2aed
--- /dev/null
+++ b/src/aki_prj23_transparenzregister/apps/fix_company_financials.py
@@ -0,0 +1,23 @@
+"""Fix financial data of particular companies identified by their ID."""
+from aki_prj23_transparenzregister.apps.enrich_company_financials import work
+from aki_prj23_transparenzregister.config.config_providers import JsonFileConfigProvider
+from aki_prj23_transparenzregister.utils.mongo.company_mongo_service import (
+    CompanyMongoService,
+)
+from aki_prj23_transparenzregister.utils.mongo.connector import MongoConnector
+
+if __name__ == "__main__":
+    config_provider = JsonFileConfigProvider("./secrets.json")
+
+    mongo_connector = MongoConnector(config_provider.get_mongo_connection_string())
+    company_service = CompanyMongoService(mongo_connector)
+
+    entries = [
+        "649f16a4e198338c3b442ab1",
+        "649f16a5e198338c3b442b0a",
+        "649f16a5e198338c3b442ac6",
+    ]
+
+    companies = [company_service.get_by_object_id(entry) for entry in entries]
+    for company in companies:
+        work(company, company_service)
diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py b/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py
index fd5f1ab..104e48b 100644
--- a/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py
+++ b/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py
@@ -61,10 +61,16 @@ class Bundesanzeiger:
         Returns:
             pd.DataFrame: Filtered and pruned DataFrame
         """
+        # Report names carry dates as DD.MM.YYYY; both separators are literal dots.
+        date_regex = r"\d{2}\.\d{2}\.\d{4}"
+
         df_reports["type"] = df_reports.name.apply(lambda name: name.split(" ")[0])
-        df_reports = df_reports.loc[df_reports.type == "Jahresabschluss"]
+        df_reports = df_reports.loc[
+            ((df_reports.type == "Jahresabschluss") | (df_reports.type == "Jahres-"))
+            & df_reports.name.str.contains(date_regex, regex=True)
+        ]
         df_reports["jahr"] = df_reports.name.apply(
-            lambda name: name.split(" ")[-1].split(".")[-1]
+            lambda name: re.findall(date_regex, name)[0].split(".")[-1]
         )
         return df_reports.drop(["name", "report", "type"], axis=1)

diff --git a/tests/apps/fix_company_financials_test.py b/tests/apps/fix_company_financials_test.py
new file mode 100644
index 0000000..3684269
--- /dev/null
+++ b/tests/apps/fix_company_financials_test.py
@@ -0,0 +1,6 @@
+"""test aki_prj23_transparenzregister.apps.fix_company_financials."""
+from aki_prj23_transparenzregister.apps import fix_company_financials
+
+
+def test_main() -> None:
+    assert fix_company_financials
diff --git a/tests/utils/data_extraction/bundesanzeiger_test.py b/tests/utils/data_extraction/bundesanzeiger_test.py
index 8829bbd..94af1ca 100644
--- a/tests/utils/data_extraction/bundesanzeiger_test.py
+++ b/tests/utils/data_extraction/bundesanzeiger_test.py
@@ -70,7 +70,11 @@ def test_extracct_financial_results() -> None:
 def test_filter_reports() -> None:
     test_data = [
         {"name": "Bedienungsanleitung", "report": "", "raw_report": ""},
-        {"name": "Jahresabschluss 1998", "report": "", "raw_report": ""},
+        {
+            "name": "Jahresabschluss vom 01.01.1998 bis zum 31.12.1998",
+            "report": "",
+            "raw_report": "",
+        },
     ]
     test_df = pd.DataFrame(test_data)
     ba = Bundesanzeiger()
@@ -91,7 +95,7 @@ def test_get_information(mock_bundesanzeiger: Mock) -> None:
             "raw_report": "",
         },
         "2": {
-            "name": "Jahresabschluss 1998",
+            "name": "Jahresabschluss 01.01.1998",
             "report": "",
             "company": "PRJ 23 Transparenzregister GmbH",
             "raw_report": "",