From 8db04177be9903c26f38554ffaa4424862fee2c3 Mon Sep 17 00:00:00 2001 From: TrisNol Date: Sat, 14 Oct 2023 19:21:26 +0200 Subject: [PATCH] feat(data-extraction): Extract c/o relation from street in company relation --- .../models/company.py | 1 + .../unternehmensregister/transform.py | 49 +++++++++++++++++-- .../unternehmensregister/transform_test.py | 34 +++++++++++++ 3 files changed, 80 insertions(+), 4 deletions(-) diff --git a/src/aki_prj23_transparenzregister/models/company.py b/src/aki_prj23_transparenzregister/models/company.py index 64b5ce3..82fc0f7 100644 --- a/src/aki_prj23_transparenzregister/models/company.py +++ b/src/aki_prj23_transparenzregister/models/company.py @@ -31,6 +31,7 @@ class RelationshipRoleEnum(str, MultiValueEnum): GESCHAEFTSLEITER = "Geschäftsleiter(in)", "Geschäftsleiter" ZWEIGNIEDERLASSUNG = "Zweigniederlassung" HAUPTNIEDERLASSUNG = "Hauptniederlassung" + LOKATION_BEI = "bei" class CompanyTypeEnum(str, MultiValueEnum): diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py index 468fdee..a4df2e6 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py @@ -486,6 +486,48 @@ def map_last_update(data: dict) -> str: return data["XJustiz_Daten"]["Fachdaten_Register"]["Auszug"]["letzte_Eintragung"] +def map_co_relation(data: dict) -> dict: + """Search for and map the c/o relation from location.street if possible. + + Args: + data (dict): Company dict + + Returns: + dict: Modified Company dict + """ + street = data["location"].street + if street is None: + return data + parts = street.split(",") + co_company = None + co_company_index = None + for index, part in enumerate(parts): + trimmed_part = part.strip() + result = re.findall(r"^c\/o (.*)$", trimmed_part) + if len(result) == 1: + co_company = result[0] + co_company_index = index + if co_company_index is not None: + del parts[co_company_index] + street = "".join(parts).strip() + data["location"].street = street + + if co_company is not None: + relation = CompanyToCompanyRelationship( + RelationshipRoleEnum.LOKATION_BEI, # type: ignore + Location( + data["location"].city, + street, + data["location"].house_number, + data["location"].zip_code, + ), + CompanyRelationshipEnum.COMPANY, # type: ignore + co_company, + ) + data["relationships"].append(relation) + return data + + def map_unternehmensregister_json(data: dict) -> Company: """Processes the Unternehmensregister structured export to a Company by using several helper methods. @@ -516,15 +558,13 @@ def map_unternehmensregister_json(data: dict) -> Company: data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"]["Beteiligung"][i] ) result["relationships"].append(people) + result = map_co_relation(result) return Company(**result) if __name__ == "__main__": from loguru import logger - # transform_xml_to_json( - # "./data/Unternehmensregister/scraping/", "./data/Unternehmensregister/export/" - # ) base_path = "./Jupyter/API-tests/Unternehmensregister/data/Unternehmensregister" for file in tqdm(glob.glob1(f"{base_path}/export", "*.json")): path = os.path.join(f"{base_path}/export", file) @@ -544,6 +584,7 @@ if __name__ == "__main__": json.dump( dataclasses.asdict(company), export_file, ensure_ascii=False ) - except Exception: + except Exception as e: + logger.error(e) logger.error(f"Error in processing {path}") sys.exit(1) diff --git a/tests/utils/data_extraction/unternehmensregister/transform_test.py b/tests/utils/data_extraction/unternehmensregister/transform_test.py index a312572..d163184 100644 --- a/tests/utils/data_extraction/unternehmensregister/transform_test.py +++ b/tests/utils/data_extraction/unternehmensregister/transform_test.py @@ -709,6 +709,38 @@ def test_map_last_update() -> None: assert result == date +@pytest.mark.parametrize( + ("value", "expected_result"), + [ + ( + { + "location": Location( + "", "c/o Youco24 Business Center, Abc ffda", None, None + ), + "relationships": [], + }, + { + "location": Location("", "Abc ffda", None, None), + "relationships": [ + CompanyToCompanyRelationship( + RelationshipRoleEnum.LOKATION_BEI, # type: ignore + Location("", "Abc ffda", None, None), + CompanyRelationshipEnum.COMPANY, + "Youco24 Business Center", + ) + ], + }, + ), + ], +) +def test_map_co_relation(value: dict, expected_result: dict) -> None: + result = transform.map_co_relation(value) + assert result == expected_result + + +@patch( + "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.map_co_relation" +) @patch( "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.map_company_id" ) @@ -746,6 +778,7 @@ def test_map_unternehmensregister_json( # noqa: PLR0913 mock_loc_from_beteiligung: Mock, mock_map_name_from_beteiligung: Mock, mock_map_company_id: Mock, + mock_map_co_relation: Mock, ) -> None: expected_result = Company( **{ # type: ignore @@ -770,6 +803,7 @@ def test_map_unternehmensregister_json( # noqa: PLR0913 mock_map_business_purpose.return_value = expected_result.business_purpose mock_map_founding_date.return_value = expected_result.founding_date mock_map_parse_stakeholder.return_value = expected_result.relationships[0] + mock_map_co_relation.side_effect = lambda x: x data: dict = { "XJustiz_Daten": {