feat(data-extraction): Extract c/o relation from street (#222)

This commit is contained in:
Tristan Nolde 2023-10-15 13:46:10 +02:00 committed by GitHub
commit 99b61e7c2e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 128 additions and 4 deletions

View File

@ -31,6 +31,7 @@ class RelationshipRoleEnum(str, MultiValueEnum):
GESCHAEFTSLEITER = "Geschäftsleiter(in)", "Geschäftsleiter"
ZWEIGNIEDERLASSUNG = "Zweigniederlassung"
HAUPTNIEDERLASSUNG = "Hauptniederlassung"
CARE_OF = "c/o"
class CompanyTypeEnum(str, MultiValueEnum):

View File

@ -486,6 +486,48 @@ def map_last_update(data: dict) -> str:
return data["XJustiz_Daten"]["Fachdaten_Register"]["Auszug"]["letzte_Eintragung"]
def map_co_relation(data: dict) -> dict:
    """Search for and map the c/o relation from location.street if possible.

    The street is split on commas. A segment that starts with ``c/o``
    (matched case-insensitively) is removed from the street, and the text
    following the marker is treated as the name of the care-of company.
    If several segments match, only the last one is removed and used.
    When a non-empty company name is found, a ``CARE_OF`` relationship
    carrying the cleaned-up location is appended to ``data["relationships"]``.

    Args:
        data (dict): Company dict. ``data["location"]`` must expose ``city``,
            ``street``, ``house_number`` and ``zip_code``; ``data["relationships"]``
            must be a list when a c/o company is present.

    Returns:
        dict: Modified Company dict (the same object that was passed in).
    """
    location = data["location"]
    street = location.street
    if street is None:
        return data

    parts = street.split(",")
    co_company = None
    co_company_index = None
    for index, part in enumerate(parts):
        match = re.match(r"^c/o(.*)$", part.strip(), flags=re.IGNORECASE)
        if match is not None:
            co_company = match.group(1).strip()
            co_company_index = index

    if co_company_index is not None:
        del parts[co_company_index]
        # Re-join with the original separator so remaining segments are not
        # fused together (e.g. "Abc Str., c/o X, Hinterhaus").
        street = ",".join(parts).strip()
        location.street = street

    # An empty name (street contained a bare "c/o") yields no relationship.
    if co_company:
        relation = CompanyToCompanyRelationship(
            RelationshipRoleEnum.CARE_OF,  # type: ignore
            Location(
                location.city,
                street,
                location.house_number,
                location.zip_code,
            ),
            CompanyRelationshipEnum.COMPANY,  # type: ignore
            co_company,
        )
        data["relationships"].append(relation)
    return data
def map_unternehmensregister_json(data: dict) -> Company:
"""Processes the Unternehmensregister structured export to a Company by using several helper methods.
@ -516,15 +558,13 @@ def map_unternehmensregister_json(data: dict) -> Company:
data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"]["Beteiligung"][i]
)
result["relationships"].append(people)
result = map_co_relation(result)
return Company(**result)
if __name__ == "__main__":
from loguru import logger
# transform_xml_to_json(
# "./data/Unternehmensregister/scraping/", "./data/Unternehmensregister/export/"
# )
base_path = "./Jupyter/API-tests/Unternehmensregister/data/Unternehmensregister"
for file in tqdm(glob.glob1(f"{base_path}/export", "*.json")):
path = os.path.join(f"{base_path}/export", file)
@ -544,6 +584,7 @@ if __name__ == "__main__":
json.dump(
dataclasses.asdict(company), export_file, ensure_ascii=False
)
except Exception:
except Exception as e:
logger.error(e)
logger.error(f"Error in processing {path}")
sys.exit(1)

View File

@ -709,6 +709,86 @@ def test_map_last_update() -> None:
assert result == date
@pytest.mark.parametrize(
    ("value", "expected_result"),
    [
        # c/o segment first, location without city/house number/zip code.
        (
            {
                "location": Location(
                    "", "c/o Youco24 Business Center, Abc ffda", None, None
                ),
                "relationships": [],
            },
            {
                "location": Location("", "Abc ffda", None, None),
                "relationships": [
                    CompanyToCompanyRelationship(
                        RelationshipRoleEnum.CARE_OF,  # type: ignore
                        Location("", "Abc ffda", None, None),
                        CompanyRelationshipEnum.COMPANY,
                        "Youco24 Business Center",
                    )
                ],
            },
        ),
        # c/o segment first, fully populated location.
        (
            {
                "location": Location(
                    "Iserlohn", "c/o Youco24 Business Center, Abc Str.", "42", "58644"
                ),
                "relationships": [],
            },
            {
                "location": Location("Iserlohn", "Abc Str.", "42", "58644"),
                "relationships": [
                    CompanyToCompanyRelationship(
                        RelationshipRoleEnum.CARE_OF,  # type: ignore
                        Location("Iserlohn", "Abc Str.", "42", "58644"),
                        CompanyRelationshipEnum.COMPANY,
                        "Youco24 Business Center",
                    )
                ],
            },
        ),
        # c/o segment after the actual street.
        (
            {
                "location": Location(
                    "Iserlohn", "Abc Str., c/o Youco24 Business Center", "42", "58644"
                ),
                "relationships": [],
            },
            {
                "location": Location("Iserlohn", "Abc Str.", "42", "58644"),
                "relationships": [
                    CompanyToCompanyRelationship(
                        RelationshipRoleEnum.CARE_OF,  # type: ignore
                        Location("Iserlohn", "Abc Str.", "42", "58644"),
                        CompanyRelationshipEnum.COMPANY,
                        "Youco24 Business Center",
                    )
                ],
            },
        ),
        # Bare "c/o" without a company name: street is cleaned, no relation added.
        (
            {
                "location": Location("Iserlohn", "Abc Str., c/o", "42", "58644"),
                "relationships": [],
            },
            {
                "location": Location("Iserlohn", "Abc Str.", "42", "58644"),
                "relationships": [],
            },
        ),
    ],
)
def test_map_co_relation(value: dict, expected_result: dict) -> None:
    """Check that map_co_relation strips the c/o part and adds the relation."""
    assert transform.map_co_relation(value) == expected_result
@patch(
"aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.map_co_relation"
)
@patch(
"aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.map_company_id"
)
@ -746,6 +826,7 @@ def test_map_unternehmensregister_json( # noqa: PLR0913
mock_loc_from_beteiligung: Mock,
mock_map_name_from_beteiligung: Mock,
mock_map_company_id: Mock,
mock_map_co_relation: Mock,
) -> None:
expected_result = Company(
**{ # type: ignore
@ -770,6 +851,7 @@ def test_map_unternehmensregister_json( # noqa: PLR0913
mock_map_business_purpose.return_value = expected_result.business_purpose
mock_map_founding_date.return_value = expected_result.founding_date
mock_map_parse_stakeholder.return_value = expected_result.relationships[0]
mock_map_co_relation.side_effect = lambda x: x
data: dict = {
"XJustiz_Daten": {