From 7f8511c9d6ea042bd8ac3ed361d5dd9add29c735 Mon Sep 17 00:00:00 2001 From: TrisNol Date: Sun, 29 Oct 2023 12:53:27 +0100 Subject: [PATCH 01/14] checkpoint: Core data fetch routine --- .../apps/find_missing_companies.py | 85 +++++++++++++++++++ .../unternehmensregister/extract.py | 3 +- .../unternehmensregister/transform.py | 2 + 3 files changed, 89 insertions(+), 1 deletion(-) create mode 100644 src/aki_prj23_transparenzregister/apps/find_missing_companies.py diff --git a/src/aki_prj23_transparenzregister/apps/find_missing_companies.py b/src/aki_prj23_transparenzregister/apps/find_missing_companies.py new file mode 100644 index 0000000..513b256 --- /dev/null +++ b/src/aki_prj23_transparenzregister/apps/find_missing_companies.py @@ -0,0 +1,85 @@ +import os +import sys +import json +import glob +import argparse +import tempfile +import pandas as pd +from tqdm import tqdm +from pathlib import Path +from loguru import logger +from aki_prj23_transparenzregister.config.config_providers import ( + HELP_TEXT_CONFIG, + get_config_provider, +) +from aki_prj23_transparenzregister.utils.logger_config import ( + add_logger_options_to_argparse, + configer_logger, +) + +from aki_prj23_transparenzregister.utils.sql import connector +from aki_prj23_transparenzregister.utils.sql import entities + +from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister import ( + extract, + load, + transform, +) + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog="Transparenzregister Webserver", + description="Starts an Dash Webserver that shows our Analysis.", + epilog="Example: webserver --log-level ERROR --log-path print.log", + ) + parser.add_argument( + "config", + metavar="config", + default="ENV", + ) + add_logger_options_to_argparse(parser) + + parsed = parser.parse_args(sys.argv[1:]) + configer_logger(namespace=parsed) + config = parsed.config + session = connector.get_session(get_config_provider(config)) + missing_companies = session.query(entities.MissingCompany).all() + + counter = 0 + # Scrape data from unternehmensregister + for company in missing_companies: + print(company.name) + extract.scrape(company.name, ["tmp", "xml"]) + counter = counter + 1 + if counter == 5: + break + # Transform input + output_path = os.path.join(str(Path.cwd()), *["tmp", "transformed"]) + xml_dir = os.path.join(str(Path.cwd()), *["tmp", "xml"]) + json_dir = os.path.join(str(Path.cwd()), *["tmp", "json"]) + transform.transform_xml_to_json( + os.path.join(xml_dir), + os.path.join(json_dir), + ) + for file in tqdm(glob.glob1(json_dir, "*.json")): + path = os.path.join(json_dir, file) + with open(path, encoding="utf-8") as file_object: + try: + company: Company = transform.map_unternehmensregister_json( + json.loads(file_object.read()) + ) + + name = "".join(e for e in company.name if e.isalnum())[:50] + + with open( + f"{output_path}/{name}.json", + "w+", + encoding="utf-8", + ) as export_file: + json.dump( + dataclasses.asdict(company), export_file, ensure_ascii=False + ) + except Exception as e: + logger.error(e) + logger.error(f"Error in processing {path}") + sys.exit(1) \ No newline at end of file diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/extract.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/extract.py index c37b260..efff716 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/extract.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/extract.py @@ -21,6 +21,7 @@ def scrape(query: str, download_dir: list[str]) -> None: download_dir (list[str]): Directory to place output files in """ download_path = os.path.join(str(Path.cwd()), *download_dir) + print(download_path) options = webdriver.ChromeOptions() preferences = { "profile.default_content_settings.popups": 0, @@ -32,7 +33,7 @@ def scrape(query: str, download_dir: list[str]) -> None: "default_directory": download_path, }, } - options.add_argument("--headless=new") + # options.add_argument("--headless=new") options.add_experimental_option("prefs", preferences) driver = webdriver.Chrome(options=options) diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py index 82a8028..eb2fd97 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py @@ -38,6 +38,8 @@ def transform_xml_to_json(source_dir: str, target_dir: str) -> None: source_dir (str): Directory hosting the xml files target_dir (str): Target directory to move json files to """ + if not os.path.exists(target_dir): + os.makedirs(target_dir) for source_path in [ os.path.normpath(i) for i in glob.glob(source_dir + "**/*.xml", recursive=True) ]: From 9d7bb07989ffbf837a81121eb38cfa14eda0b4a2 Mon Sep 17 00:00:00 2001 From: TrisNol Date: Sun, 29 Oct 2023 14:46:06 +0100 Subject: [PATCH 02/14] checkpoint: Adapt data transformation to new structure --- .gitignore | 4 + .../apps/find_missing_companies.py | 7 +- tmp/transform.py | 645 ++++++++++++++++++ tmp/transformation.ipynb | 90 +++ 4 files changed, 743 insertions(+), 3 deletions(-) create mode 100644 tmp/transform.py create mode 100644 tmp/transformation.ipynb diff --git a/.gitignore b/.gitignore index 38bc337..4e8f59e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ +# Data blobs +**/*.xml +**/*.json + # LaTeX temp files **/*.aux **/*-blx.bib diff --git a/src/aki_prj23_transparenzregister/apps/find_missing_companies.py b/src/aki_prj23_transparenzregister/apps/find_missing_companies.py index 513b256..d4cf188 100644 --- a/src/aki_prj23_transparenzregister/apps/find_missing_companies.py +++ b/src/aki_prj23_transparenzregister/apps/find_missing_companies.py @@ -43,13 +43,14 @@ if __name__ == "__main__": configer_logger(namespace=parsed) config = parsed.config session = connector.get_session(get_config_provider(config)) - missing_companies = session.query(entities.MissingCompany).all() + # missing_companies = session.query(entities.MissingCompany).all() + missing_companies = ["GEA Farm Technologies"] counter = 0 # Scrape data from unternehmensregister for company in missing_companies: - print(company.name) - extract.scrape(company.name, ["tmp", "xml"]) + print(company) + extract.scrape(company, ["tmp", "xml"]) counter = counter + 1 if counter == 5: break diff --git a/tmp/transform.py b/tmp/transform.py new file mode 100644 index 0000000..b876d41 --- /dev/null +++ b/tmp/transform.py @@ -0,0 +1,645 @@ +"""Transform raw Unternehmensregister export (*.xml) to processed .json files for loading.""" +import dataclasses +import glob +import json +import os +import re +import sys + +import xmltodict +from tqdm import tqdm + +from aki_prj23_transparenzregister.models.company import ( + Capital, + CapitalTypeEnum, + Company, + CompanyID, + CompanyRelationship, + CompanyRelationshipEnum, + CompanyToCompanyRelationship, + CompanyTypeEnum, + CurrencyEnum, + DistrictCourt, + Location, + PersonName, + PersonToCompanyRelationship, + RelationshipRoleEnum, +) +from aki_prj23_transparenzregister.utils.string_tools import ( + remove_traling_and_leading_quotes, + transform_date_to_iso, +) + + +def transform_xml_to_json(source_dir: str, target_dir: str) -> None: + """Convert all xml files in a directory to json files. + + Args: + source_dir (str): Directory hosting the xml files + target_dir (str): Target directory to move json files to + """ + if not os.path.exists(target_dir): + os.makedirs(target_dir) + for source_path in [ + os.path.normpath(i) for i in glob.glob(source_dir + "**/*.xml", recursive=True) + ]: + target_path = os.path.join( + target_dir, source_path.split(os.sep)[-1].replace(".xml", ".json") + ) + + with open(source_path, encoding="utf-8") as source_file: + # deepcode ignore HandleUnicode: Weird XML format no other solution + data = xmltodict.parse(source_file.read().encode()) + with open(target_path, "w", encoding="utf-8") as json_file: + json_file.write(json.dumps(data)) + + +def parse_date_of_birth(data: dict) -> str | None: + """Retreives the date of birth from a stakeholder entry if possible. + + Args: + data (dict): Stakeholder data + + Returns: + str | None: date of birth or None if not found + """ + if "tns:geburt" in (base := data["tns:beteiligter"]["tns:auswahl_beteililgter"]["tns:natuerlichePerson"]): + base = base["tns:geburt"]["tns:geburtsdatum"] + if isinstance(base, str): + return base + return None + +# def map_role_id_to_enum(role_id: str) -> RelationshipRoleEnum: + + +def parse_stakeholder(data: dict) -> CompanyRelationship | None: + """Extract the company stakeholder/relation from a single "Beteiligung". + + Args: + data (dict): Data export + + Returns: + CompanyRelationship | None: Relationship if it could be processed + """ + if "tns:natuerlichePerson" in data["tns:beteiligter"]["tns:auswahl_beteiligter"]: + # It's a Company serving as a "Kommanditist" or similar + # if data["Beteiligter"]["Natuerliche_Person"]["Voller_Name"]["Vorname"] is None: + # return CompanyToCompanyRelationship( + # **{ # type: ignore + # "name": remove_traling_and_leading_quotes( + # data["Beteiligter"]["Natuerliche_Person"]["Voller_Name"][ + # "Nachname" + # ] + # ), + # "location": Location( + # **{ + # "city": data["Beteiligter"]["Natuerliche_Person"][ + # "Anschrift" + # ][-1]["Ort"] + # if isinstance( + # data["Beteiligter"]["Natuerliche_Person"]["Anschrift"], + # list, + # ) + # else data["Beteiligter"]["Natuerliche_Person"]["Anschrift"][ + # "Ort" + # ] + # } + # ), + # "role": RelationshipRoleEnum( + # data["Rolle"]["Rollenbezeichnung"]["content"] + # ), + # "type": CompanyRelationshipEnum.COMPANY, + # } + # ) + return PersonToCompanyRelationship( + **{ # type: ignore + "name": PersonName( + **{ + "firstname": data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"][ + "tns:vollerName" + ]["tns:vorname"], + "lastname": data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"][ + "tns:vollerName" + ]["tns:nachname"], + } + ), + "date_of_birth": parse_date_of_birth(data), + "location": Location( + **{ + "city": data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"]["tns:anschrift"][ + -1 + ]["tns:ort"] + if isinstance( + data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"]["tns:anschrift"], list + ) + else data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"]["tns:anschrift"][ + "tns:ort" + ] + } + ), + # TODO get role via ID + "role": RelationshipRoleEnum( + data["Rolle"]["Rollenbezeichnung"]["content"] + ), + "type": CompanyRelationshipEnum.PERSON, + } + ) + if "Organisation" in data["Beteiligter"]: + return CompanyToCompanyRelationship( + **{ # type: ignore + "role": RelationshipRoleEnum( + data["Rolle"]["Rollenbezeichnung"]["content"] + ), + "name": remove_traling_and_leading_quotes( + data["Beteiligter"]["Organisation"]["Bezeichnung"][ + "Bezeichnung_Aktuell" + ] + ), + "location": Location( + **{ + "city": data["Beteiligter"]["Organisation"]["Anschrift"]["Ort"], + "street": data["Beteiligter"]["Organisation"]["Anschrift"][ + "Strasse" + ] + if "Strasse" in data["Beteiligter"]["Organisation"]["Anschrift"] + else None, + "house_number": data["Beteiligter"]["Organisation"][ + "Anschrift" + ]["Hausnummer"] + if "Hausnummer" + in data["Beteiligter"]["Organisation"]["Anschrift"] + else None, + "zip_code": data["Beteiligter"]["Organisation"]["Anschrift"][ + "Postleitzahl" + ] + if "Postleitzahl" + in data["Beteiligter"]["Organisation"]["Anschrift"] + else None, + } + ), + "type": CompanyRelationshipEnum.COMPANY, + } + ) + return None + + +def normalize_street(street: str) -> str: + """Normalize street names by extending them to `Straße` or `straße`. + + Args: + street (str): Name of street + + Returns: + str: Normalized street name + """ + if street is None: + return None + regex = r"(Str\.|Strasse)" + street = re.sub(regex, "Straße", street) + regex = r"(str\.|strasse)" + street = re.sub(regex, "straße", street) + return street.strip() + + +def loc_from_beteiligung(data: dict) -> Location: + """Extract the company location from the first relationship in the export. + + Args: + data (dict): Data export + + Returns: + Location: location + """ + base_path = [ + "tns:grunddaten", + "tns:verfahrensdaten", + "tns:beteiligung", + 0, + "tns:beteiligter", + "tns:auswahl_beteiligter", + "tns:organisation", + "tns:anschrift" + ] + base = traversal(data, base_path) + + house_number = None + street = None + if "tns:strasse" in base: + regex = r".(\d+)$" + hits = re.findall(regex, base["tns:strasse"]) + if len(hits) == 1: + house_number = hits[0] + street = base["tns:strasse"][: (-1 * len(house_number))] + if "tns:hausnummer" in base: + house_number = house_number + base["tns:hausnummer"] + else: + if "tns:hausnummer" in base: + house_number = base["tns:hausnummer"] + street = base["tns:strasse"] + return Location( + **{ + "city": base["tns:ort"], + "zip_code": base["tns:postleitzahl"], + "street": normalize_street(street), # type: ignore + "house_number": house_number, + } + ) + + +def name_from_beteiligung(data: dict) -> str: + """Extract the Company name from an Unternehmensregister export by using the first relationship found. + + Args: + data (dict): Data export + + Returns: + str: Company name + """ + path = [ + "tns:grunddaten", + "tns:verfahrensdaten", + "tns:beteiligung", + 0, + "tns:beteiligter", + "tns:auswahl_beteiligter", + "tns:organisation", + "tns:bezeichnung", + "tns:bezeichnung.aktuell" + ] + name = traversal(data, path) + return remove_traling_and_leading_quotes(name) + + +def map_rechtsform(company_name: str, data: dict) -> CompanyTypeEnum | None: + """Extracts the company type from a given Unternehmensregister export. + + Args: + company_name (str): Name of the company as a fallback solution + data (dict): Data export + + Returns: + CompanyTypeEnum | None: Company type if found + """ + try: + path = [ + "tns:fachdatenRegister", + "tns:basisdatenRegister", + "tns:rechtstraeger", + "tns:angabenZurRechtsform", + "tns:rechtsform", + "code" + ] + return CompanyTypeEnum( + traversal(data, path) + ) + except Exception: + if ( + company_name.endswith("GmbH") + or company_name.endswith("UG") + or company_name.endswith("UG (haftungsbeschränkt)") + ): + return CompanyTypeEnum("Gesellschaft mit beschränkter Haftung") + if company_name.endswith("SE"): + return CompanyTypeEnum("Europäische Aktiengesellschaft (SE)") + if company_name.endswith("KG"): + return CompanyTypeEnum("Kommanditgesellschaft") + return None + + +def map_capital(data: dict, company_type: CompanyTypeEnum) -> Capital | None: + """Extracts the company capital from the given Unternehmensregister export. + + Args: + data (dict): Data export + company_type (CompanyTypeEnum): Type of company (e.g., 'Gesellschaft mit beschränkter Haftung') + + Returns: + Capital | None: Company Capital if found + """ + # Early return + if "tns:auswahl_zusatzangaben" not in data["tns:fachdatenRegister"]: + return None + capital: dict = {"Zahl": 0.0, "Waehrung": ""} + if company_type == CompanyTypeEnum.KG: + capital_type = "Hafteinlage" + base = data["tns:fachdatenRegister"]["tns:auswahl_zusatzangaben"][ + "tns:personengesellschaft" + ]["tns:zusatzKG"]["tns:datenKommanditist"] + if isinstance(base, list): + for entry in base: + # TODO link to persons using Ref_Rollennummer then extract ["Hafteinlage"] as below + capital["Zahl"] = capital["Zahl"] + float(entry["Hafteinlage"]["Zahl"]) + capital["Waehrung"] = entry["Hafteinlage"]["Waehrung"] + elif isinstance(base, dict): + capital = base["Hafteinlage"] + elif company_type in [ + CompanyTypeEnum.GMBH, + CompanyTypeEnum.SE, + CompanyTypeEnum.AG, + CompanyTypeEnum.KGaA, + CompanyTypeEnum.AUSLAENDISCHE_RECHTSFORM, + CompanyTypeEnum.OHG, + ]: + if ( + "tns:kapitalgesellschaft" + not in data["tns:fachdatenRegister"]["tns:auswahl_zusatzangaben"] + ): + base = data["tns:fachdatenRegister"]["tns:auswahl_zusatzangaben"][ + "tns:personengesellschaft" + ] + else: + base = data["tns:fachdatenRegister"]["tns:auswahl_zusatzangaben"][ + "tns:kapitalgesellschaft" + ] + if "tns:zusatzGmbH" in base: + capital_type = "Stammkapital" + capital = base["tns:zusatzGmbH"]["tns:stammkapital"] + elif "tns:zusatzAktiengesellschaft" in base: + capital_type = "Grundkapital" + capital = base["tns:zusatzAktiengesellschaft"]["tns:grundkapital"]["tns:zahl"] + elif company_type in [ + CompanyTypeEnum.EINZELKAUFMANN, + CompanyTypeEnum.EG, + CompanyTypeEnum.PARTNERSCHAFT, + CompanyTypeEnum.PARTNERGESELLSCHAFT, + CompanyTypeEnum.PARTNERSCHAFTSGESELLSCHAFT, + None, + ]: + return None + # Catch entries having the dict but with null values + if not all(capital.values()): + return None + return Capital( + **{ # type: ignore + "value": float(capital["tns:zahl"]), + "currency": CurrencyEnum(capital["tns:waehrung"]["code"]), + "type": CapitalTypeEnum(capital_type), + } + ) + + +def map_business_purpose(data: dict) -> str | None: + """Extracts the "Geschäftszweck" from a given Unternehmensregister export. + + Args: + data (dict): Data export + + Returns: + str | None: Business purpose if found + """ + try: + path = [ + "tns:fachdatenRegister", + "tns:basisdatenRegister", + "tns:gegenstand" + ] + return traversal(data, path) + except KeyError: + return None + + +def extract_date_from_string(value: str) -> str | None: + """Extract a date in ISO format from the given string if possible. + + Args: + value (str): Input text + + Returns: + str | None: Date in ISO format, None if not found + """ + date_regex = [ # type: ignore + {"regex": r"\d{1,2}\.\d{1,2}\.\d{4}", "mapper": transform_date_to_iso}, + {"regex": r"\d{4}-\d{1,2}-\d{1,2}", "mapper": None}, + ] + results = [] + for regex in date_regex: + result = re.findall(regex["regex"], value) # type: ignore + if len(result) == 1: + relevant_data = result[0] + if regex["mapper"] is not None: # type: ignore + results.append(regex["mapper"](relevant_data)) # type: ignore + else: + results.append(relevant_data) + if len(results) != 1: + return None + return results[0] + + +def map_founding_date(data: dict) -> str | None: + """Extracts the founding date from a given Unternehmensregister export. + + Args: + data (dict): Data export + + Returns: + str | None: Founding date if found + """ + text = str(data) + entry_date = re.findall( + r".Tag der ersten Eintragung:(\\n| )?(\d{1,2}\.\d{1,2}\.\d{2,4})", text + ) + if len(entry_date) == 1: + return transform_date_to_iso(entry_date[0][1]) + + entry_date = re.findall( + r".Gesellschaftsvertrag vom (\d{1,2}\.\d{1,2}\.\d{2,4})", text + ) + if len(entry_date) == 1: + return transform_date_to_iso(entry_date[0]) + if ( + "tns:satzungsdatum" + in data["tns:fachdatenRegister"]["tns:basisdatenRegister"] + ): + path = [ + "tns:fachdatenRegister", + "tns:basisdatenRegister", + "tns:satzungsdatum", + "tns:aktuellesSatzungsdatum" + ] + return traversal(data, path) + # No reliable answer + return None + +def traversal(data: dict, path: list[str | int]) -> any: + current = data + for key in path: + try: + current = current[key] + except: + raise KeyError(f"Key {key} not found") + return current + + +def map_hr_number(data: dict) -> str: + hr_prefix = data["tns:grunddaten"]["tns:verfahrensdaten"]["tns:instanzdaten"][ + "tns:aktenzeichen" + ]["tns:auswahl_aktenzeichen"]["tns:aktenzeichen.strukturiert"]["tns:register"][ + "code" + ] + hr_number = data["tns:grunddaten"]["tns:verfahrensdaten"]["tns:instanzdaten"][ + "tns:aktenzeichen" + ]["tns:auswahl_aktenzeichen"]["tns:aktenzeichen.strukturiert"]["tns:laufendeNummer"] + hr_full = f"{hr_prefix} {hr_number}" + return hr_full + +def map_district_court(data: dict) -> DistrictCourt: + base_path = [ + "tns:grunddaten", + "tns:verfahrensdaten", + "tns:beteiligung", + 1, + "tns:beteiligter", + "tns:auswahl_beteiligter", + "tns:organisation" + ] + path = [*base_path, + "tns:bezeichnung", + "tns:bezeichnung.aktuell" + ] + name = traversal(data, path) + path = [*base_path, + "tns:sitz", + "tns:ort" + ] + city = traversal(data, path) + return DistrictCourt(name=name, city=city) + + +def map_company_id(data: dict) -> CompanyID: + """Retrieve Company ID from export. + + Args: + data (dict): Data export + + Returns: + CompanyID: ID of the company + """ + return CompanyID( + **{ + "hr_number": map_hr_number(data), + "district_court": map_district_court(data) + } + ) + + +def map_last_update(data: dict) -> str: + """Extract last update date from export. + + Args: + data (dict): Unternehmensregister export + + Returns: + str: Last update date + """ + path = [ + "tns:fachdatenRegister", + "tns:auszug", + "tns:letzteEintragung" + ] + return traversal(data, path) + + +def map_co_relation(data: dict) -> dict: + """Search for and map the c/o relation from location.street if possible. + + Args: + data (dict): Company dict + + Returns: + dict: Modified Company dict + """ + street = data["location"].street + if street is None: + return data + parts = street.split(",") + co_company = None + co_company_index = None + for index, part in enumerate(parts): + trimmed_part = part.strip() + result = re.findall(r"^c\/o(.*)$", trimmed_part) + if len(result) == 1: + co_company = result[0].strip() + co_company_index = index + if co_company_index is not None: + del parts[co_company_index] + street = "".join(parts).strip() + data["location"].street = street + + if co_company is not None and co_company != "": + relation = CompanyToCompanyRelationship( + RelationshipRoleEnum.CARE_OF, # type: ignore + Location( + data["location"].city, + street, + data["location"].house_number, + data["location"].zip_code, + ), + CompanyRelationshipEnum.COMPANY, # type: ignore + co_company, + ) + data["relationships"].append(relation) + return data + + +def map_unternehmensregister_json(data: dict) -> Company: + """Processes the Unternehmensregister structured export to a Company by using several helper methods. + + Args: + data (dict): Data export + + Returns: + Company: Transformed data + """ + root_key = list(data.keys())[0] + data = data[root_key] + result: dict = {"relationships": []} + + result["id"] = map_company_id(data) + result["name"] = name_from_beteiligung(data) + + result["location"] = loc_from_beteiligung(data) + result["last_update"] = map_last_update(data) + + result["company_type"] = map_rechtsform(result["name"], data) + result["capital"] = map_capital(data, result["company_type"]) + result["business_purpose"] = map_business_purpose(data) + result["founding_date"] = map_founding_date(data) + + # TODO adapt... + # for i in range( + # 2, len(data["tns:grunddaten"]["tns:verfahrensdaten"]["tns:beteiligung"]) + # ): + # people = parse_stakeholder( + # data["tns:grunddaten"]["tns:verfahrensdaten"]["tns:beteiligung"][i] + # ) + # result["relationships"].append(people) + result = map_co_relation(result) + return Company(**result) + + +if __name__ == "__main__": + from loguru import logger + + base_path = "./Jupyter/API-tests/Unternehmensregister/data/Unternehmensregister" + for file in tqdm(glob.glob1(f"{base_path}/export", "*.json")): + path = os.path.join(f"{base_path}/export", file) + with open(path, encoding="utf-8") as file_object: + try: + company: Company = map_unternehmensregister_json( + json.loads(file_object.read()) + ) + + name = "".join(e for e in company.name if e.isalnum())[:50] + + with open( + f"{base_path}/transformed/{name}.json", + "w+", + encoding="utf-8", + ) as export_file: + json.dump( + dataclasses.asdict(company), export_file, ensure_ascii=False + ) + except Exception as e: + logger.error(e) + logger.error(f"Error in processing {path}") + sys.exit(1) diff --git a/tmp/transformation.ipynb b/tmp/transformation.ipynb new file mode 100644 index 0000000..6fd84ae --- /dev/null +++ b/tmp/transformation.ipynb @@ -0,0 +1,90 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "'Beteiligter'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32mc:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\tmp\\transformation.ipynb Cell 1\u001b[0m line \u001b[0;36m6\n\u001b[0;32m 4\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39m(\u001b[39m'\u001b[39m\u001b[39m../tmp/json/GEAFarmTechnologiesGmbH.json\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mr\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39mas\u001b[39;00m file:\n\u001b[0;32m 5\u001b[0m content \u001b[39m=\u001b[39m json\u001b[39m.\u001b[39mload(file)\n\u001b[1;32m----> 6\u001b[0m company_data \u001b[39m=\u001b[39m map_unternehmensregister_json(content)\n", + "File \u001b[1;32mc:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\tmp\\transform.py:609\u001b[0m, in \u001b[0;36mmap_unternehmensregister_json\u001b[1;34m(data)\u001b[0m\n\u001b[0;32m 605\u001b[0m \u001b[39m# TODO adapt...\u001b[39;00m\n\u001b[0;32m 606\u001b[0m \u001b[39mfor\u001b[39;00m i \u001b[39min\u001b[39;00m \u001b[39mrange\u001b[39m(\n\u001b[0;32m 607\u001b[0m \u001b[39m2\u001b[39m, \u001b[39mlen\u001b[39m(data[\u001b[39m\"\u001b[39m\u001b[39mtns:grunddaten\u001b[39m\u001b[39m\"\u001b[39m][\u001b[39m\"\u001b[39m\u001b[39mtns:verfahrensdaten\u001b[39m\u001b[39m\"\u001b[39m][\u001b[39m\"\u001b[39m\u001b[39mtns:beteiligung\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[0;32m 608\u001b[0m ):\n\u001b[1;32m--> 609\u001b[0m people \u001b[39m=\u001b[39m parse_stakeholder(\n\u001b[0;32m 610\u001b[0m data[\u001b[39m\"\u001b[39;49m\u001b[39mtns:grunddaten\u001b[39;49m\u001b[39m\"\u001b[39;49m][\u001b[39m\"\u001b[39;49m\u001b[39mtns:verfahrensdaten\u001b[39;49m\u001b[39m\"\u001b[39;49m][\u001b[39m\"\u001b[39;49m\u001b[39mtns:beteiligung\u001b[39;49m\u001b[39m\"\u001b[39;49m][i]\n\u001b[0;32m 611\u001b[0m )\n\u001b[0;32m 612\u001b[0m result[\u001b[39m\"\u001b[39m\u001b[39mrelationships\u001b[39m\u001b[39m\"\u001b[39m]\u001b[39m.\u001b[39mappend(people)\n\u001b[0;32m 613\u001b[0m result \u001b[39m=\u001b[39m map_co_relation(result)\n", + "File \u001b[1;32mc:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\tmp\\transform.py:82\u001b[0m, in \u001b[0;36mparse_stakeholder\u001b[1;34m(data)\u001b[0m\n\u001b[0;32m 73\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mparse_stakeholder\u001b[39m(data: \u001b[39mdict\u001b[39m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m CompanyRelationship \u001b[39m|\u001b[39m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m 74\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Extract the company stakeholder/relation from a single \"Beteiligung\".\u001b[39;00m\n\u001b[0;32m 75\u001b[0m \n\u001b[0;32m 76\u001b[0m \u001b[39m Args:\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 80\u001b[0m \u001b[39m CompanyRelationship | None: Relationship if it could be processed\u001b[39;00m\n\u001b[0;32m 81\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[1;32m---> 82\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mNatuerliche_Person\u001b[39m\u001b[39m\"\u001b[39m \u001b[39min\u001b[39;00m data[\u001b[39m\"\u001b[39;49m\u001b[39mBeteiligter\u001b[39;49m\u001b[39m\"\u001b[39;49m]:\n\u001b[0;32m 83\u001b[0m \u001b[39m# It's a Company serving as a \"Kommanditist\" or similar\u001b[39;00m\n\u001b[0;32m 84\u001b[0m \u001b[39mif\u001b[39;00m data[\u001b[39m\"\u001b[39m\u001b[39mBeteiligter\u001b[39m\u001b[39m\"\u001b[39m][\u001b[39m\"\u001b[39m\u001b[39mNatuerliche_Person\u001b[39m\u001b[39m\"\u001b[39m][\u001b[39m\"\u001b[39m\u001b[39mVoller_Name\u001b[39m\u001b[39m\"\u001b[39m][\u001b[39m\"\u001b[39m\u001b[39mVorname\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m 85\u001b[0m \u001b[39mreturn\u001b[39;00m CompanyToCompanyRelationship(\n\u001b[0;32m 86\u001b[0m \u001b[39m*\u001b[39m\u001b[39m*\u001b[39m{ \u001b[39m# type: ignore\u001b[39;00m\n\u001b[0;32m 87\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mname\u001b[39m\u001b[39m\"\u001b[39m: remove_traling_and_leading_quotes(\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 110\u001b[0m }\n\u001b[0;32m 111\u001b[0m )\n", + "\u001b[1;31mKeyError\u001b[0m: 'Beteiligter'" + ] + } + ], + "source": [ + "import json\n", + "from transform import map_unternehmensregister_json\n", + "\n", + "with open('../tmp/json/GEAFarmTechnologiesGmbH.json', \"r\") as file:\n", + " content = json.load(file)\n", + " company_data = map_unternehmensregister_json(content)\n", + " print(company_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Test" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mAssertionError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32mc:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\tmp\\transformation.ipynb Cell 1\u001b[0m line \u001b[0;36m9\n\u001b[0;32m 7\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39m(\u001b[39m'\u001b[39m\u001b[39m../tmp/tests/GEAFarmTechnologiesGmbH.json\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mr\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39mas\u001b[39;00m file:\n\u001b[0;32m 8\u001b[0m expected_result \u001b[39m=\u001b[39m json\u001b[39m.\u001b[39mload(file)\n\u001b[1;32m----> 9\u001b[0m \u001b[39massert\u001b[39;00m result \u001b[39m==\u001b[39m expected_result\n", + "\u001b[1;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "import json\n", + "\n", + "result = None\n", + "expected_result = None\n", + "with open('../tmp/transformed/GEAFarmTechnologiesGmbH.json', 'r') as file_a:\n", + " result = json.load(file_a)\n", + "with open('../tmp/tests/GEAFarmTechnologiesGmbH.json', \"r\") as file:\n", + " expected_result = json.load(file)\n", + " assert result == expected_result" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "aki-prj23-transparenzregister-jVJfu35g-py3.11", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 2d9e3f19f9281770cda977db14a0de67bef56f3f Mon Sep 17 00:00:00 2001 From: TrisNol Date: Sun, 29 Oct 2023 20:11:09 +0100 Subject: [PATCH 03/14] checkpoint: First iteration of fixed mapping --- tmp/transform.py | 29 ++++++++++++++++++----------- tmp/transformation.ipynb | 25 +++++++++++-------------- 2 files changed, 29 insertions(+), 25 deletions(-) diff --git a/tmp/transform.py b/tmp/transform.py index b876d41..9178805 100644 --- a/tmp/transform.py +++ b/tmp/transform.py @@ -63,13 +63,20 @@ def parse_date_of_birth(data: dict) -> str | None: Returns: str | None: date of birth or None if not found """ - if "tns:geburt" in (base := data["tns:beteiligter"]["tns:auswahl_beteililgter"]["tns:natuerlichePerson"]): + if "tns:geburt" in (base := data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"]): base = base["tns:geburt"]["tns:geburtsdatum"] if isinstance(base, str): return base return None -# def map_role_id_to_enum(role_id: str) -> RelationshipRoleEnum: +def map_role_id_to_enum(role_id: str) -> RelationshipRoleEnum: + match role_id: + case "086": + return RelationshipRoleEnum.GESCHAEFTSFUEHRER + case "285": + return RelationshipRoleEnum.PROKURIST + case _: + raise KeyError(f'Uknown role_id: {role_id}') def parse_stakeholder(data: dict) -> CompanyRelationship | None: @@ -138,8 +145,8 @@ def parse_stakeholder(data: dict) -> CompanyRelationship | None: } ), # TODO get role via ID - "role": RelationshipRoleEnum( - data["Rolle"]["Rollenbezeichnung"]["content"] + "role": map_role_id_to_enum( + data["tns:rolle"]["tns:rollenbezeichnung"]["code"] ), "type": CompanyRelationshipEnum.PERSON, } @@ -606,13 +613,13 @@ def map_unternehmensregister_json(data: dict) -> Company: result["founding_date"] = map_founding_date(data) # TODO adapt... - # for i in range( - # 2, len(data["tns:grunddaten"]["tns:verfahrensdaten"]["tns:beteiligung"]) - # ): - # people = parse_stakeholder( - # data["tns:grunddaten"]["tns:verfahrensdaten"]["tns:beteiligung"][i] - # ) - # result["relationships"].append(people) + for i in range( + 2, len(data["tns:grunddaten"]["tns:verfahrensdaten"]["tns:beteiligung"]) + ): + people = parse_stakeholder( + data["tns:grunddaten"]["tns:verfahrensdaten"]["tns:beteiligung"][i] + ) + result["relationships"].append(people) result = map_co_relation(result) return Company(**result) diff --git a/tmp/transformation.ipynb b/tmp/transformation.ipynb index 6fd84ae..915b440 100644 --- a/tmp/transformation.ipynb +++ b/tmp/transformation.ipynb @@ -2,31 +2,28 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 8, "metadata": {}, "outputs": [ { - "ename": "KeyError", - "evalue": "'Beteiligter'", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32mc:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\tmp\\transformation.ipynb Cell 1\u001b[0m line \u001b[0;36m6\n\u001b[0;32m 4\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39m(\u001b[39m'\u001b[39m\u001b[39m../tmp/json/GEAFarmTechnologiesGmbH.json\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mr\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39mas\u001b[39;00m file:\n\u001b[0;32m 5\u001b[0m content \u001b[39m=\u001b[39m json\u001b[39m.\u001b[39mload(file)\n\u001b[1;32m----> 6\u001b[0m company_data \u001b[39m=\u001b[39m map_unternehmensregister_json(content)\n", - "File \u001b[1;32mc:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\tmp\\transform.py:609\u001b[0m, in \u001b[0;36mmap_unternehmensregister_json\u001b[1;34m(data)\u001b[0m\n\u001b[0;32m 605\u001b[0m \u001b[39m# TODO adapt...\u001b[39;00m\n\u001b[0;32m 606\u001b[0m \u001b[39mfor\u001b[39;00m i \u001b[39min\u001b[39;00m \u001b[39mrange\u001b[39m(\n\u001b[0;32m 607\u001b[0m \u001b[39m2\u001b[39m, \u001b[39mlen\u001b[39m(data[\u001b[39m\"\u001b[39m\u001b[39mtns:grunddaten\u001b[39m\u001b[39m\"\u001b[39m][\u001b[39m\"\u001b[39m\u001b[39mtns:verfahrensdaten\u001b[39m\u001b[39m\"\u001b[39m][\u001b[39m\"\u001b[39m\u001b[39mtns:beteiligung\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[0;32m 608\u001b[0m ):\n\u001b[1;32m--> 609\u001b[0m people \u001b[39m=\u001b[39m parse_stakeholder(\n\u001b[0;32m 610\u001b[0m data[\u001b[39m\"\u001b[39;49m\u001b[39mtns:grunddaten\u001b[39;49m\u001b[39m\"\u001b[39;49m][\u001b[39m\"\u001b[39;49m\u001b[39mtns:verfahrensdaten\u001b[39;49m\u001b[39m\"\u001b[39;49m][\u001b[39m\"\u001b[39;49m\u001b[39mtns:beteiligung\u001b[39;49m\u001b[39m\"\u001b[39;49m][i]\n\u001b[0;32m 611\u001b[0m )\n\u001b[0;32m 612\u001b[0m result[\u001b[39m\"\u001b[39m\u001b[39mrelationships\u001b[39m\u001b[39m\"\u001b[39m]\u001b[39m.\u001b[39mappend(people)\n\u001b[0;32m 613\u001b[0m result \u001b[39m=\u001b[39m map_co_relation(result)\n", - "File \u001b[1;32mc:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\tmp\\transform.py:82\u001b[0m, in \u001b[0;36mparse_stakeholder\u001b[1;34m(data)\u001b[0m\n\u001b[0;32m 73\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mparse_stakeholder\u001b[39m(data: \u001b[39mdict\u001b[39m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m CompanyRelationship \u001b[39m|\u001b[39m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m 74\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Extract the company stakeholder/relation from a single \"Beteiligung\".\u001b[39;00m\n\u001b[0;32m 75\u001b[0m \n\u001b[0;32m 76\u001b[0m \u001b[39m Args:\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 80\u001b[0m \u001b[39m CompanyRelationship | None: Relationship if it could be processed\u001b[39;00m\n\u001b[0;32m 81\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[1;32m---> 82\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mNatuerliche_Person\u001b[39m\u001b[39m\"\u001b[39m \u001b[39min\u001b[39;00m data[\u001b[39m\"\u001b[39;49m\u001b[39mBeteiligter\u001b[39;49m\u001b[39m\"\u001b[39;49m]:\n\u001b[0;32m 83\u001b[0m \u001b[39m# It's a Company serving as a \"Kommanditist\" or similar\u001b[39;00m\n\u001b[0;32m 84\u001b[0m \u001b[39mif\u001b[39;00m data[\u001b[39m\"\u001b[39m\u001b[39mBeteiligter\u001b[39m\u001b[39m\"\u001b[39m][\u001b[39m\"\u001b[39m\u001b[39mNatuerliche_Person\u001b[39m\u001b[39m\"\u001b[39m][\u001b[39m\"\u001b[39m\u001b[39mVoller_Name\u001b[39m\u001b[39m\"\u001b[39m][\u001b[39m\"\u001b[39m\u001b[39mVorname\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m 85\u001b[0m \u001b[39mreturn\u001b[39;00m CompanyToCompanyRelationship(\n\u001b[0;32m 86\u001b[0m \u001b[39m*\u001b[39m\u001b[39m*\u001b[39m{ \u001b[39m# type: ignore\u001b[39;00m\n\u001b[0;32m 87\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mname\u001b[39m\u001b[39m\"\u001b[39m: remove_traling_and_leading_quotes(\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 110\u001b[0m }\n\u001b[0;32m 111\u001b[0m )\n", - "\u001b[1;31mKeyError\u001b[0m: 'Beteiligter'" + "name": "stdout", + "output_type": "stream", + "text": [ + "Company(id=CompanyID(district_court=DistrictCourt(name='Amtsgericht Hamm', city='Hamm'), hr_number='HRB 5363'), location=Location(city='Bönen', street='Siemensstraße', house_number='25-27', zip_code='59199'), name='GEA Farm Technologies GmbH', last_update='2023-10-27', relationships=[PersonToCompanyRelationship(role=, location=Location(city='Oelde', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Reinhard', lastname='Gebing'), date_of_birth='1964-04-26'), PersonToCompanyRelationship(role=, location=Location(city='Wetter', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Markus', lastname='Kreft'), date_of_birth='1966-04-03'), PersonToCompanyRelationship(role=, location=Location(city='Holzminden', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Kai', lastname='Luntz'), date_of_birth='1970-12-04'), PersonToCompanyRelationship(role=, location=Location(city='Rheda-Wiedenbrück', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Thomas', lastname='Mader'), date_of_birth='1972-05-24'), PersonToCompanyRelationship(role=, location=Location(city='Düsseldorf', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Peter', lastname='Lauwers'), date_of_birth='1970-03-26'), PersonToCompanyRelationship(role=, location=Location(city='Erkrath', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Erkul', lastname='Basaran'), date_of_birth='1977-05-06'), PersonToCompanyRelationship(role=, location=Location(city='Bochum', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Henrik', lastname='Böttner'), date_of_birth='1982-11-07'), PersonToCompanyRelationship(role=, location=Location(city='Oelde', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Ulrich', lastname='Raßenhövel'), date_of_birth='1969-04-16'), PersonToCompanyRelationship(role=, location=Location(city='Herdecke', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Andreas', lastname='Naroska'), date_of_birth='1967-03-23'), PersonToCompanyRelationship(role=, location=Location(city='Witten', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Mark', lastname='Kramps'), date_of_birth='1967-09-04'), PersonToCompanyRelationship(role=, location=Location(city='Dortmund', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Ralf', lastname='Barkmeyer'), date_of_birth='1974-02-28'), PersonToCompanyRelationship(role=, location=Location(city='Tönnisvorst', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Holger', lastname='Siegwarth'), date_of_birth='1967-05-13'), PersonToCompanyRelationship(role=, location=Location(city='Herne', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Oliver', lastname='Liß'), date_of_birth='1981-04-13'), PersonToCompanyRelationship(role=, location=Location(city='Göppingen', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Liang', lastname='Cheng'), date_of_birth='1980-12-29'), PersonToCompanyRelationship(role=, location=Location(city='Beckum', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Astrid', lastname='Dörner-Rodeheger'), date_of_birth='1968-12-24'), PersonToCompanyRelationship(role=, location=Location(city='Dortmund', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Jon', lastname='Lange'), date_of_birth='1978-04-25'), PersonToCompanyRelationship(role=, location=Location(city='Werne', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Ralf', lastname='Frombach'), date_of_birth='1977-01-25'), PersonToCompanyRelationship(role=, location=Location(city='Berlin', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Sven', lastname='Hommel'), date_of_birth='1979-04-22'), PersonToCompanyRelationship(role=, location=Location(city='Oberhausen', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Matthias', lastname='Peters'), date_of_birth='1973-08-28')], company_type=, capital=Capital(value=5115000.0, currency=, type=), business_purpose='Entwicklung, Herstellung und der Vertrieb von Landtechnik, insbesondere von Komponenten und Anlagen (a) zur Gewinnung, Kühlung, Behandlung und Lagerung von Milch; (b) für das Milchvieh-Herdenmanagement; (c) zur Tierhygiene und Sicherung der Milchqualität und (d) zur Aufstallung von Tieren; sowie die Herstellung und der Vertrieb von Anlagen und Fahrzeugen zur Aufbereitung und zum Transport von Gülle.', founding_date='1995-04-25')\n" ] } ], "source": [ "import json\n", + "import dataclasses\n", "from transform import map_unternehmensregister_json\n", "\n", "with open('../tmp/json/GEAFarmTechnologiesGmbH.json', \"r\") as file:\n", " content = json.load(file)\n", " company_data = map_unternehmensregister_json(content)\n", - " print(company_data)" + " print(company_data)\n", + " with open('../tmp/transformed/GEAFarmTechnologiesGmbH.json', \"w+\", encoding=\"utf-8\") as file:\n", + " json.dump(dataclasses.asdict(company_data), file, ensure_ascii=False)" ] }, { @@ -38,7 +35,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -48,7 +45,7 @@ "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mAssertionError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32mc:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\tmp\\transformation.ipynb Cell 1\u001b[0m line \u001b[0;36m9\n\u001b[0;32m 7\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39m(\u001b[39m'\u001b[39m\u001b[39m../tmp/tests/GEAFarmTechnologiesGmbH.json\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mr\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39mas\u001b[39;00m file:\n\u001b[0;32m 8\u001b[0m expected_result \u001b[39m=\u001b[39m json\u001b[39m.\u001b[39mload(file)\n\u001b[1;32m----> 9\u001b[0m \u001b[39massert\u001b[39;00m result \u001b[39m==\u001b[39m expected_result\n", + "\u001b[1;32mc:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\tmp\\transformation.ipynb Cell 3\u001b[0m line \u001b[0;36m9\n\u001b[0;32m 7\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39m(\u001b[39m'\u001b[39m\u001b[39m../tmp/tests/GEAFarmTechnologiesGmbH.json\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mr\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39mas\u001b[39;00m file:\n\u001b[0;32m 8\u001b[0m expected_result \u001b[39m=\u001b[39m json\u001b[39m.\u001b[39mload(file)\n\u001b[1;32m----> 9\u001b[0m \u001b[39massert\u001b[39;00m result \u001b[39m==\u001b[39m expected_result\n", "\u001b[1;31mAssertionError\u001b[0m: " ] } From b7f977138dd31ac7dacc955f6ecec9728a567c99 Mon Sep 17 00:00:00 2001 From: TrisNol Date: Thu, 2 Nov 2023 16:12:23 +0100 Subject: [PATCH 04/14] checkpoint: Manual role mapping via ID --- .../apps/find_missing_companies.py | 18 +- .../unternehmensregister/transform.py | 312 +++++---- tmp/transform.py | 652 ------------------ tmp/transformation.ipynb | 73 +- 4 files changed, 244 insertions(+), 811 deletions(-) delete mode 100644 tmp/transform.py diff --git a/src/aki_prj23_transparenzregister/apps/find_missing_companies.py b/src/aki_prj23_transparenzregister/apps/find_missing_companies.py index d4cf188..c8aa78b 100644 --- a/src/aki_prj23_transparenzregister/apps/find_missing_companies.py +++ b/src/aki_prj23_transparenzregister/apps/find_missing_companies.py @@ -4,6 +4,7 @@ import json import glob import argparse import tempfile +import dataclasses import pandas as pd from tqdm import tqdm from pathlib import Path @@ -43,17 +44,14 @@ if __name__ == "__main__": configer_logger(namespace=parsed) config = parsed.config session = connector.get_session(get_config_provider(config)) - # missing_companies = session.query(entities.MissingCompany).all() - missing_companies = ["GEA Farm Technologies"] + missing_companies = session.query(entities.MissingCompany).all() counter = 0 - # Scrape data from unternehmensregister - for company in missing_companies: - print(company) - extract.scrape(company, ["tmp", "xml"]) - counter = counter + 1 - if counter == 5: - break + # # Scrape data from unternehmensregister + # for company in missing_companies: + # print(company.name) + # extract.scrape(company.name, ["tmp", "xml"]) + # Transform input output_path = os.path.join(str(Path.cwd()), *["tmp", "transformed"]) xml_dir = os.path.join(str(Path.cwd()), *["tmp", "xml"]) @@ -66,7 +64,7 @@ if __name__ == "__main__": path = os.path.join(json_dir, file) with open(path, encoding="utf-8") as file_object: try: - company: Company = transform.map_unternehmensregister_json( + company = transform.map_unternehmensregister_json( json.loads(file_object.read()) ) diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py index eb2fd97..2e64e3c 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py @@ -63,12 +63,23 @@ def parse_date_of_birth(data: dict) -> str | None: Returns: str | None: date of birth or None if not found """ - if "Geburt" in (base := data["Beteiligter"]["Natuerliche_Person"]): - base = base["Geburt"]["Geburtsdatum"] + if "tns:geburt" in (base := data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"]): + base = base["tns:geburt"]["tns:geburtsdatum"] if isinstance(base, str): return base return None +def map_role_id_to_enum(role_id: str) -> RelationshipRoleEnum: + match role_id: + case "086": + return RelationshipRoleEnum.GESCHAEFTSFUEHRER + case "285": + return RelationshipRoleEnum.PROKURIST + case "194": + return RelationshipRoleEnum.VORSTAND + case _: + raise KeyError(f'Uknown role_id: {role_id}') + def parse_stakeholder(data: dict) -> CompanyRelationship | None: """Extract the company stakeholder/relation from a single "Beteiligung". @@ -79,64 +90,65 @@ def parse_stakeholder(data: dict) -> CompanyRelationship | None: Returns: CompanyRelationship | None: Relationship if it could be processed """ - if "Natuerliche_Person" in data["Beteiligter"]: + if "tns:natuerlichePerson" in data["tns:beteiligter"]["tns:auswahl_beteiligter"]: # It's a Company serving as a "Kommanditist" or similar - if data["Beteiligter"]["Natuerliche_Person"]["Voller_Name"]["Vorname"] is None: - return CompanyToCompanyRelationship( - **{ # type: ignore - "name": remove_traling_and_leading_quotes( - data["Beteiligter"]["Natuerliche_Person"]["Voller_Name"][ - "Nachname" - ] - ), - "location": Location( - **{ - "city": data["Beteiligter"]["Natuerliche_Person"][ - "Anschrift" - ][-1]["Ort"] - if isinstance( - data["Beteiligter"]["Natuerliche_Person"]["Anschrift"], - list, - ) - else data["Beteiligter"]["Natuerliche_Person"]["Anschrift"][ - "Ort" - ] - } - ), - "role": RelationshipRoleEnum( - data["Rolle"]["Rollenbezeichnung"]["content"] - ), - "type": CompanyRelationshipEnum.COMPANY, - } - ) + # if data["Beteiligter"]["Natuerliche_Person"]["Voller_Name"]["Vorname"] is None: + # return CompanyToCompanyRelationship( + # **{ # type: ignore + # "name": remove_traling_and_leading_quotes( + # data["Beteiligter"]["Natuerliche_Person"]["Voller_Name"][ + # "Nachname" + # ] + # ), + # "location": Location( + # **{ + # "city": data["Beteiligter"]["Natuerliche_Person"][ + # "Anschrift" + # ][-1]["Ort"] + # if isinstance( + # data["Beteiligter"]["Natuerliche_Person"]["Anschrift"], + # list, + # ) + # else data["Beteiligter"]["Natuerliche_Person"]["Anschrift"][ + # "Ort" + # ] + # } + # ), + # "role": RelationshipRoleEnum( + # data["Rolle"]["Rollenbezeichnung"]["content"] + # ), + # "type": CompanyRelationshipEnum.COMPANY, + # } + # ) return PersonToCompanyRelationship( **{ # type: ignore "name": PersonName( **{ - "firstname": data["Beteiligter"]["Natuerliche_Person"][ - "Voller_Name" - ]["Vorname"], - "lastname": data["Beteiligter"]["Natuerliche_Person"][ - "Voller_Name" - ]["Nachname"], + "firstname": data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"][ + "tns:vollerName" + ]["tns:vorname"], + "lastname": data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"][ + "tns:vollerName" + ]["tns:nachname"], } ), "date_of_birth": parse_date_of_birth(data), "location": Location( **{ - "city": data["Beteiligter"]["Natuerliche_Person"]["Anschrift"][ + "city": data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"]["tns:anschrift"][ -1 - ]["Ort"] + ]["tns:ort"] if isinstance( - data["Beteiligter"]["Natuerliche_Person"]["Anschrift"], list + data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"]["tns:anschrift"], list ) - else data["Beteiligter"]["Natuerliche_Person"]["Anschrift"][ - "Ort" + else data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"]["tns:anschrift"][ + "tns:ort" ] } ), - "role": RelationshipRoleEnum( - data["Rolle"]["Rollenbezeichnung"]["content"] + # TODO get role via ID + "role": map_role_id_to_enum( + data["tns:rolle"]["tns:rollenbezeichnung"]["code"] ), "type": CompanyRelationshipEnum.PERSON, } @@ -207,28 +219,36 @@ def loc_from_beteiligung(data: dict) -> Location: Returns: Location: location """ - base = data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"]["Beteiligung"][0][ - "Beteiligter" - ]["Organisation"]["Anschrift"] + base_path = [ + "tns:grunddaten", + "tns:verfahrensdaten", + "tns:beteiligung", + 0, + "tns:beteiligter", + "tns:auswahl_beteiligter", + "tns:organisation", + "tns:anschrift" + ] + base = traversal(data, base_path) house_number = None street = None - if "Strasse" in base: + if "tns:strasse" in base: regex = r".(\d+)$" - hits = re.findall(regex, base["Strasse"]) + hits = re.findall(regex, base["tns:strasse"]) if len(hits) == 1: house_number = hits[0] - street = base["Strasse"][: (-1 * len(house_number))] - if "Hausnummer" in base: - house_number = house_number + base["Hausnummer"] + street = base["tns:strasse"][: (-1 * len(house_number))] + if "tns:hausnummer" in base: + house_number = house_number + base["tns:hausnummer"] else: - if "Hausnummer" in base: - house_number = base["Hausnummer"] - street = base["Strasse"] + if "tns:hausnummer" in base: + house_number = base["tns:hausnummer"] + street = base["tns:strasse"] return Location( **{ - "city": base["Ort"], - "zip_code": base["Postleitzahl"], + "city": base["tns:ort"], + "zip_code": base["tns:postleitzahl"], "street": normalize_street(street), # type: ignore "house_number": house_number, } @@ -244,9 +264,18 @@ def name_from_beteiligung(data: dict) -> str: Returns: str: Company name """ - name = data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"]["Beteiligung"][0][ - "Beteiligter" - ]["Organisation"]["Bezeichnung"]["Bezeichnung_Aktuell"] + path = [ + "tns:grunddaten", + "tns:verfahrensdaten", + "tns:beteiligung", + 0, + "tns:beteiligter", + "tns:auswahl_beteiligter", + "tns:organisation", + "tns:bezeichnung", + "tns:bezeichnung.aktuell" + ] + name = traversal(data, path) return remove_traling_and_leading_quotes(name) @@ -261,12 +290,18 @@ def map_rechtsform(company_name: str, data: dict) -> CompanyTypeEnum | None: CompanyTypeEnum | None: Company type if found """ try: + path = [ + "tns:fachdatenRegister", + "tns:basisdatenRegister", + "tns:rechtstraeger", + "tns:angabenZurRechtsform", + "tns:rechtsform", + "code" + ] return CompanyTypeEnum( - data["XJustiz_Daten"]["Fachdaten_Register"]["Basisdaten_Register"][ - "Rechtstraeger" - ]["Rechtsform"]["content"] + traversal(data, path) ) - except KeyError: + except Exception: if ( company_name.endswith("GmbH") or company_name.endswith("UG") @@ -291,14 +326,14 @@ def map_capital(data: dict, company_type: CompanyTypeEnum) -> Capital | None: Capital | None: Company Capital if found """ # Early return - if "Zusatzangaben" not in data["XJustiz_Daten"]["Fachdaten_Register"]: + if "tns:auswahl_zusatzangaben" not in data["tns:fachdatenRegister"]: return None capital: dict = {"Zahl": 0.0, "Waehrung": ""} if company_type == CompanyTypeEnum.KG: capital_type = "Hafteinlage" - base = data["XJustiz_Daten"]["Fachdaten_Register"]["Zusatzangaben"][ - "Personengesellschaft" - ]["Zusatz_KG"]["Daten_Kommanditist"] + base = data["tns:fachdatenRegister"]["tns:auswahl_zusatzangaben"][ + "tns:personengesellschaft" + ]["tns:zusatzKG"]["tns:datenKommanditist"] if isinstance(base, list): for entry in base: # TODO link to persons using Ref_Rollennummer then extract ["Hafteinlage"] as below @@ -315,22 +350,22 @@ def map_capital(data: dict, company_type: CompanyTypeEnum) -> Capital | None: CompanyTypeEnum.OHG, ]: if ( - "Kapitalgesellschaft" - not in data["XJustiz_Daten"]["Fachdaten_Register"]["Zusatzangaben"] + "tns:kapitalgesellschaft" + not in data["tns:fachdatenRegister"]["tns:auswahl_zusatzangaben"] ): - base = data["XJustiz_Daten"]["Fachdaten_Register"]["Zusatzangaben"][ - "Personengesellschaft" + base = data["tns:fachdatenRegister"]["tns:auswahl_zusatzangaben"][ + "tns:personengesellschaft" ] else: - base = data["XJustiz_Daten"]["Fachdaten_Register"]["Zusatzangaben"][ - "Kapitalgesellschaft" + base = data["tns:fachdatenRegister"]["tns:auswahl_zusatzangaben"][ + "tns:kapitalgesellschaft" ] - if "Zusatz_GmbH" in base: + if "tns:zusatzGmbH" in base: capital_type = "Stammkapital" - capital = base["Zusatz_GmbH"]["Stammkapital"] - elif "Zusatz_Aktiengesellschaft" in base: + capital = base["tns:zusatzGmbH"]["tns:stammkapital"] + elif "tns:zusatzAktiengesellschaft" in base: capital_type = "Grundkapital" - capital = base["Zusatz_Aktiengesellschaft"]["Grundkapital"]["Hoehe"] + capital = base["tns:zusatzAktiengesellschaft"]["tns:grundkapital"]["tns:hoehe"] elif company_type in [ CompanyTypeEnum.EINZELKAUFMANN, CompanyTypeEnum.EG, @@ -345,8 +380,8 @@ def map_capital(data: dict, company_type: CompanyTypeEnum) -> Capital | None: return None return Capital( **{ # type: ignore - "value": float(capital["Zahl"]), - "currency": CurrencyEnum(capital["Waehrung"]), + "value": float(capital["tns:zahl"]), + "currency": CurrencyEnum(capital["tns:waehrung"]["code"]), "type": CapitalTypeEnum(capital_type), } ) @@ -362,9 +397,12 @@ def map_business_purpose(data: dict) -> str | None: str | None: Business purpose if found """ try: - return data["XJustiz_Daten"]["Fachdaten_Register"]["Basisdaten_Register"][ - "Gegenstand_oder_Geschaeftszweck" + path = [ + "tns:fachdatenRegister", + "tns:basisdatenRegister", + "tns:gegenstand" ] + return traversal(data, path) except KeyError: return None @@ -418,17 +456,65 @@ def map_founding_date(data: dict) -> str | None: if len(entry_date) == 1: return transform_date_to_iso(entry_date[0]) if ( - "Gruendungsmetadaten" - in data["XJustiz_Daten"]["Fachdaten_Register"]["Basisdaten_Register"] + "tns:satzungsdatum" + in data["tns:fachdatenRegister"]["tns:basisdatenRegister"] ): - return extract_date_from_string( - data["XJustiz_Daten"]["Fachdaten_Register"]["Basisdaten_Register"][ - "Gruendungsmetadaten" - ]["Gruendungsdatum"] - ) + path = [ + "tns:fachdatenRegister", + "tns:basisdatenRegister", + "tns:satzungsdatum", + "tns:aktuellesSatzungsdatum" + ] + return traversal(data, path) # No reliable answer return None +def traversal(data: dict, path: list[str | int]) -> any: + current = data + for key in path: + try: + current = current[key] + except: + raise KeyError(f"Key {key} not found") + return current + + +def map_hr_number(data: dict) -> str: + base = data["tns:grunddaten"]["tns:verfahrensdaten"]["tns:instanzdaten"][ + "tns:aktenzeichen" + ]["tns:auswahl_aktenzeichen"] + if "tns:aktenzeichen.strukturiert" in base: + hr_prefix = base["tns:aktenzeichen.strukturiert"]["tns:register"][ + "code" + ] + hr_number = base["tns:aktenzeichen.strukturiert"]["tns:laufendeNummer"] + return f"{hr_prefix} {hr_number}" + elif "tns:aktenzeichen.freitext" in base: + return base["tns:aktenzeichen.freitext"] + return hr_full + +def map_district_court(data: dict) -> DistrictCourt: + base_path = [ + "tns:grunddaten", + "tns:verfahrensdaten", + "tns:beteiligung", + 1, + "tns:beteiligter", + "tns:auswahl_beteiligter", + "tns:organisation" + ] + path = [*base_path, + "tns:bezeichnung", + "tns:bezeichnung.aktuell" + ] + name = traversal(data, path) + path = [*base_path, + "tns:anschrift", + "tns:ort" + ] + city = traversal(data, path) + return DistrictCourt(name=name, city=city) + def map_company_id(data: dict) -> CompanyID: """Retrieve Company ID from export. @@ -441,37 +527,8 @@ def map_company_id(data: dict) -> CompanyID: """ return CompanyID( **{ - "hr_number": data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"][ - "Instanzdaten" - ]["Aktenzeichen"], - "district_court": DistrictCourt( - **{ - "name": data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"][ - "Beteiligung" - ][1]["Beteiligter"]["Organisation"]["Bezeichnung"][ - "Bezeichnung_Aktuell" - ] - if "Organisation" - in data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"][ - "Beteiligung" - ][1]["Beteiligter"] - else data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"][ - "Beteiligung" - ][1]["Beteiligter"]["Natuerliche_Person"]["Voller_Name"][ - "Nachname" - ], - "city": data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"][ - "Beteiligung" - ][1]["Beteiligter"]["Organisation"]["Sitz"]["Ort"] - if "Organisation" - in data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"][ - "Beteiligung" - ][1]["Beteiligter"] - else data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"][ - "Beteiligung" - ][1]["Beteiligter"]["Natuerliche_Person"]["Anschrift"]["Ort"], - } - ), + "hr_number": map_hr_number(data), + "district_court": map_district_court(data) } ) @@ -485,7 +542,12 @@ def map_last_update(data: dict) -> str: Returns: str: Last update date """ - return data["XJustiz_Daten"]["Fachdaten_Register"]["Auszug"]["letzte_Eintragung"] + path = [ + "tns:fachdatenRegister", + "tns:auszug", + "tns:letzteEintragung" + ] + return traversal(data, path) def map_co_relation(data: dict) -> dict: @@ -539,9 +601,10 @@ def map_unternehmensregister_json(data: dict) -> Company: Returns: Company: Transformed data """ + root_key = list(data.keys())[0] + data = data[root_key] result: dict = {"relationships": []} - # TODO Refactor mapping - this is a nightmare... result["id"] = map_company_id(data) result["name"] = name_from_beteiligung(data) @@ -553,11 +616,12 @@ def map_unternehmensregister_json(data: dict) -> Company: result["business_purpose"] = map_business_purpose(data) result["founding_date"] = map_founding_date(data) + # TODO adapt... for i in range( - 2, len(data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"]["Beteiligung"]) + 2, len(data["tns:grunddaten"]["tns:verfahrensdaten"]["tns:beteiligung"]) ): people = parse_stakeholder( - data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"]["Beteiligung"][i] + data["tns:grunddaten"]["tns:verfahrensdaten"]["tns:beteiligung"][i] ) result["relationships"].append(people) result = map_co_relation(result) diff --git a/tmp/transform.py b/tmp/transform.py deleted file mode 100644 index 9178805..0000000 --- a/tmp/transform.py +++ /dev/null @@ -1,652 +0,0 @@ -"""Transform raw Unternehmensregister export (*.xml) to processed .json files for loading.""" -import dataclasses -import glob -import json -import os -import re -import sys - -import xmltodict -from tqdm import tqdm - -from aki_prj23_transparenzregister.models.company import ( - Capital, - CapitalTypeEnum, - Company, - CompanyID, - CompanyRelationship, - CompanyRelationshipEnum, - CompanyToCompanyRelationship, - CompanyTypeEnum, - CurrencyEnum, - DistrictCourt, - Location, - PersonName, - PersonToCompanyRelationship, - RelationshipRoleEnum, -) -from aki_prj23_transparenzregister.utils.string_tools import ( - remove_traling_and_leading_quotes, - transform_date_to_iso, -) - - -def transform_xml_to_json(source_dir: str, target_dir: str) -> None: - """Convert all xml files in a directory to json files. - - Args: - source_dir (str): Directory hosting the xml files - target_dir (str): Target directory to move json files to - """ - if not os.path.exists(target_dir): - os.makedirs(target_dir) - for source_path in [ - os.path.normpath(i) for i in glob.glob(source_dir + "**/*.xml", recursive=True) - ]: - target_path = os.path.join( - target_dir, source_path.split(os.sep)[-1].replace(".xml", ".json") - ) - - with open(source_path, encoding="utf-8") as source_file: - # deepcode ignore HandleUnicode: Weird XML format no other solution - data = xmltodict.parse(source_file.read().encode()) - with open(target_path, "w", encoding="utf-8") as json_file: - json_file.write(json.dumps(data)) - - -def parse_date_of_birth(data: dict) -> str | None: - """Retreives the date of birth from a stakeholder entry if possible. - - Args: - data (dict): Stakeholder data - - Returns: - str | None: date of birth or None if not found - """ - if "tns:geburt" in (base := data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"]): - base = base["tns:geburt"]["tns:geburtsdatum"] - if isinstance(base, str): - return base - return None - -def map_role_id_to_enum(role_id: str) -> RelationshipRoleEnum: - match role_id: - case "086": - return RelationshipRoleEnum.GESCHAEFTSFUEHRER - case "285": - return RelationshipRoleEnum.PROKURIST - case _: - raise KeyError(f'Uknown role_id: {role_id}') - - -def parse_stakeholder(data: dict) -> CompanyRelationship | None: - """Extract the company stakeholder/relation from a single "Beteiligung". - - Args: - data (dict): Data export - - Returns: - CompanyRelationship | None: Relationship if it could be processed - """ - if "tns:natuerlichePerson" in data["tns:beteiligter"]["tns:auswahl_beteiligter"]: - # It's a Company serving as a "Kommanditist" or similar - # if data["Beteiligter"]["Natuerliche_Person"]["Voller_Name"]["Vorname"] is None: - # return CompanyToCompanyRelationship( - # **{ # type: ignore - # "name": remove_traling_and_leading_quotes( - # data["Beteiligter"]["Natuerliche_Person"]["Voller_Name"][ - # "Nachname" - # ] - # ), - # "location": Location( - # **{ - # "city": data["Beteiligter"]["Natuerliche_Person"][ - # "Anschrift" - # ][-1]["Ort"] - # if isinstance( - # data["Beteiligter"]["Natuerliche_Person"]["Anschrift"], - # list, - # ) - # else data["Beteiligter"]["Natuerliche_Person"]["Anschrift"][ - # "Ort" - # ] - # } - # ), - # "role": RelationshipRoleEnum( - # data["Rolle"]["Rollenbezeichnung"]["content"] - # ), - # "type": CompanyRelationshipEnum.COMPANY, - # } - # ) - return PersonToCompanyRelationship( - **{ # type: ignore - "name": PersonName( - **{ - "firstname": data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"][ - "tns:vollerName" - ]["tns:vorname"], - "lastname": data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"][ - "tns:vollerName" - ]["tns:nachname"], - } - ), - "date_of_birth": parse_date_of_birth(data), - "location": Location( - **{ - "city": data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"]["tns:anschrift"][ - -1 - ]["tns:ort"] - if isinstance( - data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"]["tns:anschrift"], list - ) - else data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"]["tns:anschrift"][ - "tns:ort" - ] - } - ), - # TODO get role via ID - "role": map_role_id_to_enum( - data["tns:rolle"]["tns:rollenbezeichnung"]["code"] - ), - "type": CompanyRelationshipEnum.PERSON, - } - ) - if "Organisation" in data["Beteiligter"]: - return CompanyToCompanyRelationship( - **{ # type: ignore - "role": RelationshipRoleEnum( - data["Rolle"]["Rollenbezeichnung"]["content"] - ), - "name": remove_traling_and_leading_quotes( - data["Beteiligter"]["Organisation"]["Bezeichnung"][ - "Bezeichnung_Aktuell" - ] - ), - "location": Location( - **{ - "city": data["Beteiligter"]["Organisation"]["Anschrift"]["Ort"], - "street": data["Beteiligter"]["Organisation"]["Anschrift"][ - "Strasse" - ] - if "Strasse" in data["Beteiligter"]["Organisation"]["Anschrift"] - else None, - "house_number": data["Beteiligter"]["Organisation"][ - "Anschrift" - ]["Hausnummer"] - if "Hausnummer" - in data["Beteiligter"]["Organisation"]["Anschrift"] - else None, - "zip_code": data["Beteiligter"]["Organisation"]["Anschrift"][ - "Postleitzahl" - ] - if "Postleitzahl" - in data["Beteiligter"]["Organisation"]["Anschrift"] - else None, - } - ), - "type": CompanyRelationshipEnum.COMPANY, - } - ) - return None - - -def normalize_street(street: str) -> str: - """Normalize street names by extending them to `Straße` or `straße`. - - Args: - street (str): Name of street - - Returns: - str: Normalized street name - """ - if street is None: - return None - regex = r"(Str\.|Strasse)" - street = re.sub(regex, "Straße", street) - regex = r"(str\.|strasse)" - street = re.sub(regex, "straße", street) - return street.strip() - - -def loc_from_beteiligung(data: dict) -> Location: - """Extract the company location from the first relationship in the export. - - Args: - data (dict): Data export - - Returns: - Location: location - """ - base_path = [ - "tns:grunddaten", - "tns:verfahrensdaten", - "tns:beteiligung", - 0, - "tns:beteiligter", - "tns:auswahl_beteiligter", - "tns:organisation", - "tns:anschrift" - ] - base = traversal(data, base_path) - - house_number = None - street = None - if "tns:strasse" in base: - regex = r".(\d+)$" - hits = re.findall(regex, base["tns:strasse"]) - if len(hits) == 1: - house_number = hits[0] - street = base["tns:strasse"][: (-1 * len(house_number))] - if "tns:hausnummer" in base: - house_number = house_number + base["tns:hausnummer"] - else: - if "tns:hausnummer" in base: - house_number = base["tns:hausnummer"] - street = base["tns:strasse"] - return Location( - **{ - "city": base["tns:ort"], - "zip_code": base["tns:postleitzahl"], - "street": normalize_street(street), # type: ignore - "house_number": house_number, - } - ) - - -def name_from_beteiligung(data: dict) -> str: - """Extract the Company name from an Unternehmensregister export by using the first relationship found. - - Args: - data (dict): Data export - - Returns: - str: Company name - """ - path = [ - "tns:grunddaten", - "tns:verfahrensdaten", - "tns:beteiligung", - 0, - "tns:beteiligter", - "tns:auswahl_beteiligter", - "tns:organisation", - "tns:bezeichnung", - "tns:bezeichnung.aktuell" - ] - name = traversal(data, path) - return remove_traling_and_leading_quotes(name) - - -def map_rechtsform(company_name: str, data: dict) -> CompanyTypeEnum | None: - """Extracts the company type from a given Unternehmensregister export. - - Args: - company_name (str): Name of the company as a fallback solution - data (dict): Data export - - Returns: - CompanyTypeEnum | None: Company type if found - """ - try: - path = [ - "tns:fachdatenRegister", - "tns:basisdatenRegister", - "tns:rechtstraeger", - "tns:angabenZurRechtsform", - "tns:rechtsform", - "code" - ] - return CompanyTypeEnum( - traversal(data, path) - ) - except Exception: - if ( - company_name.endswith("GmbH") - or company_name.endswith("UG") - or company_name.endswith("UG (haftungsbeschränkt)") - ): - return CompanyTypeEnum("Gesellschaft mit beschränkter Haftung") - if company_name.endswith("SE"): - return CompanyTypeEnum("Europäische Aktiengesellschaft (SE)") - if company_name.endswith("KG"): - return CompanyTypeEnum("Kommanditgesellschaft") - return None - - -def map_capital(data: dict, company_type: CompanyTypeEnum) -> Capital | None: - """Extracts the company capital from the given Unternehmensregister export. - - Args: - data (dict): Data export - company_type (CompanyTypeEnum): Type of company (e.g., 'Gesellschaft mit beschränkter Haftung') - - Returns: - Capital | None: Company Capital if found - """ - # Early return - if "tns:auswahl_zusatzangaben" not in data["tns:fachdatenRegister"]: - return None - capital: dict = {"Zahl": 0.0, "Waehrung": ""} - if company_type == CompanyTypeEnum.KG: - capital_type = "Hafteinlage" - base = data["tns:fachdatenRegister"]["tns:auswahl_zusatzangaben"][ - "tns:personengesellschaft" - ]["tns:zusatzKG"]["tns:datenKommanditist"] - if isinstance(base, list): - for entry in base: - # TODO link to persons using Ref_Rollennummer then extract ["Hafteinlage"] as below - capital["Zahl"] = capital["Zahl"] + float(entry["Hafteinlage"]["Zahl"]) - capital["Waehrung"] = entry["Hafteinlage"]["Waehrung"] - elif isinstance(base, dict): - capital = base["Hafteinlage"] - elif company_type in [ - CompanyTypeEnum.GMBH, - CompanyTypeEnum.SE, - CompanyTypeEnum.AG, - CompanyTypeEnum.KGaA, - CompanyTypeEnum.AUSLAENDISCHE_RECHTSFORM, - CompanyTypeEnum.OHG, - ]: - if ( - "tns:kapitalgesellschaft" - not in data["tns:fachdatenRegister"]["tns:auswahl_zusatzangaben"] - ): - base = data["tns:fachdatenRegister"]["tns:auswahl_zusatzangaben"][ - "tns:personengesellschaft" - ] - else: - base = data["tns:fachdatenRegister"]["tns:auswahl_zusatzangaben"][ - "tns:kapitalgesellschaft" - ] - if "tns:zusatzGmbH" in base: - capital_type = "Stammkapital" - capital = base["tns:zusatzGmbH"]["tns:stammkapital"] - elif "tns:zusatzAktiengesellschaft" in base: - capital_type = "Grundkapital" - capital = base["tns:zusatzAktiengesellschaft"]["tns:grundkapital"]["tns:zahl"] - elif company_type in [ - CompanyTypeEnum.EINZELKAUFMANN, - CompanyTypeEnum.EG, - CompanyTypeEnum.PARTNERSCHAFT, - CompanyTypeEnum.PARTNERGESELLSCHAFT, - CompanyTypeEnum.PARTNERSCHAFTSGESELLSCHAFT, - None, - ]: - return None - # Catch entries having the dict but with null values - if not all(capital.values()): - return None - return Capital( - **{ # type: ignore - "value": float(capital["tns:zahl"]), - "currency": CurrencyEnum(capital["tns:waehrung"]["code"]), - "type": CapitalTypeEnum(capital_type), - } - ) - - -def map_business_purpose(data: dict) -> str | None: - """Extracts the "Geschäftszweck" from a given Unternehmensregister export. - - Args: - data (dict): Data export - - Returns: - str | None: Business purpose if found - """ - try: - path = [ - "tns:fachdatenRegister", - "tns:basisdatenRegister", - "tns:gegenstand" - ] - return traversal(data, path) - except KeyError: - return None - - -def extract_date_from_string(value: str) -> str | None: - """Extract a date in ISO format from the given string if possible. - - Args: - value (str): Input text - - Returns: - str | None: Date in ISO format, None if not found - """ - date_regex = [ # type: ignore - {"regex": r"\d{1,2}\.\d{1,2}\.\d{4}", "mapper": transform_date_to_iso}, - {"regex": r"\d{4}-\d{1,2}-\d{1,2}", "mapper": None}, - ] - results = [] - for regex in date_regex: - result = re.findall(regex["regex"], value) # type: ignore - if len(result) == 1: - relevant_data = result[0] - if regex["mapper"] is not None: # type: ignore - results.append(regex["mapper"](relevant_data)) # type: ignore - else: - results.append(relevant_data) - if len(results) != 1: - return None - return results[0] - - -def map_founding_date(data: dict) -> str | None: - """Extracts the founding date from a given Unternehmensregister export. - - Args: - data (dict): Data export - - Returns: - str | None: Founding date if found - """ - text = str(data) - entry_date = re.findall( - r".Tag der ersten Eintragung:(\\n| )?(\d{1,2}\.\d{1,2}\.\d{2,4})", text - ) - if len(entry_date) == 1: - return transform_date_to_iso(entry_date[0][1]) - - entry_date = re.findall( - r".Gesellschaftsvertrag vom (\d{1,2}\.\d{1,2}\.\d{2,4})", text - ) - if len(entry_date) == 1: - return transform_date_to_iso(entry_date[0]) - if ( - "tns:satzungsdatum" - in data["tns:fachdatenRegister"]["tns:basisdatenRegister"] - ): - path = [ - "tns:fachdatenRegister", - "tns:basisdatenRegister", - "tns:satzungsdatum", - "tns:aktuellesSatzungsdatum" - ] - return traversal(data, path) - # No reliable answer - return None - -def traversal(data: dict, path: list[str | int]) -> any: - current = data - for key in path: - try: - current = current[key] - except: - raise KeyError(f"Key {key} not found") - return current - - -def map_hr_number(data: dict) -> str: - hr_prefix = data["tns:grunddaten"]["tns:verfahrensdaten"]["tns:instanzdaten"][ - "tns:aktenzeichen" - ]["tns:auswahl_aktenzeichen"]["tns:aktenzeichen.strukturiert"]["tns:register"][ - "code" - ] - hr_number = data["tns:grunddaten"]["tns:verfahrensdaten"]["tns:instanzdaten"][ - "tns:aktenzeichen" - ]["tns:auswahl_aktenzeichen"]["tns:aktenzeichen.strukturiert"]["tns:laufendeNummer"] - hr_full = f"{hr_prefix} {hr_number}" - return hr_full - -def map_district_court(data: dict) -> DistrictCourt: - base_path = [ - "tns:grunddaten", - "tns:verfahrensdaten", - "tns:beteiligung", - 1, - "tns:beteiligter", - "tns:auswahl_beteiligter", - "tns:organisation" - ] - path = [*base_path, - "tns:bezeichnung", - "tns:bezeichnung.aktuell" - ] - name = traversal(data, path) - path = [*base_path, - "tns:sitz", - "tns:ort" - ] - city = traversal(data, path) - return DistrictCourt(name=name, city=city) - - -def map_company_id(data: dict) -> CompanyID: - """Retrieve Company ID from export. - - Args: - data (dict): Data export - - Returns: - CompanyID: ID of the company - """ - return CompanyID( - **{ - "hr_number": map_hr_number(data), - "district_court": map_district_court(data) - } - ) - - -def map_last_update(data: dict) -> str: - """Extract last update date from export. - - Args: - data (dict): Unternehmensregister export - - Returns: - str: Last update date - """ - path = [ - "tns:fachdatenRegister", - "tns:auszug", - "tns:letzteEintragung" - ] - return traversal(data, path) - - -def map_co_relation(data: dict) -> dict: - """Search for and map the c/o relation from location.street if possible. - - Args: - data (dict): Company dict - - Returns: - dict: Modified Company dict - """ - street = data["location"].street - if street is None: - return data - parts = street.split(",") - co_company = None - co_company_index = None - for index, part in enumerate(parts): - trimmed_part = part.strip() - result = re.findall(r"^c\/o(.*)$", trimmed_part) - if len(result) == 1: - co_company = result[0].strip() - co_company_index = index - if co_company_index is not None: - del parts[co_company_index] - street = "".join(parts).strip() - data["location"].street = street - - if co_company is not None and co_company != "": - relation = CompanyToCompanyRelationship( - RelationshipRoleEnum.CARE_OF, # type: ignore - Location( - data["location"].city, - street, - data["location"].house_number, - data["location"].zip_code, - ), - CompanyRelationshipEnum.COMPANY, # type: ignore - co_company, - ) - data["relationships"].append(relation) - return data - - -def map_unternehmensregister_json(data: dict) -> Company: - """Processes the Unternehmensregister structured export to a Company by using several helper methods. - - Args: - data (dict): Data export - - Returns: - Company: Transformed data - """ - root_key = list(data.keys())[0] - data = data[root_key] - result: dict = {"relationships": []} - - result["id"] = map_company_id(data) - result["name"] = name_from_beteiligung(data) - - result["location"] = loc_from_beteiligung(data) - result["last_update"] = map_last_update(data) - - result["company_type"] = map_rechtsform(result["name"], data) - result["capital"] = map_capital(data, result["company_type"]) - result["business_purpose"] = map_business_purpose(data) - result["founding_date"] = map_founding_date(data) - - # TODO adapt... - for i in range( - 2, len(data["tns:grunddaten"]["tns:verfahrensdaten"]["tns:beteiligung"]) - ): - people = parse_stakeholder( - data["tns:grunddaten"]["tns:verfahrensdaten"]["tns:beteiligung"][i] - ) - result["relationships"].append(people) - result = map_co_relation(result) - return Company(**result) - - -if __name__ == "__main__": - from loguru import logger - - base_path = "./Jupyter/API-tests/Unternehmensregister/data/Unternehmensregister" - for file in tqdm(glob.glob1(f"{base_path}/export", "*.json")): - path = os.path.join(f"{base_path}/export", file) - with open(path, encoding="utf-8") as file_object: - try: - company: Company = map_unternehmensregister_json( - json.loads(file_object.read()) - ) - - name = "".join(e for e in company.name if e.isalnum())[:50] - - with open( - f"{base_path}/transformed/{name}.json", - "w+", - encoding="utf-8", - ) as export_file: - json.dump( - dataclasses.asdict(company), export_file, ensure_ascii=False - ) - except Exception as e: - logger.error(e) - logger.error(f"Error in processing {path}") - sys.exit(1) diff --git a/tmp/transformation.ipynb b/tmp/transformation.ipynb index 915b440..9a560bb 100644 --- a/tmp/transformation.ipynb +++ b/tmp/transformation.ipynb @@ -2,14 +2,14 @@ "cells": [ { "cell_type": "code", - "execution_count": 8, + "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Company(id=CompanyID(district_court=DistrictCourt(name='Amtsgericht Hamm', city='Hamm'), hr_number='HRB 5363'), location=Location(city='Bönen', street='Siemensstraße', house_number='25-27', zip_code='59199'), name='GEA Farm Technologies GmbH', last_update='2023-10-27', relationships=[PersonToCompanyRelationship(role=, location=Location(city='Oelde', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Reinhard', lastname='Gebing'), date_of_birth='1964-04-26'), PersonToCompanyRelationship(role=, location=Location(city='Wetter', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Markus', lastname='Kreft'), date_of_birth='1966-04-03'), PersonToCompanyRelationship(role=, location=Location(city='Holzminden', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Kai', lastname='Luntz'), date_of_birth='1970-12-04'), PersonToCompanyRelationship(role=, location=Location(city='Rheda-Wiedenbrück', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Thomas', lastname='Mader'), date_of_birth='1972-05-24'), PersonToCompanyRelationship(role=, location=Location(city='Düsseldorf', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Peter', lastname='Lauwers'), date_of_birth='1970-03-26'), PersonToCompanyRelationship(role=, location=Location(city='Erkrath', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Erkul', lastname='Basaran'), date_of_birth='1977-05-06'), PersonToCompanyRelationship(role=, location=Location(city='Bochum', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Henrik', lastname='Böttner'), date_of_birth='1982-11-07'), PersonToCompanyRelationship(role=, location=Location(city='Oelde', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Ulrich', lastname='Raßenhövel'), date_of_birth='1969-04-16'), PersonToCompanyRelationship(role=, location=Location(city='Herdecke', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Andreas', lastname='Naroska'), date_of_birth='1967-03-23'), PersonToCompanyRelationship(role=, location=Location(city='Witten', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Mark', lastname='Kramps'), date_of_birth='1967-09-04'), PersonToCompanyRelationship(role=, location=Location(city='Dortmund', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Ralf', lastname='Barkmeyer'), date_of_birth='1974-02-28'), PersonToCompanyRelationship(role=, location=Location(city='Tönnisvorst', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Holger', lastname='Siegwarth'), date_of_birth='1967-05-13'), PersonToCompanyRelationship(role=, location=Location(city='Herne', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Oliver', lastname='Liß'), date_of_birth='1981-04-13'), PersonToCompanyRelationship(role=, location=Location(city='Göppingen', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Liang', lastname='Cheng'), date_of_birth='1980-12-29'), PersonToCompanyRelationship(role=, location=Location(city='Beckum', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Astrid', lastname='Dörner-Rodeheger'), date_of_birth='1968-12-24'), PersonToCompanyRelationship(role=, location=Location(city='Dortmund', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Jon', lastname='Lange'), date_of_birth='1978-04-25'), PersonToCompanyRelationship(role=, location=Location(city='Werne', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Ralf', lastname='Frombach'), date_of_birth='1977-01-25'), PersonToCompanyRelationship(role=, location=Location(city='Berlin', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Sven', lastname='Hommel'), date_of_birth='1979-04-22'), PersonToCompanyRelationship(role=, location=Location(city='Oberhausen', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Matthias', lastname='Peters'), date_of_birth='1973-08-28')], company_type=, capital=Capital(value=5115000.0, currency=, type=), business_purpose='Entwicklung, Herstellung und der Vertrieb von Landtechnik, insbesondere von Komponenten und Anlagen (a) zur Gewinnung, Kühlung, Behandlung und Lagerung von Milch; (b) für das Milchvieh-Herdenmanagement; (c) zur Tierhygiene und Sicherung der Milchqualität und (d) zur Aufstallung von Tieren; sowie die Herstellung und der Vertrieb von Anlagen und Fahrzeugen zur Aufbereitung und zum Transport von Gülle.', founding_date='1995-04-25')\n" + "Company(id=CompanyID(district_court=DistrictCourt(name='Amtsgericht Hamm', city='Hamm'), hr_number='HRB 5363'), location=Location(city='Bönen', street='Siemensstraße', house_number='25-27', zip_code='59199'), name='GEA Farm Technologies GmbH', last_update='2023-10-27', relationships=[PersonToCompanyRelationship(role=, location=Location(city='Oelde', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Reinhard', lastname='Gebing'), date_of_birth='1964-04-26'), PersonToCompanyRelationship(role=, location=Location(city='Wetter', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Markus', lastname='Kreft'), date_of_birth='1966-04-03'), PersonToCompanyRelationship(role=, location=Location(city='Holzminden', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Kai', lastname='Luntz'), date_of_birth='1970-12-04'), PersonToCompanyRelationship(role=, location=Location(city='Rheda-Wiedenbrück', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Thomas', lastname='Mader'), date_of_birth='1972-05-24'), PersonToCompanyRelationship(role=, location=Location(city='Düsseldorf', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Peter', lastname='Lauwers'), date_of_birth='1970-03-26'), PersonToCompanyRelationship(role=, location=Location(city='Erkrath', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Erkul', lastname='Basaran'), date_of_birth='1977-05-06'), PersonToCompanyRelationship(role=, location=Location(city='Bochum', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Henrik', lastname='Böttner'), date_of_birth='1982-11-07'), PersonToCompanyRelationship(role=, location=Location(city='Oelde', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Ulrich', lastname='Raßenhövel'), date_of_birth='1969-04-16'), PersonToCompanyRelationship(role=, location=Location(city='Herdecke', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Andreas', lastname='Naroska'), date_of_birth='1967-03-23'), PersonToCompanyRelationship(role=, location=Location(city='Witten', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Mark', lastname='Kramps'), date_of_birth='1967-09-04'), PersonToCompanyRelationship(role=, location=Location(city='Dortmund', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Ralf', lastname='Barkmeyer'), date_of_birth='1974-02-28'), PersonToCompanyRelationship(role=, location=Location(city='Tönnisvorst', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Holger', lastname='Siegwarth'), date_of_birth='1967-05-13'), PersonToCompanyRelationship(role=, location=Location(city='Herne', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Oliver', lastname='Liß'), date_of_birth='1981-04-13'), PersonToCompanyRelationship(role=, location=Location(city='Göppingen', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Liang', lastname='Cheng'), date_of_birth='1980-12-29'), PersonToCompanyRelationship(role=, location=Location(city='Beckum', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Astrid', lastname='Dörner-Rodeheger'), date_of_birth='1968-12-24'), PersonToCompanyRelationship(role=, location=Location(city='Dortmund', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Jon', lastname='Lange'), date_of_birth='1978-04-25'), PersonToCompanyRelationship(role=, location=Location(city='Werne', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Ralf', lastname='Frombach'), date_of_birth='1977-01-25'), PersonToCompanyRelationship(role=, location=Location(city='Berlin', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Sven', lastname='Hommel'), date_of_birth='1979-04-22'), PersonToCompanyRelationship(role=, location=Location(city='Oberhausen', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Matthias', lastname='Peters'), date_of_birth='1973-08-28')], company_type=, capital=Capital(value=5115000.0, currency=, type=), business_purpose='Entwicklung, Herstellung und der Vertrieb von Landtechnik, insbesondere\\n von Komponenten und Anlagen (a) zur Gewinnung, Kühlung, Behandlung und Lagerung von\\n Milch; (b) für das Milchvieh-Herdenmanagement; (c) zur Tierhygiene und Sicherung der\\n Milchqualität und (d) zur Aufstallung von Tieren; sowie die Herstellung und der\\n Vertrieb von Anlagen und Fahrzeugen zur Aufbereitung und zum Transport von Gülle.', founding_date='1995-04-25')\n" ] } ], @@ -26,40 +26,63 @@ " json.dump(dataclasses.asdict(company_data), file, ensure_ascii=False)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Test" - ] - }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 2, "metadata": {}, "outputs": [ { - "ename": "AssertionError", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mAssertionError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32mc:\\Users\\trist\\Documents\\Code\\M.Sc\\aki_prj23_transparenzregister\\tmp\\transformation.ipynb Cell 3\u001b[0m line \u001b[0;36m9\n\u001b[0;32m 7\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39m(\u001b[39m'\u001b[39m\u001b[39m../tmp/tests/GEAFarmTechnologiesGmbH.json\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mr\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39mas\u001b[39;00m file:\n\u001b[0;32m 8\u001b[0m expected_result \u001b[39m=\u001b[39m json\u001b[39m.\u001b[39mload(file)\n\u001b[1;32m----> 9\u001b[0m \u001b[39massert\u001b[39;00m result \u001b[39m==\u001b[39m expected_result\n", - "\u001b[1;31mAssertionError\u001b[0m: " + "name": "stdout", + "output_type": "stream", + "text": [ + "Company(id=CompanyID(district_court=DistrictCourt(name='Amtsgericht Charlottenburg', city='Berlin'), hr_number='HRB 153385 B'), location=Location(city='Berlin', street='Valeska-Gert-Straße', house_number='5', zip_code='10243'), name='Zalando Lounge Service GmbH', last_update='2022-10-05', relationships=[PersonToCompanyRelationship(role=, location=Location(city='Berlin', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Martin', lastname='Rost'), date_of_birth='1982-09-24'), PersonToCompanyRelationship(role=, location=Location(city='Berlin', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Karen', lastname='Kennes'), date_of_birth='1979-06-22'), PersonToCompanyRelationship(role=, location=Location(city='Berlin', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Andre', lastname='Hörschel'), date_of_birth='1973-06-15')], company_type=, capital=Capital(value=25000.0, currency=, type=), business_purpose='Die Erbringung von Dienstleistungen für e-Commerce Unternehmen im Bereich Kundenservice und Logistik.', founding_date='2014-02-05')\n" ] } ], "source": [ "import json\n", + "import dataclasses\n", + "from transform import map_unternehmensregister_json\n", "\n", - "result = None\n", - "expected_result = None\n", - "with open('../tmp/transformed/GEAFarmTechnologiesGmbH.json', 'r') as file_a:\n", - " result = json.load(file_a)\n", - "with open('../tmp/tests/GEAFarmTechnologiesGmbH.json', \"r\") as file:\n", - " expected_result = json.load(file)\n", - " assert result == expected_result" + "with open('../tmp/json/ZalandoLoungeServiceGmbH.json', \"r\") as file:\n", + " content = json.load(file)\n", + " company_data = map_unternehmensregister_json(content)\n", + " print(company_data)\n", + " with open('../tmp/transformed/ZalandoLoungeServiceGmbH.json', \"w+\", encoding=\"utf-8\") as file:\n", + " json.dump(dataclasses.asdict(company_data), file, ensure_ascii=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Company(id=CompanyID(district_court=DistrictCourt(name='Amtsgericht Charlottenburg', city='Berlin'), hr_number='HRB 158855 B'), location=Location(city='Berlin', street='Valeska-Gert-Straße', house_number='5', zip_code='10243'), name='Zalando SE', last_update='2023-07-04', relationships=[PersonToCompanyRelationship(role=, location=Location(city='Berlin', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Robert', lastname='Gentz'), date_of_birth='1983-09-24'), PersonToCompanyRelationship(role=, location=Location(city='Berlin', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='David', lastname='Schneider'), date_of_birth='1982-07-29'), PersonToCompanyRelationship(role=, location=Location(city='Berlin', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='David', lastname='Schröder'), date_of_birth='1982-11-27'), PersonToCompanyRelationship(role=, location=Location(city='Berlin', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Astrid', lastname='Arndt'), date_of_birth='1971-11-05'), PersonToCompanyRelationship(role=, location=Location(city='Passau', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Sandra', lastname='Dembeck'), date_of_birth='1974-03-06'), PersonToCompanyRelationship(role=, location=Location(city='Berlin', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Martin', lastname='Rost'), date_of_birth='1982-09-24'), PersonToCompanyRelationship(role=, location=Location(city='Berlin', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Jan-Hendrik', lastname='Bartels'), date_of_birth='1980-11-05'), PersonToCompanyRelationship(role=, location=Location(city='Berlin', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Ulrich', lastname='Kalk'), date_of_birth='1978-05-27'), PersonToCompanyRelationship(role=, location=Location(city='Berlin', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Anne', lastname='Pascual'), date_of_birth='1976-01-03'), PersonToCompanyRelationship(role=, location=Location(city='Knokke/Belgien', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Bruno', lastname='Vanhoorickx'), date_of_birth='1981-08-25'), PersonToCompanyRelationship(role=, location=Location(city='Berlin', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Andreas', lastname='Antrup'), date_of_birth='1983-06-27'), PersonToCompanyRelationship(role=, location=Location(city='Berlin', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Lena', lastname='Wallenhorst'), date_of_birth='1978-07-20')], company_type=, capital=Capital(value=263531672.0, currency=, type=), business_purpose='Die Entwicklung, Vermarktung und Erbringung von Internetdienstleistungen (E-Commerce-Handel mit Waren verschiedener Art, insbesondere Bekleidung und Schuhe), die Entwicklung, Herstellung, Vermarktung und der Handel mit solchen Waren, insbesondere Bekleidung und Schuhe, die Erbringung von Logistikdienstleistungen, digitalen Dienstleistungen und alle mit dem vorgenannten Unternehmensgegenstand zusammenhängenden Dienstleistungen.', founding_date='2023-05-24')\n" + ] + } + ], + "source": [ + "import json\n", + "import dataclasses\n", + "from transform import map_unternehmensregister_json\n", + "\n", + "with open('../tmp/json/ZalandoSE.json', \"r\") as file:\n", + " content = json.load(file)\n", + " company_data = map_unternehmensregister_json(content)\n", + " print(company_data)\n", + " with open('../tmp/transformed/ZalandoSE.json', \"w+\", encoding=\"utf-8\") as file:\n", + " json.dump(dataclasses.asdict(company_data), file, ensure_ascii=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Test" ] } ], From 2458ad98ff15f8ffc2682ede0eb9c873c6539886 Mon Sep 17 00:00:00 2001 From: TrisNol Date: Fri, 3 Nov 2023 11:35:45 +0100 Subject: [PATCH 05/14] checkpoint: Refactoring data-extraction from unternehmensregister to handle v1 and v3 --- .../apps/find_missing_companies.py | 44 +- .../transform/__init__.py | 0 .../unternehmensregister/transform/common.py | 0 .../unternehmensregister/transform/main.py | 81 + .../transform/v1/__init__.py | 0 .../unternehmensregister/transform/v1/v1.py | 569 ++++ .../transform/v3/__init__.py | 0 .../xjustiz_0040_cl_rollenbezeichnung_3_3.xsd | 2714 +++++++++++++++++ .../transform/v3/role_mapper.py | 34 + .../{transform.py => transform/v3/v3.py} | 288 +- tmp/transformation.ipynb | 1121 ++++++- 11 files changed, 4671 insertions(+), 180 deletions(-) create mode 100644 src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/__init__.py create mode 100644 src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/common.py create mode 100644 src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/main.py create mode 100644 src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v1/__init__.py create mode 100644 src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v1/v1.py create mode 100644 src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/__init__.py create mode 100644 src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/assets/xjustiz_0040_cl_rollenbezeichnung_3_3.xsd create mode 100644 src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/role_mapper.py rename src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/{transform.py => transform/v3/v3.py} (69%) diff --git a/src/aki_prj23_transparenzregister/apps/find_missing_companies.py b/src/aki_prj23_transparenzregister/apps/find_missing_companies.py index c8aa78b..f8a7b5b 100644 --- a/src/aki_prj23_transparenzregister/apps/find_missing_companies.py +++ b/src/aki_prj23_transparenzregister/apps/find_missing_companies.py @@ -24,8 +24,8 @@ from aki_prj23_transparenzregister.utils.sql import entities from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister import ( extract, load, - transform, ) +from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform import main as transform if __name__ == "__main__": parser = argparse.ArgumentParser( @@ -43,8 +43,8 @@ if __name__ == "__main__": parsed = parser.parse_args(sys.argv[1:]) configer_logger(namespace=parsed) config = parsed.config - session = connector.get_session(get_config_provider(config)) - missing_companies = session.query(entities.MissingCompany).all() + # session = connector.get_session(get_config_provider(config)) + # missing_companies = session.query(entities.MissingCompany).all() counter = 0 # # Scrape data from unternehmensregister @@ -63,22 +63,24 @@ if __name__ == "__main__": for file in tqdm(glob.glob1(json_dir, "*.json")): path = os.path.join(json_dir, file) with open(path, encoding="utf-8") as file_object: - try: - company = transform.map_unternehmensregister_json( - json.loads(file_object.read()) + # try: + print(path) + company = transform.map_unternehmensregister_json( + json.loads(file_object.read()) + ) + + name = "".join(e for e in company.name if e.isalnum())[:50] + + with open( + f"{output_path}/{name}.json", + "w+", + encoding="utf-8", + ) as export_file: + json.dump( + dataclasses.asdict(company), export_file, ensure_ascii=False ) - - name = "".join(e for e in company.name if e.isalnum())[:50] - - with open( - f"{output_path}/{name}.json", - "w+", - encoding="utf-8", - ) as export_file: - json.dump( - dataclasses.asdict(company), export_file, ensure_ascii=False - ) - except Exception as e: - logger.error(e) - logger.error(f"Error in processing {path}") - sys.exit(1) \ No newline at end of file + # except Exception as e: + # logger.error(e.with_traceback()) + # logger.error(e) + # logger.error(f"Error in processing {path}") + # sys.exit(1) \ No newline at end of file diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/__init__.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/common.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/common.py new file mode 100644 index 0000000..e69de29 diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/main.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/main.py new file mode 100644 index 0000000..717c4d1 --- /dev/null +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/main.py @@ -0,0 +1,81 @@ +"""Transform raw Unternehmensregister export (*.xml) to processed .json files for loading.""" +import dataclasses +import glob +import json +import os +import re +import sys + +import xmltodict +from tqdm import tqdm + +from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1 import v1 +from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v3 import v3 +from aki_prj23_transparenzregister.models.company import Company + +def transform_xml_to_json(source_dir: str, target_dir: str) -> None: + """Convert all xml files in a directory to json files. + + Args: + source_dir (str): Directory hosting the xml files + target_dir (str): Target directory to move json files to + """ + for source_path in [ + os.path.normpath(i) for i in glob.glob(source_dir + "**/*.xml", recursive=True) + ]: + target_path = os.path.join( + target_dir, source_path.split(os.sep)[-1].replace(".xml", ".json") + ) + + with open(source_path, encoding="utf-8") as source_file: + # deepcode ignore HandleUnicode: Weird XML format no other solution + data = xmltodict.parse(source_file.read().encode()) + with open(target_path, "w", encoding="utf-8") as json_file: + json_file.write(json.dumps(data)) + +def determine_version(data: dict): + if "XJustiz_Daten" in data: + return v1 + elif "tns:nachrichtenkopf" in data[list(data.keys())[0]]: + return v3 + raise ValueError("Could not determine Unternehmensregister version.") + +def map_unternehmensregister_json(data: dict) -> Company: + """Processes the Unternehmensregister structured export to a Company by using several helper methods. + + Args: + data (dict): Data export + + Returns: + Company: Transformed data + """ + version = determine_version(data) + return version.map_unternehmensregister_json(data) + + +if __name__ == "__main__": + from loguru import logger + + base_path = "./Jupyter/API-tests/Unternehmensregister/data/Unternehmensregister" + for file in tqdm(glob.glob1(f"{base_path}/export", "*.json")): + path = os.path.join(f"{base_path}/export", file) + with open(path, encoding="utf-8") as file_object: + try: + company: Company = map_unternehmensregister_json( + json.loads(file_object.read()) + ) + + name = "".join(e for e in company.name if e.isalnum())[:50] + + with open( + f"{base_path}/transformed/{name}.json", + "w+", + encoding="utf-8", + ) as export_file: + json.dump( + dataclasses.asdict(company), export_file, ensure_ascii=False + ) + except Exception as e: + logger.error(e) + logger.error(f"Error in processing {path}") + sys.exit(1) diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v1/__init__.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v1/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v1/v1.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v1/v1.py new file mode 100644 index 0000000..95405cb --- /dev/null +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v1/v1.py @@ -0,0 +1,569 @@ +"""Transform raw Unternehmensregister export (*.xml) to processed .json files for loading.""" +import dataclasses +import glob +import json +import os +import re +import sys + +import xmltodict +from tqdm import tqdm + +from aki_prj23_transparenzregister.models.company import ( + Capital, + CapitalTypeEnum, + Company, + CompanyID, + CompanyRelationship, + CompanyRelationshipEnum, + CompanyToCompanyRelationship, + CompanyTypeEnum, + CurrencyEnum, + DistrictCourt, + Location, + PersonName, + PersonToCompanyRelationship, + RelationshipRoleEnum, +) +from aki_prj23_transparenzregister.utils.string_tools import ( + remove_traling_and_leading_quotes, + transform_date_to_iso, +) + + +def parse_date_of_birth(data: dict) -> str | None: + """Retreives the date of birth from a stakeholder entry if possible. + + Args: + data (dict): Stakeholder data + + Returns: + str | None: date of birth or None if not found + """ + if "Geburt" in (base := data["Beteiligter"]["Natuerliche_Person"]): + base = base["Geburt"]["Geburtsdatum"] + if isinstance(base, str): + return base + return None + + +def parse_stakeholder(data: dict) -> CompanyRelationship | None: + """Extract the company stakeholder/relation from a single "Beteiligung". + + Args: + data (dict): Data export + + Returns: + CompanyRelationship | None: Relationship if it could be processed + """ + if "Natuerliche_Person" in data["Beteiligter"]: + # It's a Company serving as a "Kommanditist" or similar + if data["Beteiligter"]["Natuerliche_Person"]["Voller_Name"]["Vorname"] is None: + return CompanyToCompanyRelationship( + **{ # type: ignore + "name": remove_traling_and_leading_quotes( + data["Beteiligter"]["Natuerliche_Person"]["Voller_Name"][ + "Nachname" + ] + ), + "location": Location( + **{ + "city": data["Beteiligter"]["Natuerliche_Person"][ + "Anschrift" + ][-1]["Ort"] + if isinstance( + data["Beteiligter"]["Natuerliche_Person"]["Anschrift"], + list, + ) + else data["Beteiligter"]["Natuerliche_Person"]["Anschrift"][ + "Ort" + ] + } + ), + "role": RelationshipRoleEnum( + data["Rolle"]["Rollenbezeichnung"]["content"] + ), + "type": CompanyRelationshipEnum.COMPANY, + } + ) + return PersonToCompanyRelationship( + **{ # type: ignore + "name": PersonName( + **{ + "firstname": data["Beteiligter"]["Natuerliche_Person"][ + "Voller_Name" + ]["Vorname"], + "lastname": data["Beteiligter"]["Natuerliche_Person"][ + "Voller_Name" + ]["Nachname"], + } + ), + "date_of_birth": parse_date_of_birth(data), + "location": Location( + **{ + "city": data["Beteiligter"]["Natuerliche_Person"]["Anschrift"][ + -1 + ]["Ort"] + if isinstance( + data["Beteiligter"]["Natuerliche_Person"]["Anschrift"], list + ) + else data["Beteiligter"]["Natuerliche_Person"]["Anschrift"][ + "Ort" + ] + } + ), + "role": RelationshipRoleEnum( + data["Rolle"]["Rollenbezeichnung"]["content"] + ), + "type": CompanyRelationshipEnum.PERSON, + } + ) + if "Organisation" in data["Beteiligter"]: + return CompanyToCompanyRelationship( + **{ # type: ignore + "role": RelationshipRoleEnum( + data["Rolle"]["Rollenbezeichnung"]["content"] + ), + "name": remove_traling_and_leading_quotes( + data["Beteiligter"]["Organisation"]["Bezeichnung"][ + "Bezeichnung_Aktuell" + ] + ), + "location": Location( + **{ + "city": data["Beteiligter"]["Organisation"]["Anschrift"]["Ort"], + "street": data["Beteiligter"]["Organisation"]["Anschrift"][ + "Strasse" + ] + if "Strasse" in data["Beteiligter"]["Organisation"]["Anschrift"] + else None, + "house_number": data["Beteiligter"]["Organisation"][ + "Anschrift" + ]["Hausnummer"] + if "Hausnummer" + in data["Beteiligter"]["Organisation"]["Anschrift"] + else None, + "zip_code": data["Beteiligter"]["Organisation"]["Anschrift"][ + "Postleitzahl" + ] + if "Postleitzahl" + in data["Beteiligter"]["Organisation"]["Anschrift"] + else None, + } + ), + "type": CompanyRelationshipEnum.COMPANY, + } + ) + return None + + +def normalize_street(street: str) -> str: + """Normalize street names by extending them to `Straße` or `straße`. + + Args: + street (str): Name of street + + Returns: + str: Normalized street name + """ + if street is None: + return None + regex = r"(Str\.|Strasse)" + street = re.sub(regex, "Straße", street) + regex = r"(str\.|strasse)" + street = re.sub(regex, "straße", street) + return street.strip() + + +def loc_from_beteiligung(data: dict) -> Location: + """Extract the company location from the first relationship in the export. + + Args: + data (dict): Data export + + Returns: + Location: location + """ + base = data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"]["Beteiligung"][0][ + "Beteiligter" + ]["Organisation"]["Anschrift"] + + house_number = None + street = None + if "Strasse" in base: + regex = r".(\d+)$" + hits = re.findall(regex, base["Strasse"]) + if len(hits) == 1: + house_number = hits[0] + street = base["Strasse"][: (-1 * len(house_number))] + if "Hausnummer" in base: + house_number = house_number + base["Hausnummer"] + else: + if "Hausnummer" in base: + house_number = base["Hausnummer"] + street = base["Strasse"] + return Location( + **{ + "city": base["Ort"], + "zip_code": base["Postleitzahl"], + "street": normalize_street(street), # type: ignore + "house_number": house_number, + } + ) + + +def name_from_beteiligung(data: dict) -> str: + """Extract the Company name from an Unternehmensregister export by using the first relationship found. + + Args: + data (dict): Data export + + Returns: + str: Company name + """ + name = data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"]["Beteiligung"][0][ + "Beteiligter" + ]["Organisation"]["Bezeichnung"]["Bezeichnung_Aktuell"] + return remove_traling_and_leading_quotes(name) + + +def map_rechtsform(company_name: str, data: dict) -> CompanyTypeEnum | None: + """Extracts the company type from a given Unternehmensregister export. + + Args: + company_name (str): Name of the company as a fallback solution + data (dict): Data export + + Returns: + CompanyTypeEnum | None: Company type if found + """ + try: + return CompanyTypeEnum( + data["XJustiz_Daten"]["Fachdaten_Register"]["Basisdaten_Register"][ + "Rechtstraeger" + ]["Rechtsform"]["content"] + ) + except KeyError: + if ( + company_name.endswith("GmbH") + or company_name.endswith("UG") + or company_name.endswith("UG (haftungsbeschränkt)") + ): + return CompanyTypeEnum("Gesellschaft mit beschränkter Haftung") + if company_name.endswith("SE"): + return CompanyTypeEnum("Europäische Aktiengesellschaft (SE)") + if company_name.endswith("KG"): + return CompanyTypeEnum("Kommanditgesellschaft") + return None + + +def map_capital(data: dict, company_type: CompanyTypeEnum) -> Capital | None: + """Extracts the company capital from the given Unternehmensregister export. + + Args: + data (dict): Data export + company_type (CompanyTypeEnum): Type of company (e.g., 'Gesellschaft mit beschränkter Haftung') + + Returns: + Capital | None: Company Capital if found + """ + # Early return + if "Zusatzangaben" not in data["XJustiz_Daten"]["Fachdaten_Register"]: + return None + capital: dict = {"Zahl": 0.0, "Waehrung": ""} + if company_type == CompanyTypeEnum.KG: + capital_type = "Hafteinlage" + base = data["XJustiz_Daten"]["Fachdaten_Register"]["Zusatzangaben"][ + "Personengesellschaft" + ]["Zusatz_KG"]["Daten_Kommanditist"] + if isinstance(base, list): + for entry in base: + # TODO link to persons using Ref_Rollennummer then extract ["Hafteinlage"] as below + capital["Zahl"] = capital["Zahl"] + float(entry["Hafteinlage"]["Zahl"]) + capital["Waehrung"] = entry["Hafteinlage"]["Waehrung"] + elif isinstance(base, dict): + capital = base["Hafteinlage"] + elif company_type in [ + CompanyTypeEnum.GMBH, + CompanyTypeEnum.SE, + CompanyTypeEnum.AG, + CompanyTypeEnum.KGaA, + CompanyTypeEnum.AUSLAENDISCHE_RECHTSFORM, + CompanyTypeEnum.OHG, + ]: + if ( + "Kapitalgesellschaft" + not in data["XJustiz_Daten"]["Fachdaten_Register"]["Zusatzangaben"] + ): + base = data["XJustiz_Daten"]["Fachdaten_Register"]["Zusatzangaben"][ + "Personengesellschaft" + ] + else: + base = data["XJustiz_Daten"]["Fachdaten_Register"]["Zusatzangaben"][ + "Kapitalgesellschaft" + ] + if "Zusatz_GmbH" in base: + capital_type = "Stammkapital" + capital = base["Zusatz_GmbH"]["Stammkapital"] + elif "Zusatz_Aktiengesellschaft" in base: + capital_type = "Grundkapital" + capital = base["Zusatz_Aktiengesellschaft"]["Grundkapital"]["Hoehe"] + elif company_type in [ + CompanyTypeEnum.EINZELKAUFMANN, + CompanyTypeEnum.EG, + CompanyTypeEnum.PARTNERSCHAFT, + CompanyTypeEnum.PARTNERGESELLSCHAFT, + CompanyTypeEnum.PARTNERSCHAFTSGESELLSCHAFT, + None, + ]: + return None + # Catch entries having the dict but with null values + if not all(capital.values()): + return None + return Capital( + **{ # type: ignore + "value": float(capital["Zahl"]), + "currency": CurrencyEnum(capital["Waehrung"]), + "type": CapitalTypeEnum(capital_type), + } + ) + + +def map_business_purpose(data: dict) -> str | None: + """Extracts the "Geschäftszweck" from a given Unternehmensregister export. + + Args: + data (dict): Data export + + Returns: + str | None: Business purpose if found + """ + try: + return data["XJustiz_Daten"]["Fachdaten_Register"]["Basisdaten_Register"][ + "Gegenstand_oder_Geschaeftszweck" + ] + except KeyError: + return None + + +def extract_date_from_string(value: str) -> str | None: + """Extract a date in ISO format from the given string if possible. + + Args: + value (str): Input text + + Returns: + str | None: Date in ISO format, None if not found + """ + date_regex = [ # type: ignore + {"regex": r"\d{1,2}\.\d{1,2}\.\d{4}", "mapper": transform_date_to_iso}, + {"regex": r"\d{4}-\d{1,2}-\d{1,2}", "mapper": None}, + ] + results = [] + for regex in date_regex: + result = re.findall(regex["regex"], value) # type: ignore + if len(result) == 1: + relevant_data = result[0] + if regex["mapper"] is not None: # type: ignore + results.append(regex["mapper"](relevant_data)) # type: ignore + else: + results.append(relevant_data) + if len(results) != 1: + return None + return results[0] + + +def map_founding_date(data: dict) -> str | None: + """Extracts the founding date from a given Unternehmensregister export. + + Args: + data (dict): Data export + + Returns: + str | None: Founding date if found + """ + text = str(data) + entry_date = re.findall( + r".Tag der ersten Eintragung:(\\n| )?(\d{1,2}\.\d{1,2}\.\d{2,4})", text + ) + if len(entry_date) == 1: + return transform_date_to_iso(entry_date[0][1]) + + entry_date = re.findall( + r".Gesellschaftsvertrag vom (\d{1,2}\.\d{1,2}\.\d{2,4})", text + ) + if len(entry_date) == 1: + return transform_date_to_iso(entry_date[0]) + if ( + "Gruendungsmetadaten" + in data["XJustiz_Daten"]["Fachdaten_Register"]["Basisdaten_Register"] + ): + return extract_date_from_string( + data["XJustiz_Daten"]["Fachdaten_Register"]["Basisdaten_Register"][ + "Gruendungsmetadaten" + ]["Gruendungsdatum"] + ) + # No reliable answer + return None + + +def map_company_id(data: dict) -> CompanyID: + """Retrieve Company ID from export. + + Args: + data (dict): Data export + + Returns: + CompanyID: ID of the company + """ + return CompanyID( + **{ + "hr_number": data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"][ + "Instanzdaten" + ]["Aktenzeichen"], + "district_court": DistrictCourt( + **{ + "name": data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"][ + "Beteiligung" + ][1]["Beteiligter"]["Organisation"]["Bezeichnung"][ + "Bezeichnung_Aktuell" + ] + if "Organisation" + in data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"][ + "Beteiligung" + ][1]["Beteiligter"] + else data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"][ + "Beteiligung" + ][1]["Beteiligter"]["Natuerliche_Person"]["Voller_Name"][ + "Nachname" + ], + "city": data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"][ + "Beteiligung" + ][1]["Beteiligter"]["Organisation"]["Sitz"]["Ort"] + if "Organisation" + in data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"][ + "Beteiligung" + ][1]["Beteiligter"] + else data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"][ + "Beteiligung" + ][1]["Beteiligter"]["Natuerliche_Person"]["Anschrift"]["Ort"], + } + ), + } + ) + + +def map_last_update(data: dict) -> str: + """Extract last update date from export. + + Args: + data (dict): Unternehmensregister export + + Returns: + str: Last update date + """ + return data["XJustiz_Daten"]["Fachdaten_Register"]["Auszug"]["letzte_Eintragung"] + + +def map_co_relation(data: dict) -> dict: + """Search for and map the c/o relation from location.street if possible. + + Args: + data (dict): Company dict + + Returns: + dict: Modified Company dict + """ + street = data["location"].street + if street is None: + return data + parts = street.split(",") + co_company = None + co_company_index = None + for index, part in enumerate(parts): + trimmed_part = part.strip() + result = re.findall(r"^c\/o(.*)$", trimmed_part) + if len(result) == 1: + co_company = result[0].strip() + co_company_index = index + if co_company_index is not None: + del parts[co_company_index] + street = "".join(parts).strip() + data["location"].street = street + + if co_company is not None and co_company != "": + relation = CompanyToCompanyRelationship( + RelationshipRoleEnum.CARE_OF, # type: ignore + Location( + data["location"].city, + street, + data["location"].house_number, + data["location"].zip_code, + ), + CompanyRelationshipEnum.COMPANY, # type: ignore + co_company, + ) + data["relationships"].append(relation) + return data + + +def map_unternehmensregister_json(data: dict) -> Company: + """Processes the Unternehmensregister structured export to a Company by using several helper methods. + + Args: + data (dict): Data export + + Returns: + Company: Transformed data + """ + result: dict = {"relationships": []} + + # TODO Refactor mapping - this is a nightmare... + result["id"] = map_company_id(data) + result["name"] = name_from_beteiligung(data) + + result["location"] = loc_from_beteiligung(data) + result["last_update"] = map_last_update(data) + + result["company_type"] = map_rechtsform(result["name"], data) + result["capital"] = map_capital(data, result["company_type"]) + result["business_purpose"] = map_business_purpose(data) + result["founding_date"] = map_founding_date(data) + + for i in range( + 2, len(data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"]["Beteiligung"]) + ): + people = parse_stakeholder( + data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"]["Beteiligung"][i] + ) + result["relationships"].append(people) + result = map_co_relation(result) + return Company(**result) + + +if __name__ == "__main__": + from loguru import logger + + base_path = "./Jupyter/API-tests/Unternehmensregister/data/Unternehmensregister" + for file in tqdm(glob.glob1(f"{base_path}/export", "*.json")): + path = os.path.join(f"{base_path}/export", file) + with open(path, encoding="utf-8") as file_object: + try: + company: Company = map_unternehmensregister_json( + json.loads(file_object.read()) + ) + + name = "".join(e for e in company.name if e.isalnum())[:50] + + with open( + f"{base_path}/transformed/{name}.json", + "w+", + encoding="utf-8", + ) as export_file: + json.dump( + dataclasses.asdict(company), export_file, ensure_ascii=False + ) + except Exception as e: + logger.error(e) + logger.error(f"Error in processing {path}") + sys.exit(1) \ No newline at end of file diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/__init__.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/assets/xjustiz_0040_cl_rollenbezeichnung_3_3.xsd b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/assets/xjustiz_0040_cl_rollenbezeichnung_3_3.xsd new file mode 100644 index 0000000..429985d --- /dev/null +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/assets/xjustiz_0040_cl_rollenbezeichnung_3_3.xsd @@ -0,0 +1,2714 @@ + + + + + + Strukturierte Fachdaten für die Kommunikation im elektronischen Rechtsverkehr + XJustiz + xjustiz + urn:xoev-de:blk-ag-it-standards:standard:xjustiz + XJustiz ist der bundesweit einheitliche Standard für den Austausch strukturierter elektronischer Informationen mit der Justiz. + + + 3.3.1 + XJustiz beschreibt ein standardisiertes Datenaustauschformat für die elektronische Kommunikation innerhalb und mit der Justizverwaltung. + 1.7.1 + 2.3.0 + 3.0.1 + 19.0 SP3 + MagicDraw + + + + + + + + + GDS.Rollenbezeichnung + GDS.Rollenbezeichnung + gds.rollenbezeichnung + urn:xoev-de:xjustiz:codeliste:gds.rollenbezeichnung + Codeliste der verschiedenen Rollenbezeichnungen. + BLK-AG IT-Standards in der Justiz + AG IT-Standards + + + 3.3 + 1.1 + + + + Schlüssel + string + true + required + true + + + Wert + string + false + required + false + + + Aufgeführte Fachmodule nutzen ausschließlich die für sie gekennzeichneten Werte + string + false + optional + false + + + code + + + + + + + + + + + + + + + + + GDS.Rollenbezeichnung + GDS.Rollenbezeichnung + gds.rollenbezeichnung + urn:xoev-de:xjustiz:codeliste:gds.rollenbezeichnung + Codeliste der verschiedenen Rollenbezeichnungen. + BLK-AG IT-Standards in der Justiz + AG IT-Standards + + + 3.3 + 1.1 + + + + Schlüssel + string + true + required + true + + + Wert + string + false + required + false + + + Aufgeführte Fachmodule nutzen ausschließlich die für sie gekennzeichneten Werte + string + false + optional + false + + + + + + + + + Abwesenheitspfleger(in) + INSO + + + + + + + Aliasidentität + STRAF + + + + + + + Angehörige(r) + STRAF + + + + + + + Angeklagte(r) + STRAF + + + + + + + Angeschuldigte(r) + STRAF + + + + + + + Annehmende(r) + + + + + + + Anschlussberufungsbeklagte(r) + + + + + + + Anschlussberufungskläger(in) + + + + + + + Anschlussbeschwerdeführer(in) + + + + + + + Anschlussbeschwerdegegner(in) + + + + + + + Anschlussrechtsbeschwerdeführer(in) + + + + + + + Anschlussrechtsbeschwerdegegner(in) + + + + + + + Anschlussrevisionsbeklagte(r) + + + + + + + Anschlussrevisionskläger(in) + + + + + + + Antragsgegner(in) + INSO,VAG,ZSSR,STRAF + + + + + + + Antragsteller(in) + INSO,VAG,ZSSR,STRAF + + + + + + + Anzeigeerstatter(in) + STRAF + + + + + + + Anzunehmende(r) + + + + + + + Arrestgläubiger(in) + + + + + + + Arrestschuldner(in) + + + + + + + Aufsichtsbehörde + + + + + + + Ausschlagende(r) + + + + + + + Beamter (Beamtin) + + + + + + + Behörde + INSO,STRAF + + + + + + + Beigeladene(r) + + + + + + + Beistand + INSO,STRAF + + + + + + + Bekannte(r) + + + + + + + Beklagte(r) + STRAF + + + + + + + Berufungsbeklagte(r) + + + + + + + Berufungskläger(in) + + + + + + + Beschuldigte(r) + STRAF + + + + + + + Beschwerdeführer(in) + STRAF + + + + + + + Beschwerdegegner(in) + STRAF + + + + + + + Besucher(in) + + + + + + + Betreibende(r) Gläubige(r) + + + + + + + Betreuer(in) + INSO,STRAF + + + + + + + Betreute(r) + INSO,STRAF + + + + + + + Betreuungsbehörde + + + + + + + Betroffene(r) + EHUG,STRAF + + + + + + + Bevollmächtigte(r) + EHUG,INSO + + + + + + + Bewährungshelfer(in) + STRAF + + + + + + + Beweisanwalt (-anwältin) + + + + + + + Bruder (Schwester) + + + + + + + Bundeswehrdisziplinaranwalt (-anwältin) + + + + + + + Bußgeldempfänger(in) + STRAF + + + + + + + Cousin(e) + + + + + + + Dienstvorgesetzte(r) + + + + + + + director + INSO + + + + + + + Dolmetscher(in) + STRAF + + + + + + + Dritte(r) + + + + + + + Drittschuldner(in) + INSO,STRAF + + + + + + + Drittwiderbeklagte(r) + + + + + + + Drittwiderkläger(in) + + + + + + + Ehemann (Ehefrau) + + + + + + + Eigentümer(in) + + + + + + + Eingetragene(r) Lebenspartner(in) + STRAF + + + + + + + Einleitungsbehörde + STRAF + + + + + + + Eltern + INSO + + + + + + + Elternteil + INSO + + + + + + + Enkel(in) + + + + + + + Erbe (Erbin) + INSO + + + + + + + Erbe (Erbin) (ausschlagend) + + + + + + + Erbe (Erbin) (vorverstorben) + + + + + + + Erblasser(in) + INSO + + + + + + + Ergänzungspfleger(in) + INSO,STRAF + + + + + + + Erinnerungsführer(in) + INSO + + + + + + + Erinnerungsgegner(in) + INSO + + + + + + + Ersatzbetreuer(in) + INSO + + + + + + + Ersteher(in) + + + + + + + Erwerber(in) + + + + + + + Erziehungsberechtigte(r) + INSO,STRAF + + + + + + + Frühere(r) Ehegatte (Ehegattin) + + + + + + + Frühere(r) Beklagte(r) + + + + + + + Frühere(r) Beteiligte(r) + + + + + + + Frühere(r) Gläubiger(in) + + + + + + + Frühere(r) Kläger(in) + + + + + + + Frühere(r) Soldat(in) + + + + + + + Gegenvormund + + + + + + + Generalbundesanwalt (-anwältin) + STRAF + + + + + + + Gericht + STRAF + + + + + + + Gerichtsvollzieher(in) + ZPO,STRAF + + + + + + + Geschädigte(r) + STRAF + + + + + + + Geschäftsführende(r) Gesellschafter(in) + INSO,STRAF + + + + + + + Geschäftsführer(in) + INSO,STRAF + + + + + + + Gesetzliche(r) Erbe (Erbin) + + + + + + + Gesetzliche(r) Vertreter(in) + EHUG,INSO,ZSSR,STRAF + + + + + + + Gläubiger(in) + EZOLL,INSO,ZPO + + + + + + + Großeltern + + + + + + + Großvater (Großmutter) + + + + + + + Hauptbevollmächtigte(r) + EHUG,INSO + + + + + + + Hoferbe (Hoferbin) + + + + + + + Inhaber(in) der Firma + + + + + + + Insolvenzverwalter(in) + INSO,STRAF + + + + + + + Jugendamt + + + + + + + Kammer + + + + + + + Kammermitglied + + + + + + + Kind + + + + + + + Kläger(in) + STRAF + + + + + + + Kontrollbetreuer(in) + + + + + + + Korrespondenzanwalt (-anwältin) + + + + + + + Kostenschuldner(in) + STRAF + + + + + + + Landwirtschaftsrichter(in) + + + + + + + Lebenspartner(in) + + + + + + + Liquidator(in) + INSO + + + + + + + Minderjährige(r) + + + + + + + Mitvormund + + + + + + + Mündel + + + + + + + Nachbesserungsgläubiger(in) + + + + + + + Nachlasspfleger(in) + INSO + + + + + + + Nachlassverwalter(in) + INSO + + + + + + + Nebenkläger(in) + STRAF + + + + + + + Neffe (Nichte) + + + + + + + Nicht verwandt + + + + + + + Onkel (Tante) + + + + + + + Opfer + STRAF + + + + + + + Pächter(in) + + + + + + + Pflegeeltern + + + + + + + Pfleger(in) + INSO + + + + + + + Pfleger(in) für das Sammelvermögen + + + + + + + Pfleger(in) für die Leibesfrucht + + + + + + + Pflegevater (Pflegemutter) des Mündels + + + + + + + Pflegling + + + + + + + Pflichtverteidiger(in) + STRAF + + + + + + + Polizei + STRAF + + + + + + + Privatbeklagte(r) + + + + + + + Privatkläger(in) + + + + + + + Prozessbevollmächtigte(r) + EHUG,INSO,ZSSR,STRAF + + + + + + + Prozesskostenhilfe-Anwalt (-Anwältin) + INSO + + + + + + + Prozesskostenhilfe-Korrespondenzanwalt (-anwältin) + INSO + + + + + + + Rechtsanwalt (-anwältin) + INSO,STRAF + + + + + + + Rechtsbeistand + INSO,STRAF + + + + + + + Rechtsbeschwerdeführer(in) + + + + + + + Rechtsbeschwerdegegner(in) + + + + + + + Revisionsbeklagte(r) + + + + + + + Revisionskläger(in) + + + + + + + Sachbearbeiter(in) + INSO + + + + + + + Sachverständige(r) + INSO,STRAF + + + + + + + Schuldner(in) + EZOLL,INSO,VSTR,ZPO + + + + + + + Schwager (Schwägerin) + + + + + + + Schwiegersohn (Schwiegertochter) + + + + + + + Schwiegervater (Schwiegermutter) + + + + + + + Sohn (Tochter) + + + + + + + Soldat(in) + + + + + + + Sonstige(r) Beteiligte(r) + + + + + + + Sonstige(r) Vertreter(in) + EHUG,INSO + + + + + + + Staatsanwaltschaft + STRAF + + + + + + + Stiefeltern + + + + + + + Stiefvater (Stiefmutter) + + + + + + + Streithelfer(in) Beklagte(r) + + + + + + + Streithelfer(in) Kläger(in) + + + + + + + Streitverkündete(r) Beklagte(r) + + + + + + + Streitverkündete(r) Kläger(in) + + + + + + + Terminsbevollmächtigte(r) + + + + + + + Testamentsvollstrecker(in) + + + + + + + Testator(in) + + + + + + + Übernehmer(in) + + + + + + + Unterbevollmächtigte(r) + EHUG,INSO + + + + + + + Ur-Enkel(in) + + + + + + + Vater (Mutter) + INSO + + + + + + + Veräußerer (Veräußerin) + + + + + + + Verfahrensbevollmächtigte(r) + EHUG,INSO + + + + + + + Verfahrenskostenhilfe-Anwalt(-Anwältin) + INSO + + + + + + + Verfahrenskostenhilfe-Korrespondenzanwalt (-anwältin) + INSO + + + + + + + Verfahrenspfleger(in) + INSO + + + + + + + Verfahrensvertreter(in) (§787 ZPO) + EHUG,INSO + + + + + + + Verfügungsbeklagte(r) + + + + + + + Verfügungskläger(in) + + + + + + + Verkehrsanwalt (-anwältin) + + + + + + + Verlobte(r) + + + + + + + Vermächtnisnehmer(in) + + + + + + + Vermieter(in) + + + + + + + Verpächter(in) + + + + + + + Versorgungsträger(in) + VAG + + + + + + + Verteidiger(in) + + + + + + + Vertreter(in) der Interessen des Ausgleichsfonds + + + + + + + Vertreter(in) der Staatskasse + + + + + + + Vertreter(in) des Bundesinteresses beim Bundesverwaltungsgericht + + + + + + + Vertreter(in) des öffentlichen Interesses + + + + + + + Verurteilte(r) + STRAF + + + + + + + Verwalter(in) der Wohnungseigentümergemeinschaft + INSO + + + + + + + Verwaltungsbehörde + STRAF + + + + + + + Vollstreckungsgläubiger(in) + INSO + + + + + + + Vollstreckungsschuldner(in) + INSO + + + + + + + Vorläufige(r) Betreuer(in) + INSO + + + + + + + Vormund + INSO + + + + + + + Vorstand + INSO + + + + + + + Vorsorgebevollmächtigte(r) + + + + + + + Wahlverteidiger(in) + STRAF + + + + + + + Widerbeklagte(r) + + + + + + + Widerkläger(in) + + + + + + + Wiederaufnahmebeklagte(r) + + + + + + + Wiederaufnahmekläger(in) + + + + + + + Zahlungs- und Auflagenempfänger(in) + + + + + + + Zeuge (Zeugin) + INSO,STRAF + + + + + + + Zeugenbeistand + + + + + + + Zulassungsantragsgegner(in) + + + + + + + Zulassungsantragsteller(in) + + + + + + + Zustellungsbevollmächtigte(r) + EHUG,INSO,STRAF + + + + + + + Zustellungsvertreter(in) (§6 ZVG) + EHUG + + + + + + + Notar(in) + + + + + + + Auskunftsempfänger(in) + + + + + + + Melder(in) + + + + + + + Verwahrstelle + + + + + + + Aussteller(in) + + + + + + + Berechtigte(r) + + + + + + + Berechtigte(r) an einem Recht + + + + + + + Einreicher(in) + + + + + + + Erbbauberechtigte(r) + + + + + + + Finanzamt + + + + + + + Grundbuchvertreter(in) + + + + + + + Insolvenzgericht + + + + + + + Mitteilungsempfänger(in) + + + + + + + Nacherbe (Nacherbin) + + + + + + + Rechnungsempfänger(in) + + + + + + + Veranlasser(in) + + + + + + + Versteigerungsabteilung + + + + + + + Vertretungsberechtigte(r) + EHUG + + + + + + + Zweitschuldner(in) + INSO + + + + + + + Vertreter(in) + EHUG,INSO + + + + + + + Arbeitgeber(in) + EZOLL,ZPO + + + + + + + RV-Träger(in) + EZOLL,ZPO + + + + + + + Vollstreckungsstelle + EZOLL + + + + + + + Abkömmling + + + + + + + Kreditnehmer(in) + + + + + + + Neu vorzutragende(r) Eigentümer(in) + + + + + + + Notariatsverwalter(in) + + + + + + + Notarvertreter(in) + + + + + + + Partei kraft Amtes + INSO + + + + + + + Sequester + INSO + + + + + + + Treuhänder(in) + INSO + + + + + + + Zustimmende(r) + + + + + + + Gläubigervertreter(in) + INSO + + + + + + + Schuldnervertreter(in) + + + + + + + Zahlungsempfänger(in) + + + + + + + Anteilsinhaber(in) + INSO + + + + + + + Antragsteller(in) -Eröffnung + INSO + + + + + + + Debitor(in) + INSO + + + + + + + Gesellschafter(in) + INSO + + + + + + + Handlungsbevollmächtigte(r) + INSO + + + + + + + Mitglied einer Gesamt-Anteilsinhaberschaft + INSO + + + + + + + Mitglied einer Gläubigergemeinschaft + INSO + + + + + + + Mitreeder(in) + INSO + + + + + + + Partner(in) + INSO + + + + + + + Persönlich haftende(r) Gesellschafter(in) + INSO + + + + + + + Prozesspfleger(in) + + + + + + + Sachwalter(in) + INSO + + + + + + + Treuhänder(in) (Wohlverhaltensperiode) + INSO + + + + + + + Vermögensträger(in) + INSO + + + + + + + Vorläufige(r) Insolvenzverwalter(in) + INSO + + + + + + + Vorläufige(r) Treuhänder(in) + INSO + + + + + + + Sondersachwalter(in) + INSO + + + + + + + Sonderinsolvenzverwalter(in) + INSO + + + + + + + Vorläufige(r) Sachwalter(in) + INSO + + + + + + + Abwickler(in) + INSO + + + + + + + Übernehmender Rechtsträger + + + + + + + Aufsichtsrat (-rätin) + + + + + + + Besondere(r) Vertreter(in) nach § 30 BGB + EHUG + + + + + + + Betriebsleiter(in) + + + + + + + Empfangsberechtigte(r) + + + + + + + Geschäftsführende(r) Direktor(in) + + + + + + + Geschäftsleiter(in) + + + + + + + Gründer(in) + + + + + + + Gründungsprüfer(in) + + + + + + + Hauptniederlassung + + + + + + + Inhaber(in) + + + + + + + Kommanditist(in) + INSO + + + + + + + Konkursverwalter(in) + + + + + + + Mitglied des Leitungsorgans + + + + + + + Mitglied EWIV + + + + + + + Nachgründungsprüfer(in) + + + + + + + Nachtragsabwickler(in) + + + + + + + Nachtragsliquidator(in) + INSO + + + + + + + Notgeschäftsführer(in) + INSO + + + + + + + Notliquidator(in) + INSO + + + + + + + Notvorstand + INSO + + + + + + + Prokurist(in) + + + + + + + Rechtsträger(in) + + + + + + + Registergericht + + + + + + + Sacheinlagenprüfer(in) + + + + + + + Ständige(r) Vertreter(in) für die Zweigniederlassung + INSO + + + + + + + Übertragender Rechtsträger + + + + + + + Vergleichsverwalter(in) + + + + + + + Verwaltungsrat (-rätin) + + + + + + + Vorstandsvorsitzende(r) + + + + + + + Zweigniederlassung + + + + + + + Vertreter(in) des Klägers/der Klägerin + + + + + + + Vertreter(in) des/der Beklagten + + + + + + + Bewährungshilfe + STRAF + + + + + + + Gerichtshilfe + STRAF + + + + + + + Justizvollzug + STRAF + + + + + + + Pseudoname + STRAF + + + + + + + Gesetzliche(r) Vertreter(in) des Gläubigers/der Gläubigerin + + + + + + + Gesetzliche(r) Vertreter(in) des Schuldners/der Schuldnerin + + + + + + + Bevollmächtigte(r) des Gläubigers/der Gläubigerin + + + + + + + Bevollmächtigte(r) des Schuldners/der Schuldnerin + + + + + + + Fahrzeughalter(in) + STRAF + + + + + + + Frühere(r) Lebenspartner(in) + + + + + + + Sicherungsverwalter(in) + + + + + + + Zwangsverwalter(in) + + + + + + + Mieter(in) + + + + + + + Bürge (Bürgin) + + + + + + + Meistbietende(r) + + + + + + + Abschlussprüfer(in) + + + + + + + Antragstellervertreter(in) + + + + + + + Aufsichtsratsvorsitzende(r) + + + + + + + Berufskammer + + + + + + + Betroffenenvertreter(in) + + + + + + + Bürgermeister(in) + + + + + + + Eingliederungsbeteiligte(r) + + + + + + + Formwechselnder Rechtsträger + + + + + + + Gewerbeamt + + + + + + + Inhaber(in) (nicht eingetragen) + + + + + + + Kostenempfänger(in) + + + + + + + Nachlassgericht + + + + + + + Sonderprüfer(in) + + + + + + + Sonstige(r) gerichtlich bestellte(r) Vertreter(in) + + + + + + + Sonstige(r) gesetzliche(r) Vertreter(in) BGB + + + + + + + Sonstige(r) organschaftliche(r) Vertreter(in) HRB + + + + + + + Standardkostenschuldner(in) + + + + + + + Übernahmeschuldner(in) + + + + + + + Unternehmensvertragsbeteiligte(r) + + + + + + + Vertreter(in) des persönlich haftenden Gesellschafters + + + + + + + Werkleiter(in) + + + + + + + Mehrfachsitz + + + + + + + Mitglied VR + + + + + + + Mitglied e. BGB-Gesellschaft als Abwicklerin + + + + + + + Mitglied e. BGB-Gesellschaft als ges. Vertreterin + + + + + + + Mitglied e. BGB-Gesellschaft als Kommanditistin + + + + + + + Mitglied e. BGB-Gesellschaft als Liquidatorin + + + + + + + Mitglied e. BGB-Gesellschaft als Mitglied e. EWIV + + + + + + + Mitglied e. BGB-Gesellschaft als phG + + + + + + + Mitglied e. Erbengemeinschaft als ges. Vertreterin + + + + + + + Mitglied e. Erbengemeinschaft als Inhaberin + + + + + + + Mitglied e. Erbengemeinschaft als Kommanditistin + + + + + + + Mitglied e. Erbengemeinschaft als Mitglied e. EWIV + + + + + + + Nebensitz + + + + + + + Gesamthandsgemeinschaft + + + + + + + Mitglied einer Gesamthandsgemeinschaft + + + + + + + Leitungsperson i.S.v. § 30 Abs. 1 Nr. 1 bis 5 OWiG + + + + + + + Einziehungsbeteiligter + STRAF + + + + + + + Antragsgegnervertreter(in) + + + + + + + Verbraucher(in) + + + + + + + Vielmelder(in) + + + + + + + Vollmachtgeber(in) + + + + + + + Nebenbeteiligte(r) § 444 StPO + STRAF + + + + + + + Verfallsbeteiligte(r) § 442 StPO a.F. + STRAF + + + + + + + Verfolgte(r) § 34 IRG + STRAF + + + + + + + Rechtsnachfolger(in) + STRAF + + + + + + + Statuswechselnde(r) Rechtsträger(in) + + + + + + + Haftangehörige + STRAF + + + + + + diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/role_mapper.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/role_mapper.py new file mode 100644 index 0000000..81a24a8 --- /dev/null +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/role_mapper.py @@ -0,0 +1,34 @@ +import os +import xmltodict + +from pathlib import Path +from aki_prj23_transparenzregister.models.company import RelationshipRoleEnum + + +class RoleMapper: + singleton = None + def __init__(self): + # TODO Automated file retrieval + base_path = os.path.dirname(Path(__file__)) + path = os.path.join(base_path, "assets", "xjustiz_0040_cl_rollenbezeichnung_3_3.xsd") + with open(path, encoding="utf-8") as file: + content = file.read() + data = xmltodict.parse(content) + + mapping = {} + for entry in data["xs:schema"]["xs:simpleType"]["xs:restriction"]["xs:enumeration"]: + mapping[entry['@value']] = entry['xs:annotation']['xs:appinfo']['wert'] + self.dictionary = mapping + + @staticmethod + def mapper(): + if RoleMapper.singleton is None: + RoleMapper.singleton = RoleMapper() + return RoleMapper.singleton + + def get(self, key: str) -> RelationshipRoleEnum: + return RelationshipRoleEnum(self.dictionary[key]) + +if __name__ == '__main__': + mapper = RoleMapper() + print(mapper.get("201")) \ No newline at end of file diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/v3.py similarity index 69% rename from src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py rename to src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/v3.py index 2e64e3c..61c9371 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/v3.py @@ -30,28 +30,9 @@ from aki_prj23_transparenzregister.utils.string_tools import ( transform_date_to_iso, ) - -def transform_xml_to_json(source_dir: str, target_dir: str) -> None: - """Convert all xml files in a directory to json files. - - Args: - source_dir (str): Directory hosting the xml files - target_dir (str): Target directory to move json files to - """ - if not os.path.exists(target_dir): - os.makedirs(target_dir) - for source_path in [ - os.path.normpath(i) for i in glob.glob(source_dir + "**/*.xml", recursive=True) - ]: - target_path = os.path.join( - target_dir, source_path.split(os.sep)[-1].replace(".xml", ".json") - ) - - with open(source_path, encoding="utf-8") as source_file: - # deepcode ignore HandleUnicode: Weird XML format no other solution - data = xmltodict.parse(source_file.read().encode()) - with open(target_path, "w", encoding="utf-8") as json_file: - json_file.write(json.dumps(data)) +from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v3.role_mapper import ( + RoleMapper, +) def parse_date_of_birth(data: dict) -> str | None: @@ -63,22 +44,20 @@ def parse_date_of_birth(data: dict) -> str | None: Returns: str | None: date of birth or None if not found """ - if "tns:geburt" in (base := data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"]): + if "tns:geburt" in ( + base := data["tns:beteiligter"]["tns:auswahl_beteiligter"][ + "tns:natuerlichePerson" + ] + ): base = base["tns:geburt"]["tns:geburtsdatum"] if isinstance(base, str): return base return None + def map_role_id_to_enum(role_id: str) -> RelationshipRoleEnum: - match role_id: - case "086": - return RelationshipRoleEnum.GESCHAEFTSFUEHRER - case "285": - return RelationshipRoleEnum.PROKURIST - case "194": - return RelationshipRoleEnum.VORSTAND - case _: - raise KeyError(f'Uknown role_id: {role_id}') + mapper = RoleMapper.mapper() + return mapper.get(role_id) def parse_stakeholder(data: dict) -> CompanyRelationship | None: @@ -92,100 +71,120 @@ def parse_stakeholder(data: dict) -> CompanyRelationship | None: """ if "tns:natuerlichePerson" in data["tns:beteiligter"]["tns:auswahl_beteiligter"]: # It's a Company serving as a "Kommanditist" or similar - # if data["Beteiligter"]["Natuerliche_Person"]["Voller_Name"]["Vorname"] is None: - # return CompanyToCompanyRelationship( - # **{ # type: ignore - # "name": remove_traling_and_leading_quotes( - # data["Beteiligter"]["Natuerliche_Person"]["Voller_Name"][ - # "Nachname" - # ] - # ), - # "location": Location( - # **{ - # "city": data["Beteiligter"]["Natuerliche_Person"][ - # "Anschrift" - # ][-1]["Ort"] - # if isinstance( - # data["Beteiligter"]["Natuerliche_Person"]["Anschrift"], - # list, - # ) - # else data["Beteiligter"]["Natuerliche_Person"]["Anschrift"][ - # "Ort" - # ] - # } - # ), - # "role": RelationshipRoleEnum( - # data["Rolle"]["Rollenbezeichnung"]["content"] - # ), - # "type": CompanyRelationshipEnum.COMPANY, - # } - # ) + if ( + "tns:vorname" + not in data["tns:beteiligter"]["tns:auswahl_beteiligter"][ + "tns:natuerlichePerson" + ]["tns:vollerName"] + ): + return CompanyToCompanyRelationship( + **{ # type: ignore + "name": remove_traling_and_leading_quotes( + data["tns:beteiligter"]["tns:auswahl_beteiligter"][ + "tns:natuerlichePerson" + ]["tns:vollerName"]["tns:nachname"] + ), + "location": Location( + **{ + "city": data["tns:beteiligter"]["tns:auswahl_beteiligter"][ + "tns:natuerlichePerson" + ]["tns:anschrift"][-1]["tns:ort"] + if isinstance( + data["tns:beteiligter"]["tns:auswahl_beteiligter"][ + "tns:natuerlichePerson" + ]["tns:anschrift"], + list, + ) + else data["tns:beteiligter"]["tns:auswahl_beteiligter"][ + "tns:natuerlichePerson" + ]["tns:anschrift"]["tns:ort"] + } + ), + "role": map_role_id_to_enum( + data["tns:rolle"]["tns:rollenbezeichnung"]["code"] + ), + "type": CompanyRelationshipEnum.COMPANY, + } + ) return PersonToCompanyRelationship( **{ # type: ignore "name": PersonName( **{ - "firstname": data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"][ - "tns:vollerName" - ]["tns:vorname"], - "lastname": data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"][ - "tns:vollerName" - ]["tns:nachname"], + "firstname": data["tns:beteiligter"]["tns:auswahl_beteiligter"][ + "tns:natuerlichePerson" + ]["tns:vollerName"]["tns:vorname"], + "lastname": data["tns:beteiligter"]["tns:auswahl_beteiligter"][ + "tns:natuerlichePerson" + ]["tns:vollerName"]["tns:nachname"], } ), "date_of_birth": parse_date_of_birth(data), "location": Location( **{ - "city": data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"]["tns:anschrift"][ - -1 - ]["tns:ort"] + "city": data["tns:beteiligter"]["tns:auswahl_beteiligter"][ + "tns:natuerlichePerson" + ]["tns:anschrift"][-1]["tns:ort"] if isinstance( - data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"]["tns:anschrift"], list + data["tns:beteiligter"]["tns:auswahl_beteiligter"][ + "tns:natuerlichePerson" + ]["tns:anschrift"], + list, ) - else data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"]["tns:anschrift"][ - "tns:ort" - ] + else data["tns:beteiligter"]["tns:auswahl_beteiligter"][ + "tns:natuerlichePerson" + ]["tns:anschrift"]["tns:ort"] } ), - # TODO get role via ID "role": map_role_id_to_enum( data["tns:rolle"]["tns:rollenbezeichnung"]["code"] ), "type": CompanyRelationshipEnum.PERSON, } ) - if "Organisation" in data["Beteiligter"]: + if "tns:organisation" in data["tns:beteiligter"]["tns:auswahl_beteiligter"]: + base = data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:organisation"] + + location = None + if "tns:anschrift" in base: + location = Location( + **{ + "city": base["tns:anschrift"]["tns:ort"], + "street": base["tns:anschrift"]["tns:strasse"] + if "tns:strasse" in base["tns:anschrift"] + else None, + "house_number": base["tns:anschrift"]["tns:hausnummer"] + if "tns:hausnummer" in base["tns:anschrift"] + else None, + "zip_code": base["tns:anschrift"]["tns:postleitzahl"] + if "tns:potsleitzahl" in base["tns:anschrift"] + else None, + } + ) + else: + location = Location( + **{ + "city": base["tns:sitz"]["tns:ort"], + "street": base["tns:sitz"]["tns:strasse"] + if "tns:strasse" in base["tns:sitz"] + else None, + "house_number": base["tns:sitz"]["tns:hausnummer"] + if "tns:hausnummer" in base["tns:sitz"] + else None, + "zip_code": base["tns:sitz"]["tns:postleitzahl"] + if "tns:potsleitzahl" in base["tns:sitz"] + else None, + } + ) + return CompanyToCompanyRelationship( **{ # type: ignore - "role": RelationshipRoleEnum( - data["Rolle"]["Rollenbezeichnung"]["content"] + "role": map_role_id_to_enum( + data["tns:rolle"]["tns:rollenbezeichnung"]["code"] ), "name": remove_traling_and_leading_quotes( - data["Beteiligter"]["Organisation"]["Bezeichnung"][ - "Bezeichnung_Aktuell" - ] - ), - "location": Location( - **{ - "city": data["Beteiligter"]["Organisation"]["Anschrift"]["Ort"], - "street": data["Beteiligter"]["Organisation"]["Anschrift"][ - "Strasse" - ] - if "Strasse" in data["Beteiligter"]["Organisation"]["Anschrift"] - else None, - "house_number": data["Beteiligter"]["Organisation"][ - "Anschrift" - ]["Hausnummer"] - if "Hausnummer" - in data["Beteiligter"]["Organisation"]["Anschrift"] - else None, - "zip_code": data["Beteiligter"]["Organisation"]["Anschrift"][ - "Postleitzahl" - ] - if "Postleitzahl" - in data["Beteiligter"]["Organisation"]["Anschrift"] - else None, - } + base["tns:bezeichnung"]["tns:bezeichnung.aktuell"] ), + "location": location, "type": CompanyRelationshipEnum.COMPANY, } ) @@ -227,10 +226,16 @@ def loc_from_beteiligung(data: dict) -> Location: "tns:beteiligter", "tns:auswahl_beteiligter", "tns:organisation", - "tns:anschrift" + # "tns:anschrift", ] base = traversal(data, base_path) + if "tns:anschrift" in base: + base = base["tns:anschrift"] + else: + base = base["tns:sitz"] + if isinstance(base, list): + base = base[0] house_number = None street = None if "tns:strasse" in base: @@ -273,7 +278,7 @@ def name_from_beteiligung(data: dict) -> str: "tns:auswahl_beteiligter", "tns:organisation", "tns:bezeichnung", - "tns:bezeichnung.aktuell" + "tns:bezeichnung.aktuell", ] name = traversal(data, path) return remove_traling_and_leading_quotes(name) @@ -296,11 +301,9 @@ def map_rechtsform(company_name: str, data: dict) -> CompanyTypeEnum | None: "tns:rechtstraeger", "tns:angabenZurRechtsform", "tns:rechtsform", - "code" + "code", ] - return CompanyTypeEnum( - traversal(data, path) - ) + return CompanyTypeEnum(traversal(data, path)) except Exception: if ( company_name.endswith("GmbH") @@ -328,8 +331,8 @@ def map_capital(data: dict, company_type: CompanyTypeEnum) -> Capital | None: # Early return if "tns:auswahl_zusatzangaben" not in data["tns:fachdatenRegister"]: return None - capital: dict = {"Zahl": 0.0, "Waehrung": ""} - if company_type == CompanyTypeEnum.KG: + capital: dict = {"tns:zahl": 0.0, "tns:waehrung": {"code": None}} + if company_type == CompanyTypeEnum.KG and "tns:personengesellschaft" in data["tns:fachdatenRegister"]["tns:auswahl_zusatzangaben"]: capital_type = "Hafteinlage" base = data["tns:fachdatenRegister"]["tns:auswahl_zusatzangaben"][ "tns:personengesellschaft" @@ -337,10 +340,14 @@ def map_capital(data: dict, company_type: CompanyTypeEnum) -> Capital | None: if isinstance(base, list): for entry in base: # TODO link to persons using Ref_Rollennummer then extract ["Hafteinlage"] as below - capital["Zahl"] = capital["Zahl"] + float(entry["Hafteinlage"]["Zahl"]) - capital["Waehrung"] = entry["Hafteinlage"]["Waehrung"] + capital["tns:zahl"] = capital["tns:zahl"] + float( + entry["tns:hafteinlage"]["tns:zahl"] + ) + capital["tns:waehrung"]["code"] = entry["tns:hafteinlage"][ + "tns:waehrung" + ]["code"] elif isinstance(base, dict): - capital = base["Hafteinlage"] + capital = base["tns:hafteinlage"] elif company_type in [ CompanyTypeEnum.GMBH, CompanyTypeEnum.SE, @@ -365,7 +372,9 @@ def map_capital(data: dict, company_type: CompanyTypeEnum) -> Capital | None: capital = base["tns:zusatzGmbH"]["tns:stammkapital"] elif "tns:zusatzAktiengesellschaft" in base: capital_type = "Grundkapital" - capital = base["tns:zusatzAktiengesellschaft"]["tns:grundkapital"]["tns:hoehe"] + capital = base["tns:zusatzAktiengesellschaft"]["tns:grundkapital"][ + "tns:hoehe" + ] elif company_type in [ CompanyTypeEnum.EINZELKAUFMANN, CompanyTypeEnum.EG, @@ -397,11 +406,7 @@ def map_business_purpose(data: dict) -> str | None: str | None: Business purpose if found """ try: - path = [ - "tns:fachdatenRegister", - "tns:basisdatenRegister", - "tns:gegenstand" - ] + path = ["tns:fachdatenRegister", "tns:basisdatenRegister", "tns:gegenstand"] return traversal(data, path) except KeyError: return None @@ -455,20 +460,18 @@ def map_founding_date(data: dict) -> str | None: ) if len(entry_date) == 1: return transform_date_to_iso(entry_date[0]) - if ( - "tns:satzungsdatum" - in data["tns:fachdatenRegister"]["tns:basisdatenRegister"] - ): + if "tns:satzungsdatum" in data["tns:fachdatenRegister"]["tns:basisdatenRegister"]: path = [ "tns:fachdatenRegister", "tns:basisdatenRegister", "tns:satzungsdatum", - "tns:aktuellesSatzungsdatum" + "tns:aktuellesSatzungsdatum", ] return traversal(data, path) # No reliable answer return None + def traversal(data: dict, path: list[str | int]) -> any: current = data for key in path: @@ -484,15 +487,14 @@ def map_hr_number(data: dict) -> str: "tns:aktenzeichen" ]["tns:auswahl_aktenzeichen"] if "tns:aktenzeichen.strukturiert" in base: - hr_prefix = base["tns:aktenzeichen.strukturiert"]["tns:register"][ - "code" - ] + hr_prefix = base["tns:aktenzeichen.strukturiert"]["tns:register"]["code"] hr_number = base["tns:aktenzeichen.strukturiert"]["tns:laufendeNummer"] return f"{hr_prefix} {hr_number}" elif "tns:aktenzeichen.freitext" in base: return base["tns:aktenzeichen.freitext"] return hr_full + def map_district_court(data: dict) -> DistrictCourt: base_path = [ "tns:grunddaten", @@ -501,17 +503,11 @@ def map_district_court(data: dict) -> DistrictCourt: 1, "tns:beteiligter", "tns:auswahl_beteiligter", - "tns:organisation" - ] - path = [*base_path, - "tns:bezeichnung", - "tns:bezeichnung.aktuell" + "tns:organisation", ] + path = [*base_path, "tns:bezeichnung", "tns:bezeichnung.aktuell"] name = traversal(data, path) - path = [*base_path, - "tns:anschrift", - "tns:ort" - ] + path = [*base_path, "tns:anschrift", "tns:ort"] city = traversal(data, path) return DistrictCourt(name=name, city=city) @@ -525,12 +521,14 @@ def map_company_id(data: dict) -> CompanyID: Returns: CompanyID: ID of the company """ - return CompanyID( - **{ - "hr_number": map_hr_number(data), - "district_court": map_district_court(data) - } - ) + try: + return CompanyID( + **{"hr_number": map_hr_number(data), "district_court": map_district_court(data)} + ) + except KeyError: + hr_number = data["tns:grunddaten"]["tns:verfahrensdaten"]["tns:beteiligung"][0]["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:organisation"]["tns:registereintragung"]["tns:registernummer"] + district_court = map_district_court(data) + return CompanyID(hr_number=hr_number, district_court=district_court) def map_last_update(data: dict) -> str: @@ -542,11 +540,7 @@ def map_last_update(data: dict) -> str: Returns: str: Last update date """ - path = [ - "tns:fachdatenRegister", - "tns:auszug", - "tns:letzteEintragung" - ] + path = ["tns:fachdatenRegister", "tns:auszug", "tns:letzteEintragung"] return traversal(data, path) diff --git a/tmp/transformation.ipynb b/tmp/transformation.ipynb index 9a560bb..62b0277 100644 --- a/tmp/transformation.ipynb +++ b/tmp/transformation.ipynb @@ -2,7 +2,1112 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id: 001\n", + "value: Abwesenheitspfleger(in)\n", + "id: 002\n", + "value: Aliasidentität\n", + "id: 003\n", + "value: Angehörige(r)\n", + "id: 004\n", + "value: Angeklagte(r)\n", + "id: 005\n", + "value: Angeschuldigte(r)\n", + "id: 006\n", + "value: Annehmende(r)\n", + "id: 007\n", + "value: Anschlussberufungsbeklagte(r)\n", + "id: 008\n", + "value: Anschlussberufungskläger(in)\n", + "id: 009\n", + "value: Anschlussbeschwerdeführer(in)\n", + "id: 010\n", + "value: Anschlussbeschwerdegegner(in)\n", + "id: 011\n", + "value: Anschlussrechtsbeschwerdeführer(in)\n", + "id: 012\n", + "value: Anschlussrechtsbeschwerdegegner(in)\n", + "id: 013\n", + "value: Anschlussrevisionsbeklagte(r)\n", + "id: 014\n", + "value: Anschlussrevisionskläger(in)\n", + "id: 015\n", + "value: Antragsgegner(in)\n", + "id: 016\n", + "value: Antragsteller(in)\n", + "id: 017\n", + "value: Anzeigeerstatter(in)\n", + "id: 018\n", + "value: Anzunehmende(r)\n", + "id: 019\n", + "value: Arrestgläubiger(in)\n", + "id: 020\n", + "value: Arrestschuldner(in)\n", + "id: 021\n", + "value: Aufsichtsbehörde\n", + "id: 022\n", + "value: Ausschlagende(r)\n", + "id: 023\n", + "value: Beamter (Beamtin)\n", + "id: 024\n", + "value: Behörde\n", + "id: 025\n", + "value: Beigeladene(r)\n", + "id: 026\n", + "value: Beistand\n", + "id: 027\n", + "value: Bekannte(r)\n", + "id: 028\n", + "value: Beklagte(r)\n", + "id: 029\n", + "value: Berufungsbeklagte(r)\n", + "id: 030\n", + "value: Berufungskläger(in)\n", + "id: 031\n", + "value: Beschuldigte(r)\n", + "id: 032\n", + "value: Beschwerdeführer(in)\n", + "id: 033\n", + "value: Beschwerdegegner(in)\n", + "id: 034\n", + "value: Besucher(in)\n", + "id: 036\n", + "value: Betreibende(r) Gläubige(r)\n", + "id: 037\n", + "value: Betreuer(in)\n", + "id: 038\n", + "value: Betreute(r)\n", + "id: 039\n", + "value: Betreuungsbehörde\n", + "id: 040\n", + "value: Betroffene(r)\n", + "id: 041\n", + "value: Bevollmächtigte(r)\n", + "id: 042\n", + "value: Bewährungshelfer(in)\n", + "id: 043\n", + "value: Beweisanwalt (-anwältin)\n", + "id: 044\n", + "value: Bruder (Schwester)\n", + "id: 045\n", + "value: Bundeswehrdisziplinaranwalt (-anwältin)\n", + "id: 046\n", + "value: Bußgeldempfänger(in)\n", + "id: 047\n", + "value: Cousin(e)\n", + "id: 048\n", + "value: Dienstvorgesetzte(r)\n", + "id: 049\n", + "value: director\n", + "id: 050\n", + "value: Dolmetscher(in)\n", + "id: 051\n", + "value: Dritte(r)\n", + "id: 052\n", + "value: Drittschuldner(in)\n", + "id: 053\n", + "value: Drittwiderbeklagte(r)\n", + "id: 054\n", + "value: Drittwiderkläger(in)\n", + "id: 056\n", + "value: Ehemann (Ehefrau)\n", + "id: 057\n", + "value: Eigentümer(in)\n", + "id: 058\n", + "value: Eingetragene(r) Lebenspartner(in)\n", + "id: 059\n", + "value: Einleitungsbehörde\n", + "id: 060\n", + "value: Eltern\n", + "id: 061\n", + "value: Elternteil\n", + "id: 062\n", + "value: Enkel(in)\n", + "id: 063\n", + "value: Erbe (Erbin)\n", + "id: 064\n", + "value: Erbe (Erbin) (ausschlagend)\n", + "id: 065\n", + "value: Erbe (Erbin) (vorverstorben)\n", + "id: 066\n", + "value: Erblasser(in)\n", + "id: 067\n", + "value: Ergänzungspfleger(in)\n", + "id: 068\n", + "value: Erinnerungsführer(in)\n", + "id: 069\n", + "value: Erinnerungsgegner(in)\n", + "id: 070\n", + "value: Ersatzbetreuer(in)\n", + "id: 071\n", + "value: Ersteher(in)\n", + "id: 072\n", + "value: Erwerber(in)\n", + "id: 073\n", + "value: Erziehungsberechtigte(r)\n", + "id: 074\n", + "value: Frühere(r) Ehegatte (Ehegattin)\n", + "id: 075\n", + "value: Frühere(r) Beklagte(r)\n", + "id: 076\n", + "value: Frühere(r) Beteiligte(r)\n", + "id: 077\n", + "value: Frühere(r) Gläubiger(in)\n", + "id: 078\n", + "value: Frühere(r) Kläger(in)\n", + "id: 079\n", + "value: Frühere(r) Soldat(in)\n", + "id: 080\n", + "value: Gegenvormund\n", + "id: 081\n", + "value: Generalbundesanwalt (-anwältin)\n", + "id: 082\n", + "value: Gericht\n", + "id: 083\n", + "value: Gerichtsvollzieher(in)\n", + "id: 084\n", + "value: Geschädigte(r)\n", + "id: 085\n", + "value: Geschäftsführende(r) Gesellschafter(in)\n", + "id: 086\n", + "value: Geschäftsführer(in)\n", + "id: 087\n", + "value: Gesetzliche(r) Erbe (Erbin)\n", + "id: 088\n", + "value: Gesetzliche(r) Vertreter(in)\n", + "id: 089\n", + "value: Gläubiger(in)\n", + "id: 090\n", + "value: Großeltern\n", + "id: 092\n", + "value: Großvater (Großmutter)\n", + "id: 093\n", + "value: Hauptbevollmächtigte(r)\n", + "id: 094\n", + "value: Hoferbe (Hoferbin)\n", + "id: 095\n", + "value: Inhaber(in) der Firma\n", + "id: 096\n", + "value: Insolvenzverwalter(in)\n", + "id: 097\n", + "value: Jugendamt\n", + "id: 098\n", + "value: Kammer\n", + "id: 099\n", + "value: Kammermitglied\n", + "id: 100\n", + "value: Kind\n", + "id: 101\n", + "value: Kläger(in)\n", + "id: 102\n", + "value: Kontrollbetreuer(in)\n", + "id: 103\n", + "value: Korrespondenzanwalt (-anwältin)\n", + "id: 104\n", + "value: Kostenschuldner(in)\n", + "id: 105\n", + "value: Landwirtschaftsrichter(in)\n", + "id: 106\n", + "value: Lebenspartner(in)\n", + "id: 107\n", + "value: Liquidator(in)\n", + "id: 108\n", + "value: Minderjährige(r)\n", + "id: 109\n", + "value: Mitvormund\n", + "id: 110\n", + "value: Mündel\n", + "id: 112\n", + "value: Nachbesserungsgläubiger(in)\n", + "id: 113\n", + "value: Nachlasspfleger(in)\n", + "id: 114\n", + "value: Nachlassverwalter(in)\n", + "id: 115\n", + "value: Nebenkläger(in)\n", + "id: 116\n", + "value: Neffe (Nichte)\n", + "id: 117\n", + "value: Nicht verwandt\n", + "id: 118\n", + "value: Onkel (Tante)\n", + "id: 119\n", + "value: Opfer\n", + "id: 120\n", + "value: Pächter(in)\n", + "id: 121\n", + "value: Pflegeeltern\n", + "id: 123\n", + "value: Pfleger(in)\n", + "id: 124\n", + "value: Pfleger(in) für das Sammelvermögen\n", + "id: 125\n", + "value: Pfleger(in) für die Leibesfrucht\n", + "id: 126\n", + "value: Pflegevater (Pflegemutter) des Mündels\n", + "id: 127\n", + "value: Pflegling\n", + "id: 128\n", + "value: Pflichtverteidiger(in)\n", + "id: 129\n", + "value: Polizei\n", + "id: 130\n", + "value: Privatbeklagte(r)\n", + "id: 131\n", + "value: Privatkläger(in)\n", + "id: 132\n", + "value: Prozessbevollmächtigte(r)\n", + "id: 133\n", + "value: Prozesskostenhilfe-Anwalt (-Anwältin)\n", + "id: 134\n", + "value: Prozesskostenhilfe-Korrespondenzanwalt (-anwältin)\n", + "id: 135\n", + "value: Rechtsanwalt (-anwältin)\n", + "id: 136\n", + "value: Rechtsbeistand\n", + "id: 137\n", + "value: Rechtsbeschwerdeführer(in)\n", + "id: 138\n", + "value: Rechtsbeschwerdegegner(in)\n", + "id: 139\n", + "value: Revisionsbeklagte(r)\n", + "id: 140\n", + "value: Revisionskläger(in)\n", + "id: 141\n", + "value: Sachbearbeiter(in)\n", + "id: 142\n", + "value: Sachverständige(r)\n", + "id: 143\n", + "value: Schuldner(in)\n", + "id: 144\n", + "value: Schwager (Schwägerin)\n", + "id: 146\n", + "value: Schwiegersohn (Schwiegertochter)\n", + "id: 148\n", + "value: Schwiegervater (Schwiegermutter)\n", + "id: 149\n", + "value: Sohn (Tochter)\n", + "id: 150\n", + "value: Soldat(in)\n", + "id: 151\n", + "value: Sonstige(r) Beteiligte(r)\n", + "id: 152\n", + "value: Sonstige(r) Vertreter(in)\n", + "id: 153\n", + "value: Staatsanwaltschaft\n", + "id: 154\n", + "value: Stiefeltern\n", + "id: 156\n", + "value: Stiefvater (Stiefmutter)\n", + "id: 157\n", + "value: Streithelfer(in) Beklagte(r)\n", + "id: 158\n", + "value: Streithelfer(in) Kläger(in)\n", + "id: 159\n", + "value: Streitverkündete(r) Beklagte(r)\n", + "id: 160\n", + "value: Streitverkündete(r) Kläger(in)\n", + "id: 161\n", + "value: Terminsbevollmächtigte(r)\n", + "id: 162\n", + "value: Testamentsvollstrecker(in)\n", + "id: 163\n", + "value: Testator(in)\n", + "id: 164\n", + "value: Übernehmer(in)\n", + "id: 165\n", + "value: Unterbevollmächtigte(r)\n", + "id: 166\n", + "value: Ur-Enkel(in)\n", + "id: 167\n", + "value: Vater (Mutter)\n", + "id: 168\n", + "value: Veräußerer (Veräußerin)\n", + "id: 169\n", + "value: Verfahrensbevollmächtigte(r)\n", + "id: 170\n", + "value: Verfahrenskostenhilfe-Anwalt(-Anwältin)\n", + "id: 171\n", + "value: Verfahrenskostenhilfe-Korrespondenzanwalt (-anwältin)\n", + "id: 172\n", + "value: Verfahrenspfleger(in)\n", + "id: 173\n", + "value: Verfahrensvertreter(in) (§787 ZPO)\n", + "id: 174\n", + "value: Verfügungsbeklagte(r)\n", + "id: 175\n", + "value: Verfügungskläger(in)\n", + "id: 176\n", + "value: Verkehrsanwalt (-anwältin)\n", + "id: 177\n", + "value: Verlobte(r)\n", + "id: 178\n", + "value: Vermächtnisnehmer(in)\n", + "id: 179\n", + "value: Vermieter(in)\n", + "id: 180\n", + "value: Verpächter(in)\n", + "id: 181\n", + "value: Versorgungsträger(in)\n", + "id: 182\n", + "value: Verteidiger(in)\n", + "id: 183\n", + "value: Vertreter(in) der Interessen des Ausgleichsfonds\n", + "id: 184\n", + "value: Vertreter(in) der Staatskasse\n", + "id: 185\n", + "value: Vertreter(in) des Bundesinteresses beim Bundesverwaltungsgericht\n", + "id: 186\n", + "value: Vertreter(in) des öffentlichen Interesses\n", + "id: 187\n", + "value: Verurteilte(r)\n", + "id: 188\n", + "value: Verwalter(in) der Wohnungseigentümergemeinschaft\n", + "id: 189\n", + "value: Verwaltungsbehörde\n", + "id: 190\n", + "value: Vollstreckungsgläubiger(in)\n", + "id: 191\n", + "value: Vollstreckungsschuldner(in)\n", + "id: 192\n", + "value: Vorläufige(r) Betreuer(in)\n", + "id: 193\n", + "value: Vormund\n", + "id: 194\n", + "value: Vorstand\n", + "id: 195\n", + "value: Vorsorgebevollmächtigte(r)\n", + "id: 196\n", + "value: Wahlverteidiger(in)\n", + "id: 197\n", + "value: Widerbeklagte(r)\n", + "id: 198\n", + "value: Widerkläger(in)\n", + "id: 199\n", + "value: Wiederaufnahmebeklagte(r)\n", + "id: 200\n", + "value: Wiederaufnahmekläger(in)\n", + "id: 201\n", + "value: Zahlungs- und Auflagenempfänger(in)\n", + "id: 202\n", + "value: Zeuge (Zeugin)\n", + "id: 203\n", + "value: Zeugenbeistand\n", + "id: 204\n", + "value: Zulassungsantragsgegner(in)\n", + "id: 205\n", + "value: Zulassungsantragsteller(in)\n", + "id: 206\n", + "value: Zustellungsbevollmächtigte(r)\n", + "id: 207\n", + "value: Zustellungsvertreter(in) (§6 ZVG)\n", + "id: 208\n", + "value: Notar(in)\n", + "id: 209\n", + "value: Auskunftsempfänger(in)\n", + "id: 210\n", + "value: Melder(in)\n", + "id: 211\n", + "value: Verwahrstelle\n", + "id: 212\n", + "value: Aussteller(in)\n", + "id: 213\n", + "value: Berechtigte(r)\n", + "id: 214\n", + "value: Berechtigte(r) an einem Recht\n", + "id: 215\n", + "value: Einreicher(in)\n", + "id: 216\n", + "value: Erbbauberechtigte(r)\n", + "id: 217\n", + "value: Finanzamt\n", + "id: 218\n", + "value: Grundbuchvertreter(in)\n", + "id: 219\n", + "value: Insolvenzgericht\n", + "id: 220\n", + "value: Mitteilungsempfänger(in)\n", + "id: 221\n", + "value: Nacherbe (Nacherbin)\n", + "id: 222\n", + "value: Rechnungsempfänger(in)\n", + "id: 223\n", + "value: Veranlasser(in)\n", + "id: 224\n", + "value: Versteigerungsabteilung\n", + "id: 225\n", + "value: Vertretungsberechtigte(r)\n", + "id: 226\n", + "value: Zweitschuldner(in)\n", + "id: 227\n", + "value: Vertreter(in)\n", + "id: 228\n", + "value: Arbeitgeber(in)\n", + "id: 229\n", + "value: RV-Träger(in)\n", + "id: 230\n", + "value: Vollstreckungsstelle\n", + "id: 231\n", + "value: Abkömmling\n", + "id: 232\n", + "value: Kreditnehmer(in)\n", + "id: 233\n", + "value: Neu vorzutragende(r) Eigentümer(in)\n", + "id: 234\n", + "value: Notariatsverwalter(in)\n", + "id: 235\n", + "value: Notarvertreter(in)\n", + "id: 236\n", + "value: Partei kraft Amtes\n", + "id: 237\n", + "value: Sequester\n", + "id: 238\n", + "value: Treuhänder(in)\n", + "id: 239\n", + "value: Zustimmende(r)\n", + "id: 240\n", + "value: Gläubigervertreter(in)\n", + "id: 241\n", + "value: Schuldnervertreter(in)\n", + "id: 242\n", + "value: Zahlungsempfänger(in)\n", + "id: 243\n", + "value: Anteilsinhaber(in)\n", + "id: 244\n", + "value: Antragsteller(in) -Eröffnung\n", + "id: 245\n", + "value: Debitor(in)\n", + "id: 246\n", + "value: Gesellschafter(in)\n", + "id: 247\n", + "value: Handlungsbevollmächtigte(r)\n", + "id: 248\n", + "value: Mitglied einer Gesamt-Anteilsinhaberschaft\n", + "id: 249\n", + "value: Mitglied einer Gläubigergemeinschaft\n", + "id: 250\n", + "value: Mitreeder(in)\n", + "id: 251\n", + "value: Partner(in)\n", + "id: 252\n", + "value: Persönlich haftende(r) Gesellschafter(in)\n", + "id: 253\n", + "value: Prozesspfleger(in)\n", + "id: 254\n", + "value: Sachwalter(in)\n", + "id: 255\n", + "value: Treuhänder(in) (Wohlverhaltensperiode)\n", + "id: 256\n", + "value: Vermögensträger(in)\n", + "id: 257\n", + "value: Vorläufige(r) Insolvenzverwalter(in)\n", + "id: 258\n", + "value: Vorläufige(r) Treuhänder(in)\n", + "id: 259\n", + "value: Sondersachwalter(in)\n", + "id: 260\n", + "value: Sonderinsolvenzverwalter(in)\n", + "id: 261\n", + "value: Vorläufige(r) Sachwalter(in)\n", + "id: 262\n", + "value: Abwickler(in)\n", + "id: 263\n", + "value: Übernehmender Rechtsträger\n", + "id: 264\n", + "value: Aufsichtsrat (-rätin)\n", + "id: 265\n", + "value: Besondere(r) Vertreter(in) nach § 30 BGB\n", + "id: 266\n", + "value: Betriebsleiter(in)\n", + "id: 267\n", + "value: Empfangsberechtigte(r)\n", + "id: 268\n", + "value: Geschäftsführende(r) Direktor(in)\n", + "id: 269\n", + "value: Geschäftsleiter(in)\n", + "id: 271\n", + "value: Gründer(in)\n", + "id: 272\n", + "value: Gründungsprüfer(in)\n", + "id: 273\n", + "value: Hauptniederlassung\n", + "id: 274\n", + "value: Inhaber(in)\n", + "id: 275\n", + "value: Kommanditist(in)\n", + "id: 276\n", + "value: Konkursverwalter(in)\n", + "id: 277\n", + "value: Mitglied des Leitungsorgans\n", + "id: 278\n", + "value: Mitglied EWIV\n", + "id: 279\n", + "value: Nachgründungsprüfer(in)\n", + "id: 280\n", + "value: Nachtragsabwickler(in)\n", + "id: 281\n", + "value: Nachtragsliquidator(in)\n", + "id: 282\n", + "value: Notgeschäftsführer(in)\n", + "id: 283\n", + "value: Notliquidator(in)\n", + "id: 284\n", + "value: Notvorstand\n", + "id: 285\n", + "value: Prokurist(in)\n", + "id: 287\n", + "value: Rechtsträger(in)\n", + "id: 288\n", + "value: Registergericht\n", + "id: 289\n", + "value: Sacheinlagenprüfer(in)\n", + "id: 290\n", + "value: Ständige(r) Vertreter(in) für die Zweigniederlassung\n", + "id: 291\n", + "value: Übertragender Rechtsträger\n", + "id: 292\n", + "value: Vergleichsverwalter(in)\n", + "id: 293\n", + "value: Verwaltungsrat (-rätin)\n", + "id: 294\n", + "value: Vorstandsvorsitzende(r)\n", + "id: 295\n", + "value: Zweigniederlassung\n", + "id: 296\n", + "value: Vertreter(in) des Klägers/der Klägerin\n", + "id: 297\n", + "value: Vertreter(in) des/der Beklagten\n", + "id: 298\n", + "value: Bewährungshilfe\n", + "id: 299\n", + "value: Gerichtshilfe\n", + "id: 300\n", + "value: Justizvollzug\n", + "id: 301\n", + "value: Pseudoname\n", + "id: 302\n", + "value: Gesetzliche(r) Vertreter(in) des Gläubigers/der Gläubigerin\n", + "id: 303\n", + "value: Gesetzliche(r) Vertreter(in) des Schuldners/der Schuldnerin\n", + "id: 304\n", + "value: Bevollmächtigte(r) des Gläubigers/der Gläubigerin\n", + "id: 305\n", + "value: Bevollmächtigte(r) des Schuldners/der Schuldnerin\n", + "id: 306\n", + "value: Fahrzeughalter(in)\n", + "id: 307\n", + "value: Frühere(r) Lebenspartner(in)\n", + "id: 308\n", + "value: Sicherungsverwalter(in)\n", + "id: 309\n", + "value: Zwangsverwalter(in)\n", + "id: 310\n", + "value: Mieter(in)\n", + "id: 311\n", + "value: Bürge (Bürgin)\n", + "id: 312\n", + "value: Meistbietende(r)\n", + "id: 313\n", + "value: Abschlussprüfer(in)\n", + "id: 314\n", + "value: Antragstellervertreter(in)\n", + "id: 315\n", + "value: Aufsichtsratsvorsitzende(r)\n", + "id: 316\n", + "value: Berufskammer\n", + "id: 317\n", + "value: Betroffenenvertreter(in)\n", + "id: 318\n", + "value: Bürgermeister(in)\n", + "id: 319\n", + "value: Eingliederungsbeteiligte(r)\n", + "id: 320\n", + "value: Formwechselnder Rechtsträger\n", + "id: 321\n", + "value: Gewerbeamt\n", + "id: 322\n", + "value: Inhaber(in) (nicht eingetragen)\n", + "id: 323\n", + "value: Kostenempfänger(in)\n", + "id: 324\n", + "value: Nachlassgericht\n", + "id: 325\n", + "value: Sonderprüfer(in)\n", + "id: 326\n", + "value: Sonstige(r) gerichtlich bestellte(r) Vertreter(in)\n", + "id: 327\n", + "value: Sonstige(r) gesetzliche(r) Vertreter(in) BGB\n", + "id: 328\n", + "value: Sonstige(r) organschaftliche(r) Vertreter(in) HRB\n", + "id: 329\n", + "value: Standardkostenschuldner(in)\n", + "id: 330\n", + "value: Übernahmeschuldner(in)\n", + "id: 331\n", + "value: Unternehmensvertragsbeteiligte(r)\n", + "id: 332\n", + "value: Vertreter(in) des persönlich haftenden Gesellschafters\n", + "id: 333\n", + "value: Werkleiter(in)\n", + "id: 334\n", + "value: Mehrfachsitz\n", + "id: 335\n", + "value: Mitglied VR\n", + "id: 336\n", + "value: Mitglied e. BGB-Gesellschaft als Abwicklerin\n", + "id: 337\n", + "value: Mitglied e. BGB-Gesellschaft als ges. Vertreterin\n", + "id: 338\n", + "value: Mitglied e. BGB-Gesellschaft als Kommanditistin\n", + "id: 339\n", + "value: Mitglied e. BGB-Gesellschaft als Liquidatorin\n", + "id: 340\n", + "value: Mitglied e. BGB-Gesellschaft als Mitglied e. EWIV\n", + "id: 341\n", + "value: Mitglied e. BGB-Gesellschaft als phG\n", + "id: 342\n", + "value: Mitglied e. Erbengemeinschaft als ges. Vertreterin\n", + "id: 343\n", + "value: Mitglied e. Erbengemeinschaft als Inhaberin\n", + "id: 344\n", + "value: Mitglied e. Erbengemeinschaft als Kommanditistin\n", + "id: 345\n", + "value: Mitglied e. Erbengemeinschaft als Mitglied e. EWIV\n", + "id: 346\n", + "value: Nebensitz\n", + "id: 347\n", + "value: Gesamthandsgemeinschaft\n", + "id: 348\n", + "value: Mitglied einer Gesamthandsgemeinschaft\n", + "id: 349\n", + "value: Leitungsperson i.S.v. § 30 Abs. 1 Nr. 1 bis 5 OWiG\n", + "id: 350\n", + "value: Einziehungsbeteiligter\n", + "id: 351\n", + "value: Antragsgegnervertreter(in)\n", + "id: 352\n", + "value: Verbraucher(in)\n", + "id: 353\n", + "value: Vielmelder(in)\n", + "id: 354\n", + "value: Vollmachtgeber(in)\n", + "id: 355\n", + "value: Nebenbeteiligte(r) § 444 StPO\n", + "id: 356\n", + "value: Verfallsbeteiligte(r) § 442 StPO a.F.\n", + "id: 357\n", + "value: Verfolgte(r) § 34 IRG\n", + "id: 358\n", + "value: Rechtsnachfolger(in)\n", + "id: 359\n", + "value: Statuswechselnde(r) Rechtsträger(in)\n", + "id: 360\n", + "value: Haftangehörige\n" + ] + }, + { + "data": { + "text/plain": [ + "{'001': 'Abwesenheitspfleger(in)',\n", + " '002': 'Aliasidentität',\n", + " '003': 'Angehörige(r)',\n", + " '004': 'Angeklagte(r)',\n", + " '005': 'Angeschuldigte(r)',\n", + " '006': 'Annehmende(r)',\n", + " '007': 'Anschlussberufungsbeklagte(r)',\n", + " '008': 'Anschlussberufungskläger(in)',\n", + " '009': 'Anschlussbeschwerdeführer(in)',\n", + " '010': 'Anschlussbeschwerdegegner(in)',\n", + " '011': 'Anschlussrechtsbeschwerdeführer(in)',\n", + " '012': 'Anschlussrechtsbeschwerdegegner(in)',\n", + " '013': 'Anschlussrevisionsbeklagte(r)',\n", + " '014': 'Anschlussrevisionskläger(in)',\n", + " '015': 'Antragsgegner(in)',\n", + " '016': 'Antragsteller(in)',\n", + " '017': 'Anzeigeerstatter(in)',\n", + " '018': 'Anzunehmende(r)',\n", + " '019': 'Arrestgläubiger(in)',\n", + " '020': 'Arrestschuldner(in)',\n", + " '021': 'Aufsichtsbehörde',\n", + " '022': 'Ausschlagende(r)',\n", + " '023': 'Beamter (Beamtin)',\n", + " '024': 'Behörde',\n", + " '025': 'Beigeladene(r)',\n", + " '026': 'Beistand',\n", + " '027': 'Bekannte(r)',\n", + " '028': 'Beklagte(r)',\n", + " '029': 'Berufungsbeklagte(r)',\n", + " '030': 'Berufungskläger(in)',\n", + " '031': 'Beschuldigte(r)',\n", + " '032': 'Beschwerdeführer(in)',\n", + " '033': 'Beschwerdegegner(in)',\n", + " '034': 'Besucher(in)',\n", + " '036': 'Betreibende(r) Gläubige(r)',\n", + " '037': 'Betreuer(in)',\n", + " '038': 'Betreute(r)',\n", + " '039': 'Betreuungsbehörde',\n", + " '040': 'Betroffene(r)',\n", + " '041': 'Bevollmächtigte(r)',\n", + " '042': 'Bewährungshelfer(in)',\n", + " '043': 'Beweisanwalt (-anwältin)',\n", + " '044': 'Bruder (Schwester)',\n", + " '045': 'Bundeswehrdisziplinaranwalt (-anwältin)',\n", + " '046': 'Bußgeldempfänger(in)',\n", + " '047': 'Cousin(e)',\n", + " '048': 'Dienstvorgesetzte(r)',\n", + " '049': 'director',\n", + " '050': 'Dolmetscher(in)',\n", + " '051': 'Dritte(r)',\n", + " '052': 'Drittschuldner(in)',\n", + " '053': 'Drittwiderbeklagte(r)',\n", + " '054': 'Drittwiderkläger(in)',\n", + " '056': 'Ehemann (Ehefrau)',\n", + " '057': 'Eigentümer(in)',\n", + " '058': 'Eingetragene(r) Lebenspartner(in)',\n", + " '059': 'Einleitungsbehörde',\n", + " '060': 'Eltern',\n", + " '061': 'Elternteil',\n", + " '062': 'Enkel(in)',\n", + " '063': 'Erbe (Erbin)',\n", + " '064': 'Erbe (Erbin) (ausschlagend)',\n", + " '065': 'Erbe (Erbin) (vorverstorben)',\n", + " '066': 'Erblasser(in)',\n", + " '067': 'Ergänzungspfleger(in)',\n", + " '068': 'Erinnerungsführer(in)',\n", + " '069': 'Erinnerungsgegner(in)',\n", + " '070': 'Ersatzbetreuer(in)',\n", + " '071': 'Ersteher(in)',\n", + " '072': 'Erwerber(in)',\n", + " '073': 'Erziehungsberechtigte(r)',\n", + " '074': 'Frühere(r) Ehegatte (Ehegattin)',\n", + " '075': 'Frühere(r) Beklagte(r)',\n", + " '076': 'Frühere(r) Beteiligte(r)',\n", + " '077': 'Frühere(r) Gläubiger(in)',\n", + " '078': 'Frühere(r) Kläger(in)',\n", + " '079': 'Frühere(r) Soldat(in)',\n", + " '080': 'Gegenvormund',\n", + " '081': 'Generalbundesanwalt (-anwältin)',\n", + " '082': 'Gericht',\n", + " '083': 'Gerichtsvollzieher(in)',\n", + " '084': 'Geschädigte(r)',\n", + " '085': 'Geschäftsführende(r) Gesellschafter(in)',\n", + " '086': 'Geschäftsführer(in)',\n", + " '087': 'Gesetzliche(r) Erbe (Erbin)',\n", + " '088': 'Gesetzliche(r) Vertreter(in)',\n", + " '089': 'Gläubiger(in)',\n", + " '090': 'Großeltern',\n", + " '092': 'Großvater (Großmutter)',\n", + " '093': 'Hauptbevollmächtigte(r)',\n", + " '094': 'Hoferbe (Hoferbin)',\n", + " '095': 'Inhaber(in) der Firma',\n", + " '096': 'Insolvenzverwalter(in)',\n", + " '097': 'Jugendamt',\n", + " '098': 'Kammer',\n", + " '099': 'Kammermitglied',\n", + " '100': 'Kind',\n", + " '101': 'Kläger(in)',\n", + " '102': 'Kontrollbetreuer(in)',\n", + " '103': 'Korrespondenzanwalt (-anwältin)',\n", + " '104': 'Kostenschuldner(in)',\n", + " '105': 'Landwirtschaftsrichter(in)',\n", + " '106': 'Lebenspartner(in)',\n", + " '107': 'Liquidator(in)',\n", + " '108': 'Minderjährige(r)',\n", + " '109': 'Mitvormund',\n", + " '110': 'Mündel',\n", + " '112': 'Nachbesserungsgläubiger(in)',\n", + " '113': 'Nachlasspfleger(in)',\n", + " '114': 'Nachlassverwalter(in)',\n", + " '115': 'Nebenkläger(in)',\n", + " '116': 'Neffe (Nichte)',\n", + " '117': 'Nicht verwandt',\n", + " '118': 'Onkel (Tante)',\n", + " '119': 'Opfer',\n", + " '120': 'Pächter(in)',\n", + " '121': 'Pflegeeltern',\n", + " '123': 'Pfleger(in)',\n", + " '124': 'Pfleger(in) für das Sammelvermögen',\n", + " '125': 'Pfleger(in) für die Leibesfrucht',\n", + " '126': 'Pflegevater (Pflegemutter) des Mündels',\n", + " '127': 'Pflegling',\n", + " '128': 'Pflichtverteidiger(in)',\n", + " '129': 'Polizei',\n", + " '130': 'Privatbeklagte(r)',\n", + " '131': 'Privatkläger(in)',\n", + " '132': 'Prozessbevollmächtigte(r)',\n", + " '133': 'Prozesskostenhilfe-Anwalt (-Anwältin)',\n", + " '134': 'Prozesskostenhilfe-Korrespondenzanwalt (-anwältin)',\n", + " '135': 'Rechtsanwalt (-anwältin)',\n", + " '136': 'Rechtsbeistand',\n", + " '137': 'Rechtsbeschwerdeführer(in)',\n", + " '138': 'Rechtsbeschwerdegegner(in)',\n", + " '139': 'Revisionsbeklagte(r)',\n", + " '140': 'Revisionskläger(in)',\n", + " '141': 'Sachbearbeiter(in)',\n", + " '142': 'Sachverständige(r)',\n", + " '143': 'Schuldner(in)',\n", + " '144': 'Schwager (Schwägerin)',\n", + " '146': 'Schwiegersohn (Schwiegertochter)',\n", + " '148': 'Schwiegervater (Schwiegermutter)',\n", + " '149': 'Sohn (Tochter)',\n", + " '150': 'Soldat(in)',\n", + " '151': 'Sonstige(r) Beteiligte(r)',\n", + " '152': 'Sonstige(r) Vertreter(in)',\n", + " '153': 'Staatsanwaltschaft',\n", + " '154': 'Stiefeltern',\n", + " '156': 'Stiefvater (Stiefmutter)',\n", + " '157': 'Streithelfer(in) Beklagte(r)',\n", + " '158': 'Streithelfer(in) Kläger(in)',\n", + " '159': 'Streitverkündete(r) Beklagte(r)',\n", + " '160': 'Streitverkündete(r) Kläger(in)',\n", + " '161': 'Terminsbevollmächtigte(r)',\n", + " '162': 'Testamentsvollstrecker(in)',\n", + " '163': 'Testator(in)',\n", + " '164': 'Übernehmer(in)',\n", + " '165': 'Unterbevollmächtigte(r)',\n", + " '166': 'Ur-Enkel(in)',\n", + " '167': 'Vater (Mutter)',\n", + " '168': 'Veräußerer (Veräußerin)',\n", + " '169': 'Verfahrensbevollmächtigte(r)',\n", + " '170': 'Verfahrenskostenhilfe-Anwalt(-Anwältin)',\n", + " '171': 'Verfahrenskostenhilfe-Korrespondenzanwalt (-anwältin)',\n", + " '172': 'Verfahrenspfleger(in)',\n", + " '173': 'Verfahrensvertreter(in) (§787 ZPO)',\n", + " '174': 'Verfügungsbeklagte(r)',\n", + " '175': 'Verfügungskläger(in)',\n", + " '176': 'Verkehrsanwalt (-anwältin)',\n", + " '177': 'Verlobte(r)',\n", + " '178': 'Vermächtnisnehmer(in)',\n", + " '179': 'Vermieter(in)',\n", + " '180': 'Verpächter(in)',\n", + " '181': 'Versorgungsträger(in)',\n", + " '182': 'Verteidiger(in)',\n", + " '183': 'Vertreter(in) der Interessen des Ausgleichsfonds',\n", + " '184': 'Vertreter(in) der Staatskasse',\n", + " '185': 'Vertreter(in) des Bundesinteresses beim Bundesverwaltungsgericht',\n", + " '186': 'Vertreter(in) des öffentlichen Interesses',\n", + " '187': 'Verurteilte(r)',\n", + " '188': 'Verwalter(in) der Wohnungseigentümergemeinschaft',\n", + " '189': 'Verwaltungsbehörde',\n", + " '190': 'Vollstreckungsgläubiger(in)',\n", + " '191': 'Vollstreckungsschuldner(in)',\n", + " '192': 'Vorläufige(r) Betreuer(in)',\n", + " '193': 'Vormund',\n", + " '194': 'Vorstand',\n", + " '195': 'Vorsorgebevollmächtigte(r)',\n", + " '196': 'Wahlverteidiger(in)',\n", + " '197': 'Widerbeklagte(r)',\n", + " '198': 'Widerkläger(in)',\n", + " '199': 'Wiederaufnahmebeklagte(r)',\n", + " '200': 'Wiederaufnahmekläger(in)',\n", + " '201': 'Zahlungs- und Auflagenempfänger(in)',\n", + " '202': 'Zeuge (Zeugin)',\n", + " '203': 'Zeugenbeistand',\n", + " '204': 'Zulassungsantragsgegner(in)',\n", + " '205': 'Zulassungsantragsteller(in)',\n", + " '206': 'Zustellungsbevollmächtigte(r)',\n", + " '207': 'Zustellungsvertreter(in) (§6 ZVG)',\n", + " '208': 'Notar(in)',\n", + " '209': 'Auskunftsempfänger(in)',\n", + " '210': 'Melder(in)',\n", + " '211': 'Verwahrstelle',\n", + " '212': 'Aussteller(in)',\n", + " '213': 'Berechtigte(r)',\n", + " '214': 'Berechtigte(r) an einem Recht',\n", + " '215': 'Einreicher(in)',\n", + " '216': 'Erbbauberechtigte(r)',\n", + " '217': 'Finanzamt',\n", + " '218': 'Grundbuchvertreter(in)',\n", + " '219': 'Insolvenzgericht',\n", + " '220': 'Mitteilungsempfänger(in)',\n", + " '221': 'Nacherbe (Nacherbin)',\n", + " '222': 'Rechnungsempfänger(in)',\n", + " '223': 'Veranlasser(in)',\n", + " '224': 'Versteigerungsabteilung',\n", + " '225': 'Vertretungsberechtigte(r)',\n", + " '226': 'Zweitschuldner(in)',\n", + " '227': 'Vertreter(in)',\n", + " '228': 'Arbeitgeber(in)',\n", + " '229': 'RV-Träger(in)',\n", + " '230': 'Vollstreckungsstelle',\n", + " '231': 'Abkömmling',\n", + " '232': 'Kreditnehmer(in)',\n", + " '233': 'Neu vorzutragende(r) Eigentümer(in)',\n", + " '234': 'Notariatsverwalter(in)',\n", + " '235': 'Notarvertreter(in)',\n", + " '236': 'Partei kraft Amtes',\n", + " '237': 'Sequester',\n", + " '238': 'Treuhänder(in)',\n", + " '239': 'Zustimmende(r)',\n", + " '240': 'Gläubigervertreter(in)',\n", + " '241': 'Schuldnervertreter(in)',\n", + " '242': 'Zahlungsempfänger(in)',\n", + " '243': 'Anteilsinhaber(in)',\n", + " '244': 'Antragsteller(in) -Eröffnung',\n", + " '245': 'Debitor(in)',\n", + " '246': 'Gesellschafter(in)',\n", + " '247': 'Handlungsbevollmächtigte(r)',\n", + " '248': 'Mitglied einer Gesamt-Anteilsinhaberschaft',\n", + " '249': 'Mitglied einer Gläubigergemeinschaft',\n", + " '250': 'Mitreeder(in)',\n", + " '251': 'Partner(in)',\n", + " '252': 'Persönlich haftende(r) Gesellschafter(in)',\n", + " '253': 'Prozesspfleger(in)',\n", + " '254': 'Sachwalter(in)',\n", + " '255': 'Treuhänder(in) (Wohlverhaltensperiode)',\n", + " '256': 'Vermögensträger(in)',\n", + " '257': 'Vorläufige(r) Insolvenzverwalter(in)',\n", + " '258': 'Vorläufige(r) Treuhänder(in)',\n", + " '259': 'Sondersachwalter(in)',\n", + " '260': 'Sonderinsolvenzverwalter(in)',\n", + " '261': 'Vorläufige(r) Sachwalter(in)',\n", + " '262': 'Abwickler(in)',\n", + " '263': 'Übernehmender Rechtsträger',\n", + " '264': 'Aufsichtsrat (-rätin)',\n", + " '265': 'Besondere(r) Vertreter(in) nach § 30 BGB',\n", + " '266': 'Betriebsleiter(in)',\n", + " '267': 'Empfangsberechtigte(r)',\n", + " '268': 'Geschäftsführende(r) Direktor(in)',\n", + " '269': 'Geschäftsleiter(in)',\n", + " '271': 'Gründer(in)',\n", + " '272': 'Gründungsprüfer(in)',\n", + " '273': 'Hauptniederlassung',\n", + " '274': 'Inhaber(in)',\n", + " '275': 'Kommanditist(in)',\n", + " '276': 'Konkursverwalter(in)',\n", + " '277': 'Mitglied des Leitungsorgans',\n", + " '278': 'Mitglied EWIV',\n", + " '279': 'Nachgründungsprüfer(in)',\n", + " '280': 'Nachtragsabwickler(in)',\n", + " '281': 'Nachtragsliquidator(in)',\n", + " '282': 'Notgeschäftsführer(in)',\n", + " '283': 'Notliquidator(in)',\n", + " '284': 'Notvorstand',\n", + " '285': 'Prokurist(in)',\n", + " '287': 'Rechtsträger(in)',\n", + " '288': 'Registergericht',\n", + " '289': 'Sacheinlagenprüfer(in)',\n", + " '290': 'Ständige(r) Vertreter(in) für die Zweigniederlassung',\n", + " '291': 'Übertragender Rechtsträger',\n", + " '292': 'Vergleichsverwalter(in)',\n", + " '293': 'Verwaltungsrat (-rätin)',\n", + " '294': 'Vorstandsvorsitzende(r)',\n", + " '295': 'Zweigniederlassung',\n", + " '296': 'Vertreter(in) des Klägers/der Klägerin',\n", + " '297': 'Vertreter(in) des/der Beklagten',\n", + " '298': 'Bewährungshilfe',\n", + " '299': 'Gerichtshilfe',\n", + " '300': 'Justizvollzug',\n", + " '301': 'Pseudoname',\n", + " '302': 'Gesetzliche(r) Vertreter(in) des Gläubigers/der Gläubigerin',\n", + " '303': 'Gesetzliche(r) Vertreter(in) des Schuldners/der Schuldnerin',\n", + " '304': 'Bevollmächtigte(r) des Gläubigers/der Gläubigerin',\n", + " '305': 'Bevollmächtigte(r) des Schuldners/der Schuldnerin',\n", + " '306': 'Fahrzeughalter(in)',\n", + " '307': 'Frühere(r) Lebenspartner(in)',\n", + " '308': 'Sicherungsverwalter(in)',\n", + " '309': 'Zwangsverwalter(in)',\n", + " '310': 'Mieter(in)',\n", + " '311': 'Bürge (Bürgin)',\n", + " '312': 'Meistbietende(r)',\n", + " '313': 'Abschlussprüfer(in)',\n", + " '314': 'Antragstellervertreter(in)',\n", + " '315': 'Aufsichtsratsvorsitzende(r)',\n", + " '316': 'Berufskammer',\n", + " '317': 'Betroffenenvertreter(in)',\n", + " '318': 'Bürgermeister(in)',\n", + " '319': 'Eingliederungsbeteiligte(r)',\n", + " '320': 'Formwechselnder Rechtsträger',\n", + " '321': 'Gewerbeamt',\n", + " '322': 'Inhaber(in) (nicht eingetragen)',\n", + " '323': 'Kostenempfänger(in)',\n", + " '324': 'Nachlassgericht',\n", + " '325': 'Sonderprüfer(in)',\n", + " '326': 'Sonstige(r) gerichtlich bestellte(r) Vertreter(in)',\n", + " '327': 'Sonstige(r) gesetzliche(r) Vertreter(in) BGB',\n", + " '328': 'Sonstige(r) organschaftliche(r) Vertreter(in) HRB',\n", + " '329': 'Standardkostenschuldner(in)',\n", + " '330': 'Übernahmeschuldner(in)',\n", + " '331': 'Unternehmensvertragsbeteiligte(r)',\n", + " '332': 'Vertreter(in) des persönlich haftenden Gesellschafters',\n", + " '333': 'Werkleiter(in)',\n", + " '334': 'Mehrfachsitz',\n", + " '335': 'Mitglied VR',\n", + " '336': 'Mitglied e. BGB-Gesellschaft als Abwicklerin',\n", + " '337': 'Mitglied e. BGB-Gesellschaft als ges. Vertreterin',\n", + " '338': 'Mitglied e. BGB-Gesellschaft als Kommanditistin',\n", + " '339': 'Mitglied e. BGB-Gesellschaft als Liquidatorin',\n", + " '340': 'Mitglied e. BGB-Gesellschaft als Mitglied e. EWIV',\n", + " '341': 'Mitglied e. BGB-Gesellschaft als phG',\n", + " '342': 'Mitglied e. Erbengemeinschaft als ges. Vertreterin',\n", + " '343': 'Mitglied e. Erbengemeinschaft als Inhaberin',\n", + " '344': 'Mitglied e. Erbengemeinschaft als Kommanditistin',\n", + " '345': 'Mitglied e. Erbengemeinschaft als Mitglied e. EWIV',\n", + " '346': 'Nebensitz',\n", + " '347': 'Gesamthandsgemeinschaft',\n", + " '348': 'Mitglied einer Gesamthandsgemeinschaft',\n", + " '349': 'Leitungsperson i.S.v. § 30 Abs. 1 Nr. 1 bis 5 OWiG',\n", + " '350': 'Einziehungsbeteiligter',\n", + " '351': 'Antragsgegnervertreter(in)',\n", + " '352': 'Verbraucher(in)',\n", + " '353': 'Vielmelder(in)',\n", + " '354': 'Vollmachtgeber(in)',\n", + " '355': 'Nebenbeteiligte(r) § 444 StPO',\n", + " '356': 'Verfallsbeteiligte(r) § 442 StPO a.F.',\n", + " '357': 'Verfolgte(r) § 34 IRG',\n", + " '358': 'Rechtsnachfolger(in)',\n", + " '359': 'Statuswechselnde(r) Rechtsträger(in)',\n", + " '360': 'Haftangehörige'}" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import xmltodict\n", + "\n", + "with open('../src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/xjustiz_0040_cl_rollenbezeichnung_3_3.xsd', encoding=\"utf-8\") as file:\n", + " content = file.read()\n", + " data = xmltodict.parse(content)\n", + "mapping = {}\n", + "for entry in data[\"xs:schema\"][\"xs:simpleType\"][\"xs:restriction\"][\"xs:enumeration\"]:\n", + " mapping[entry['@value']] = entry['xs:annotation']['xs:appinfo']['wert']\n", + " print(f\"id: {entry['@value']}\")\n", + " print(f\"value: {entry['xs:annotation']['xs:appinfo']['wert']}\")\n", + "mapping" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Geschäftsführer(in)'" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mapping[\"086\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -16,7 +1121,7 @@ "source": [ "import json\n", "import dataclasses\n", - "from transform import map_unternehmensregister_json\n", + "from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform import map_unternehmensregister_json\n", "\n", "with open('../tmp/json/GEAFarmTechnologiesGmbH.json', \"r\") as file:\n", " content = json.load(file)\n", @@ -28,7 +1133,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -40,10 +1145,6 @@ } ], "source": [ - "import json\n", - "import dataclasses\n", - "from transform import map_unternehmensregister_json\n", - "\n", "with open('../tmp/json/ZalandoLoungeServiceGmbH.json', \"r\") as file:\n", " content = json.load(file)\n", " company_data = map_unternehmensregister_json(content)\n", @@ -54,7 +1155,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -66,10 +1167,6 @@ } ], "source": [ - "import json\n", - "import dataclasses\n", - "from transform import map_unternehmensregister_json\n", - "\n", "with open('../tmp/json/ZalandoSE.json', \"r\") as file:\n", " content = json.load(file)\n", " company_data = map_unternehmensregister_json(content)\n", From 042a0196281a191ce2f0001896461a07c9493e3b Mon Sep 17 00:00:00 2001 From: TrisNol Date: Fri, 3 Nov 2023 22:39:27 +0100 Subject: [PATCH 06/14] checkpoint: Refactoring, first working version of processing --- .../apps/find_missing_companies.py | 127 ++++++++++++------ .../unternehmensregister/extract.py | 13 +- .../unternehmensregister/load.py | 21 +-- .../unternehmensregister/transform/main.py | 15 ++- .../unternehmensregister/transform/v1/v1.py | 28 ---- .../unternehmensregister/transform/v3/v3.py | 35 +---- .../utils/mongo/company_mongo_service.py | 1 - tmp/transformation.ipynb | 8 ++ 8 files changed, 132 insertions(+), 116 deletions(-) diff --git a/src/aki_prj23_transparenzregister/apps/find_missing_companies.py b/src/aki_prj23_transparenzregister/apps/find_missing_companies.py index f8a7b5b..3f0f941 100644 --- a/src/aki_prj23_transparenzregister/apps/find_missing_companies.py +++ b/src/aki_prj23_transparenzregister/apps/find_missing_companies.py @@ -5,6 +5,7 @@ import glob import argparse import tempfile import dataclasses +import multiprocessing import pandas as pd from tqdm import tqdm from pathlib import Path @@ -21,11 +22,76 @@ from aki_prj23_transparenzregister.utils.logger_config import ( from aki_prj23_transparenzregister.utils.sql import connector from aki_prj23_transparenzregister.utils.sql import entities +from aki_prj23_transparenzregister.utils.mongo.connector import MongoConnector +from aki_prj23_transparenzregister.utils.mongo.company_mongo_service import ( + CompanyMongoService, +) + from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister import ( extract, load, ) -from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform import main as transform +from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform import ( + main as transform, +) + +def work(company: entities.Company, configProvider) -> None: + with tempfile.TemporaryDirectory() as tmp_dir: + xml_dir = os.path.join(*[tmp_dir, "xml"]) + os.makedirs(xml_dir, exist_ok=True) + try: + extract.scrape(company.name, xml_dir, True) + except Exception as e: + logger.error(e) + return + output_path = os.path.join(*[tmp_dir, "transformed"]) + os.makedirs(output_path, exist_ok=True) + json_dir = os.path.join(*[tmp_dir, "json"]) + os.makedirs(json_dir, exist_ok=True) + transform.transform_xml_to_json( + xml_dir, + json_dir, + ) + + for file in tqdm(glob.glob1(json_dir, "*.json")): + try: + path = os.path.join(json_dir, file) + with open(path, encoding="utf-8") as file_object: + company_mapped = transform.map_unternehmensregister_json( + json.loads(file_object.read()) + ) + + name = "".join(e for e in company_mapped.name if e.isalnum())[:50] + + with open( + os.path.join(output_path, f"{name}.json"), + "w+", + encoding="utf-8", + ) as export_file: + json.dump( + dataclasses.asdict(company_mapped), export_file, ensure_ascii=False + ) + except Exception as e: + logger.error(e) + return + mongoConnector = MongoConnector(configProvider.get_mongo_connection_string()) + companyMongoService = CompanyMongoService( + mongoConnector + ) + num_processed = load.load_directory_to_mongo(output_path, companyMongoService) + mongoConnector.client.close() + + try: + if num_processed > 0: + with connector.get_session(configProvider) as session: + company = session.query(entities.MissingCompany).where(entities.MissingCompany.name == company.name).first() + company.searched_for = True + session.commit() + print(f"Processed {company.name}") + except Exception as e: + logger.error(e) + return + if __name__ == "__main__": parser = argparse.ArgumentParser( @@ -43,44 +109,29 @@ if __name__ == "__main__": parsed = parser.parse_args(sys.argv[1:]) configer_logger(namespace=parsed) config = parsed.config - # session = connector.get_session(get_config_provider(config)) - # missing_companies = session.query(entities.MissingCompany).all() + configProvider = get_config_provider(config) + session = connector.get_session(configProvider) - counter = 0 - # # Scrape data from unternehmensregister - # for company in missing_companies: - # print(company.name) - # extract.scrape(company.name, ["tmp", "xml"]) - - # Transform input - output_path = os.path.join(str(Path.cwd()), *["tmp", "transformed"]) - xml_dir = os.path.join(str(Path.cwd()), *["tmp", "xml"]) - json_dir = os.path.join(str(Path.cwd()), *["tmp", "json"]) - transform.transform_xml_to_json( - os.path.join(xml_dir), - os.path.join(json_dir), + companyMongoService = CompanyMongoService( + MongoConnector(configProvider.get_mongo_connection_string()) ) - for file in tqdm(glob.glob1(json_dir, "*.json")): - path = os.path.join(json_dir, file) - with open(path, encoding="utf-8") as file_object: - # try: - print(path) - company = transform.map_unternehmensregister_json( - json.loads(file_object.read()) - ) - name = "".join(e for e in company.name if e.isalnum())[:50] + missing_companies = session.query(entities.MissingCompany).where(entities.MissingCompany.searched_for == False).all() - with open( - f"{output_path}/{name}.json", - "w+", - encoding="utf-8", - ) as export_file: - json.dump( - dataclasses.asdict(company), export_file, ensure_ascii=False - ) - # except Exception as e: - # logger.error(e.with_traceback()) - # logger.error(e) - # logger.error(f"Error in processing {path}") - # sys.exit(1) \ No newline at end of file + batch_size = 5 + pool = multiprocessing.Pool(processes=batch_size) + # Scrape data from unternehmensregister + params = [ + (company, configProvider) + for company in missing_companies + ] + # Map the process_handler function to the parameter list using the Pool + pool.starmap(work, params) + + # Close the Pool to prevent any more tasks from being submitted + pool.close() + + # Wait for all the processes to complete + pool.join() + # for company in tqdm(missing_companies): + diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/extract.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/extract.py index efff716..1343566 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/extract.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/extract.py @@ -13,15 +13,15 @@ from selenium.webdriver.support.ui import WebDriverWait from tqdm import tqdm -def scrape(query: str, download_dir: list[str]) -> None: +def scrape(query: str, download_dir: str, full_match: bool = False) -> None: """Fetch results from Unternehmensregister for given query. Args: query (str): Search Query (RegEx supported) download_dir (list[str]): Directory to place output files in """ - download_path = os.path.join(str(Path.cwd()), *download_dir) - print(download_path) + # download_path = os.path.join(str(Path.cwd()), *download_dir) + download_path = download_dir options = webdriver.ChromeOptions() preferences = { "profile.default_content_settings.popups": 0, @@ -33,8 +33,9 @@ def scrape(query: str, download_dir: list[str]) -> None: "default_directory": download_path, }, } - # options.add_argument("--headless=new") + options.add_argument("--headless=new") options.add_experimental_option("prefs", preferences) + options.add_experimental_option("excludeSwitches", ["enable-logging"]) driver = webdriver.Chrome(options=options) @@ -74,7 +75,7 @@ def scrape(query: str, download_dir: list[str]) -> None: ] for index, company_link in enumerate(companies_tab): company_name = company_names[index] - if company_name in processed_companies: + if company_name in processed_companies or (full_match == True and company_name != query): continue # Go to intermediary page company_link.click() @@ -121,6 +122,8 @@ def scrape(query: str, download_dir: list[str]) -> None: finally: for _ in range(6): driver.back() + if company_name == query and full_match == True: + break driver.find_element(By.XPATH, '//*[@class="fas fa-angle-right"]').click() driver.close() diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/load.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/load.py index 621b723..4f58bf4 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/load.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/load.py @@ -14,17 +14,22 @@ from aki_prj23_transparenzregister.utils.mongo.connector import ( MongoConnector, ) +def load_directory_to_mongo(base_path: str, service: CompanyMongoService) -> int: + num_processed = 0 + for file in tqdm(glob.glob1(base_path, "*.json")): + path = os.path.join(base_path, file) + with open(path, encoding="utf-8") as file_object: + data = json.loads(file_object.read()) + company: Company = Company(**data) + + service.migrations_of_base_data(company) + num_processed += 1 + return num_processed + if __name__ == "__main__": provider = JsonFileConfigProvider("secrets.json") conn_string = provider.get_mongo_connection_string() connector = MongoConnector(conn_string) service = CompanyMongoService(connector) - base_path = "./Jupyter/API-tests/Unternehmensregister/data/Unternehmensregister" - for file in tqdm(glob.glob1(f"{base_path}/transformed", "*.json")): - path = os.path.join(f"{base_path}/transformed", file) - with open(path, encoding="utf-8") as file_object: - data = json.loads(file_object.read()) - company: Company = Company(**data) - - service.migrations_of_base_data(company) + load_directory_to_mongo("./Jupyter/API-tests/Unternehmensregister/data/Unternehmensregister/transformed", service) \ No newline at end of file diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/main.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/main.py index 717c4d1..9025029 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/main.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/main.py @@ -8,6 +8,7 @@ import sys import xmltodict from tqdm import tqdm +from loguru import logger from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1 import v1 from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v3 import v3 @@ -26,12 +27,14 @@ def transform_xml_to_json(source_dir: str, target_dir: str) -> None: target_path = os.path.join( target_dir, source_path.split(os.sep)[-1].replace(".xml", ".json") ) - - with open(source_path, encoding="utf-8") as source_file: - # deepcode ignore HandleUnicode: Weird XML format no other solution - data = xmltodict.parse(source_file.read().encode()) - with open(target_path, "w", encoding="utf-8") as json_file: - json_file.write(json.dumps(data)) + try: + with open(source_path, encoding="utf-8") as source_file: + # deepcode ignore HandleUnicode: Weird XML format no other solution + data = xmltodict.parse(source_file.read().encode()) + with open(target_path, "w", encoding="utf-8") as json_file: + json_file.write(json.dumps(data)) + except Exception as e: + logger.error(e) def determine_version(data: dict): if "XJustiz_Daten" in data: diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v1/v1.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v1/v1.py index 95405cb..92164fa 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v1/v1.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v1/v1.py @@ -539,31 +539,3 @@ def map_unternehmensregister_json(data: dict) -> Company: result["relationships"].append(people) result = map_co_relation(result) return Company(**result) - - -if __name__ == "__main__": - from loguru import logger - - base_path = "./Jupyter/API-tests/Unternehmensregister/data/Unternehmensregister" - for file in tqdm(glob.glob1(f"{base_path}/export", "*.json")): - path = os.path.join(f"{base_path}/export", file) - with open(path, encoding="utf-8") as file_object: - try: - company: Company = map_unternehmensregister_json( - json.loads(file_object.read()) - ) - - name = "".join(e for e in company.name if e.isalnum())[:50] - - with open( - f"{base_path}/transformed/{name}.json", - "w+", - encoding="utf-8", - ) as export_file: - json.dump( - dataclasses.asdict(company), export_file, ensure_ascii=False - ) - except Exception as e: - logger.error(e) - logger.error(f"Error in processing {path}") - sys.exit(1) \ No newline at end of file diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/v3.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/v3.py index 61c9371..f66dc69 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/v3.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/v3.py @@ -385,6 +385,8 @@ def map_capital(data: dict, company_type: CompanyTypeEnum) -> Capital | None: ]: return None # Catch entries having the dict but with null values + if isinstance(capital, list): + capital = capital[0] if not all(capital.values()): return None return Capital( @@ -465,9 +467,10 @@ def map_founding_date(data: dict) -> str | None: "tns:fachdatenRegister", "tns:basisdatenRegister", "tns:satzungsdatum", - "tns:aktuellesSatzungsdatum", ] - return traversal(data, path) + base = traversal(data, path) + if "tns:aktuellesSatzungsdatum" in base: + return base["tns:aktuellesSatzungsdatum"] # No reliable answer return None @@ -620,31 +623,3 @@ def map_unternehmensregister_json(data: dict) -> Company: result["relationships"].append(people) result = map_co_relation(result) return Company(**result) - - -if __name__ == "__main__": - from loguru import logger - - base_path = "./Jupyter/API-tests/Unternehmensregister/data/Unternehmensregister" - for file in tqdm(glob.glob1(f"{base_path}/export", "*.json")): - path = os.path.join(f"{base_path}/export", file) - with open(path, encoding="utf-8") as file_object: - try: - company: Company = map_unternehmensregister_json( - json.loads(file_object.read()) - ) - - name = "".join(e for e in company.name if e.isalnum())[:50] - - with open( - f"{base_path}/transformed/{name}.json", - "w+", - encoding="utf-8", - ) as export_file: - json.dump( - dataclasses.asdict(company), export_file, ensure_ascii=False - ) - except Exception as e: - logger.error(e) - logger.error(f"Error in processing {path}") - sys.exit(1) diff --git a/src/aki_prj23_transparenzregister/utils/mongo/company_mongo_service.py b/src/aki_prj23_transparenzregister/utils/mongo/company_mongo_service.py index d175be2..51c1309 100644 --- a/src/aki_prj23_transparenzregister/utils/mongo/company_mongo_service.py +++ b/src/aki_prj23_transparenzregister/utils/mongo/company_mongo_service.py @@ -45,7 +45,6 @@ class CompanyMongoService: query = { "id.hr_number": id["hr_number"], "id.district_court.name": id["district_court"]["name"], - "id.district_court.city": id["district_court"]["city"], } with self.lock: result = list(self.collection.find(query)) diff --git a/tmp/transformation.ipynb b/tmp/transformation.ipynb index 62b0277..cd06c34 100644 --- a/tmp/transformation.ipynb +++ b/tmp/transformation.ipynb @@ -1099,6 +1099,14 @@ "execution_count": 19, "metadata": {}, "output_type": "execute_result" + }, + { + "ename": "", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click here for more info. View Jupyter log for further details." + ] } ], "source": [ From d6b07431e717df7589d43ad9fc42c5fa0742faeb Mon Sep 17 00:00:00 2001 From: TrisNol Date: Fri, 3 Nov 2023 23:26:08 +0100 Subject: [PATCH 07/14] test: Adapt existing unit tests to refactored imports --- .../apps/find_missing_companies.py | 122 ++++++++++-------- .../unternehmensregister/extract.py | 20 ++- tests/apps/find_missing_companies_test.py | 6 + .../unternehmensregister/extract_test.py | 2 +- .../transform/main_test.py | 24 ++++ .../v1_test.py} | 43 ++---- 6 files changed, 123 insertions(+), 94 deletions(-) create mode 100644 tests/apps/find_missing_companies_test.py create mode 100644 tests/utils/data_extraction/unternehmensregister/transform/main_test.py rename tests/utils/data_extraction/unternehmensregister/{transform_test.py => transform/v1_test.py} (95%) diff --git a/src/aki_prj23_transparenzregister/apps/find_missing_companies.py b/src/aki_prj23_transparenzregister/apps/find_missing_companies.py index 3f0f941..4cccaa8 100644 --- a/src/aki_prj23_transparenzregister/apps/find_missing_companies.py +++ b/src/aki_prj23_transparenzregister/apps/find_missing_companies.py @@ -1,32 +1,20 @@ +"""Retrieve missing companies from unternehmensregister.""" +import argparse +import dataclasses +import glob +import json +import multiprocessing import os import sys -import json -import glob -import argparse import tempfile -import dataclasses -import multiprocessing -import pandas as pd -from tqdm import tqdm -from pathlib import Path + from loguru import logger +from tqdm import tqdm + from aki_prj23_transparenzregister.config.config_providers import ( - HELP_TEXT_CONFIG, + ConfigProvider, get_config_provider, ) -from aki_prj23_transparenzregister.utils.logger_config import ( - add_logger_options_to_argparse, - configer_logger, -) - -from aki_prj23_transparenzregister.utils.sql import connector -from aki_prj23_transparenzregister.utils.sql import entities - -from aki_prj23_transparenzregister.utils.mongo.connector import MongoConnector -from aki_prj23_transparenzregister.utils.mongo.company_mongo_service import ( - CompanyMongoService, -) - from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister import ( extract, load, @@ -34,13 +22,29 @@ from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister im from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform import ( main as transform, ) +from aki_prj23_transparenzregister.utils.logger_config import ( + add_logger_options_to_argparse, + configer_logger, +) +from aki_prj23_transparenzregister.utils.mongo.company_mongo_service import ( + CompanyMongoService, +) +from aki_prj23_transparenzregister.utils.mongo.connector import MongoConnector +from aki_prj23_transparenzregister.utils.sql import connector, entities -def work(company: entities.Company, configProvider) -> None: + +def work(company: entities.Company, config_provider: ConfigProvider) -> None: + """Main method. + + Args: + company (entities.Company): Company to be searched for + config_provider (ConfigProvider): ConfigProvider + """ with tempfile.TemporaryDirectory() as tmp_dir: xml_dir = os.path.join(*[tmp_dir, "xml"]) os.makedirs(xml_dir, exist_ok=True) try: - extract.scrape(company.name, xml_dir, True) + extract.scrape(company.name, xml_dir, True, True) # type: ignore except Exception as e: logger.error(e) return @@ -57,37 +61,41 @@ def work(company: entities.Company, configProvider) -> None: try: path = os.path.join(json_dir, file) with open(path, encoding="utf-8") as file_object: - company_mapped = transform.map_unternehmensregister_json( - json.loads(file_object.read()) + company_mapped = transform.map_unternehmensregister_json( + json.loads(file_object.read()) + ) + + name = "".join(e for e in company_mapped.name if e.isalnum())[:50] + + with open( + os.path.join(output_path, f"{name}.json"), + "w+", + encoding="utf-8", + ) as export_file: + json.dump( + dataclasses.asdict(company_mapped), + export_file, + ensure_ascii=False, ) - - name = "".join(e for e in company_mapped.name if e.isalnum())[:50] - - with open( - os.path.join(output_path, f"{name}.json"), - "w+", - encoding="utf-8", - ) as export_file: - json.dump( - dataclasses.asdict(company_mapped), export_file, ensure_ascii=False - ) except Exception as e: logger.error(e) return - mongoConnector = MongoConnector(configProvider.get_mongo_connection_string()) - companyMongoService = CompanyMongoService( - mongoConnector - ) - num_processed = load.load_directory_to_mongo(output_path, companyMongoService) - mongoConnector.client.close() + mongo_connector = MongoConnector(config_provider.get_mongo_connection_string()) + company_mongo_service = CompanyMongoService(mongo_connector) + num_processed = load.load_directory_to_mongo(output_path, company_mongo_service) + mongo_connector.client.close() try: if num_processed > 0: - with connector.get_session(configProvider) as session: - company = session.query(entities.MissingCompany).where(entities.MissingCompany.name == company.name).first() - company.searched_for = True + with connector.get_session(config_provider) as session: + company = ( + session.query(entities.MissingCompany) # type: ignore + .where(entities.MissingCompany.name == company.name) + .first() + ) + company.searched_for = True # type: ignore session.commit() - print(f"Processed {company.name}") + logger.info(f"Processed {company.name}") except Exception as e: logger.error(e) return @@ -109,22 +117,23 @@ if __name__ == "__main__": parsed = parser.parse_args(sys.argv[1:]) configer_logger(namespace=parsed) config = parsed.config - configProvider = get_config_provider(config) - session = connector.get_session(configProvider) + config_provider = get_config_provider(config) + session = connector.get_session(config_provider) - companyMongoService = CompanyMongoService( - MongoConnector(configProvider.get_mongo_connection_string()) + company_mongo_service = CompanyMongoService( + MongoConnector(config_provider.get_mongo_connection_string()) ) - missing_companies = session.query(entities.MissingCompany).where(entities.MissingCompany.searched_for == False).all() + missing_companies = ( + session.query(entities.MissingCompany) + .where(entities.MissingCompany.searched_for is False) + .all() + ) batch_size = 5 pool = multiprocessing.Pool(processes=batch_size) # Scrape data from unternehmensregister - params = [ - (company, configProvider) - for company in missing_companies - ] + params = [(company, config_provider) for company in missing_companies] # Map the process_handler function to the parameter list using the Pool pool.starmap(work, params) @@ -134,4 +143,3 @@ if __name__ == "__main__": # Wait for all the processes to complete pool.join() # for company in tqdm(missing_companies): - diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/extract.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/extract.py index 1343566..fb946d3 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/extract.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/extract.py @@ -3,7 +3,6 @@ import glob import multiprocessing import os -from pathlib import Path from loguru import logger from selenium import webdriver @@ -13,12 +12,19 @@ from selenium.webdriver.support.ui import WebDriverWait from tqdm import tqdm -def scrape(query: str, download_dir: str, full_match: bool = False) -> None: +def scrape( + query: str, + download_dir: str, + full_match: bool = False, + early_stopping: bool = False, +) -> None: """Fetch results from Unternehmensregister for given query. Args: query (str): Search Query (RegEx supported) download_dir (list[str]): Directory to place output files in + full_match (bool, optional): Only scrape first result. Defaults to False. + early_stopping (bool, optional): Stop scraping after first page. Defaults to False. """ # download_path = os.path.join(str(Path.cwd()), *download_dir) download_path = download_dir @@ -75,7 +81,9 @@ def scrape(query: str, download_dir: str, full_match: bool = False) -> None: ] for index, company_link in enumerate(companies_tab): company_name = company_names[index] - if company_name in processed_companies or (full_match == True and company_name != query): + if company_name in processed_companies or ( + full_match is True and company_name != query + ): continue # Go to intermediary page company_link.click() @@ -122,8 +130,10 @@ def scrape(query: str, download_dir: str, full_match: bool = False) -> None: finally: for _ in range(6): driver.back() - if company_name == query and full_match == True: - break + if company_name == query and full_match is True: + break # noqa: B012 + if early_stopping is True: + break driver.find_element(By.XPATH, '//*[@class="fas fa-angle-right"]').click() driver.close() diff --git a/tests/apps/find_missing_companies_test.py b/tests/apps/find_missing_companies_test.py new file mode 100644 index 0000000..8146e51 --- /dev/null +++ b/tests/apps/find_missing_companies_test.py @@ -0,0 +1,6 @@ +"""Testing find_missing_companies.py.""" +from aki_prj23_transparenzregister.apps import find_missing_companies + + +def test_import_find_missing_companies() -> None: + assert find_missing_companies diff --git a/tests/utils/data_extraction/unternehmensregister/extract_test.py b/tests/utils/data_extraction/unternehmensregister/extract_test.py index ca2fca5..0e13273 100644 --- a/tests/utils/data_extraction/unternehmensregister/extract_test.py +++ b/tests/utils/data_extraction/unternehmensregister/extract_test.py @@ -86,4 +86,4 @@ def test_wait_for_download_condition() -> None: def test_scrape() -> None: with TemporaryDirectory(dir="./") as temp_dir: - extract.scrape("GEA Farm Technologies GmbH", [temp_dir]) + extract.scrape("GEA Farm Technologies GmbH", temp_dir) diff --git a/tests/utils/data_extraction/unternehmensregister/transform/main_test.py b/tests/utils/data_extraction/unternehmensregister/transform/main_test.py new file mode 100644 index 0000000..a9c4799 --- /dev/null +++ b/tests/utils/data_extraction/unternehmensregister/transform/main_test.py @@ -0,0 +1,24 @@ +"""Testing main.py.""" +import json +import os +from tempfile import TemporaryDirectory + +from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform import ( + main, +) + + +def test_transform_xml_to_json() -> None: + with TemporaryDirectory(dir="./") as temp_source_dir: + with open(os.path.join(temp_source_dir, "test.xml"), "w") as file: + xml_input = """ + + Hello World! + + """ + file.write(xml_input) + with TemporaryDirectory(dir="./") as temp_target_dir: + main.transform_xml_to_json(temp_source_dir, temp_target_dir) + with open(os.path.join(temp_target_dir, "test.json")) as file: + json_output = json.load(file) + assert json_output == {"test": {"message": "Hello World!"}} diff --git a/tests/utils/data_extraction/unternehmensregister/transform_test.py b/tests/utils/data_extraction/unternehmensregister/transform/v1_test.py similarity index 95% rename from tests/utils/data_extraction/unternehmensregister/transform_test.py rename to tests/utils/data_extraction/unternehmensregister/transform/v1_test.py index 08f24a7..47c525c 100644 --- a/tests/utils/data_extraction/unternehmensregister/transform_test.py +++ b/tests/utils/data_extraction/unternehmensregister/transform/v1_test.py @@ -1,7 +1,4 @@ """Testing utils/data_extraction/unternehmensregister/transform.py.""" -import json -import os -from tempfile import TemporaryDirectory from unittest.mock import Mock, patch import pytest @@ -21,27 +18,11 @@ from aki_prj23_transparenzregister.models.company import ( PersonToCompanyRelationship, RelationshipRoleEnum, ) -from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister import ( - transform, +from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1 import ( + v1 as transform, ) -def test_transform_xml_to_json() -> None: - with TemporaryDirectory(dir="./") as temp_source_dir: - with open(os.path.join(temp_source_dir, "test.xml"), "w") as file: - xml_input = """ - - Hello World! - - """ - file.write(xml_input) - with TemporaryDirectory(dir="./") as temp_target_dir: - transform.transform_xml_to_json(temp_source_dir, temp_target_dir) - with open(os.path.join(temp_target_dir, "test.json")) as file: - json_output = json.load(file) - assert json_output == {"test": {"message": "Hello World!"}} - - def test_parse_stakeholder_org_hidden_in_person() -> None: data = { "Beteiligter": { @@ -787,34 +768,34 @@ def test_map_co_relation(value: dict, expected_result: dict) -> None: @patch( - "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.map_co_relation" + "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.map_co_relation" ) @patch( - "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.map_company_id" + "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.map_company_id" ) @patch( - "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.name_from_beteiligung" + "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.name_from_beteiligung" ) @patch( - "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.loc_from_beteiligung" + "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.loc_from_beteiligung" ) @patch( - "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.map_last_update" + "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.map_last_update" ) @patch( - "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.map_rechtsform" + "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.map_rechtsform" ) @patch( - "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.map_capital" + "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.map_capital" ) @patch( - "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.map_business_purpose" + "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.map_business_purpose" ) @patch( - "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.map_founding_date" + "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.map_founding_date" ) @patch( - "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.parse_stakeholder" + "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.parse_stakeholder" ) def test_map_unternehmensregister_json( # noqa: PLR0913 mock_map_parse_stakeholder: Mock, From 1121f2605297b49d5189046b30309171fb1980d3 Mon Sep 17 00:00:00 2001 From: TrisNol Date: Sat, 4 Nov 2023 09:12:04 +0100 Subject: [PATCH 08/14] refactor: Apply linter feedback --- .../unternehmensregister/load.py | 16 +- .../transform/__init__.py | 1 + .../unternehmensregister/transform/main.py | 33 +- .../unternehmensregister/transform/v1/v1.py | 8 - .../transform/v3/__init__.py | 1 + .../transform/v3/role_mapper.py | 44 +- .../unternehmensregister/transform/v3/v3.py | 92 +- tmp/transformation.ipynb | 1215 ----------------- 8 files changed, 142 insertions(+), 1268 deletions(-) delete mode 100644 tmp/transformation.ipynb diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/load.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/load.py index 4f58bf4..f4f317e 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/load.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/load.py @@ -14,7 +14,17 @@ from aki_prj23_transparenzregister.utils.mongo.connector import ( MongoConnector, ) + def load_directory_to_mongo(base_path: str, service: CompanyMongoService) -> int: + """Load all json files in a directory to MongoDB company collection. + + Args: + base_path (str): Directory to scan + service (CompanyMongoService): MongoDB service + + Returns: + int: Number of processed files + """ num_processed = 0 for file in tqdm(glob.glob1(base_path, "*.json")): path = os.path.join(base_path, file) @@ -26,10 +36,14 @@ def load_directory_to_mongo(base_path: str, service: CompanyMongoService) -> int num_processed += 1 return num_processed + if __name__ == "__main__": provider = JsonFileConfigProvider("secrets.json") conn_string = provider.get_mongo_connection_string() connector = MongoConnector(conn_string) service = CompanyMongoService(connector) - load_directory_to_mongo("./Jupyter/API-tests/Unternehmensregister/data/Unternehmensregister/transformed", service) \ No newline at end of file + load_directory_to_mongo( + "./Jupyter/API-tests/Unternehmensregister/data/Unternehmensregister/transformed", + service, + ) diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/__init__.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/__init__.py index e69de29..e6ede79 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/__init__.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/__init__.py @@ -0,0 +1 @@ +"""Transform Unternehmensregister data to Transparenzregister API.""" diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/main.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/main.py index 9025029..452e620 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/main.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/main.py @@ -3,16 +3,21 @@ import dataclasses import glob import json import os -import re import sys +import typing import xmltodict -from tqdm import tqdm from loguru import logger +from tqdm import tqdm -from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1 import v1 -from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v3 import v3 from aki_prj23_transparenzregister.models.company import Company +from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1 import ( + v1, +) +from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v3 import ( + v3, +) + def transform_xml_to_json(source_dir: str, target_dir: str) -> None: """Convert all xml files in a directory to json files. @@ -36,13 +41,27 @@ def transform_xml_to_json(source_dir: str, target_dir: str) -> None: except Exception as e: logger.error(e) -def determine_version(data: dict): + +def determine_version(data: dict) -> typing.Any: + """Determine Unternehmensregister data API version of given entry. + + Args: + data (dict): Unternehmensregister data + + Raises: + ValueError: If version could not be determined + + Returns: + module: Version module + """ if "XJustiz_Daten" in data: + # TODO consider class inheritance for version modules return v1 - elif "tns:nachrichtenkopf" in data[list(data.keys())[0]]: + if "tns:nachrichtenkopf" in data[list(data.keys())[0]]: return v3 raise ValueError("Could not determine Unternehmensregister version.") + def map_unternehmensregister_json(data: dict) -> Company: """Processes the Unternehmensregister structured export to a Company by using several helper methods. @@ -57,8 +76,6 @@ def map_unternehmensregister_json(data: dict) -> Company: if __name__ == "__main__": - from loguru import logger - base_path = "./Jupyter/API-tests/Unternehmensregister/data/Unternehmensregister" for file in tqdm(glob.glob1(f"{base_path}/export", "*.json")): path = os.path.join(f"{base_path}/export", file) diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v1/v1.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v1/v1.py index 92164fa..d9e8868 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v1/v1.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v1/v1.py @@ -1,13 +1,5 @@ """Transform raw Unternehmensregister export (*.xml) to processed .json files for loading.""" -import dataclasses -import glob -import json -import os import re -import sys - -import xmltodict -from tqdm import tqdm from aki_prj23_transparenzregister.models.company import ( Capital, diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/__init__.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/__init__.py index e69de29..dcfe19d 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/__init__.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/__init__.py @@ -0,0 +1 @@ +"""Transforms data from the Unternehmensregister v3 API to the data model of the Transparenzregister API.""" diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/role_mapper.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/role_mapper.py index 81a24a8..586ccf9 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/role_mapper.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/role_mapper.py @@ -1,34 +1,60 @@ +"""RoleMapper for Unternehmensregister v3 API.""" import os +from pathlib import Path + import xmltodict -from pathlib import Path from aki_prj23_transparenzregister.models.company import RelationshipRoleEnum class RoleMapper: + """RoleMapper for Unternehmensregister v3 API.""" + singleton = None - def __init__(self): - # TODO Automated file retrieval + + def __init__(self) -> None: + """Initialize RoleMapper by ingesting XSD schema file.""" + # TODO Automated file retrieval base_path = os.path.dirname(Path(__file__)) - path = os.path.join(base_path, "assets", "xjustiz_0040_cl_rollenbezeichnung_3_3.xsd") + path = os.path.join( + base_path, "assets", "xjustiz_0040_cl_rollenbezeichnung_3_3.xsd" + ) with open(path, encoding="utf-8") as file: content = file.read() data = xmltodict.parse(content) mapping = {} - for entry in data["xs:schema"]["xs:simpleType"]["xs:restriction"]["xs:enumeration"]: - mapping[entry['@value']] = entry['xs:annotation']['xs:appinfo']['wert'] + for entry in data["xs:schema"]["xs:simpleType"]["xs:restriction"][ + "xs:enumeration" + ]: + mapping[entry["@value"]] = entry["xs:annotation"]["xs:appinfo"]["wert"] self.dictionary = mapping @staticmethod - def mapper(): + def mapper() -> "RoleMapper": + """Singleton getter for RoleMapper. + + Returns: + RoleMapper: Singleton instance + """ if RoleMapper.singleton is None: RoleMapper.singleton = RoleMapper() return RoleMapper.singleton def get(self, key: str) -> RelationshipRoleEnum: + """Get mapped value for given key. + + Args: + key (str): Key to map + + Returns: + RelationshipRoleEnum: Mapped value + """ return RelationshipRoleEnum(self.dictionary[key]) -if __name__ == '__main__': + +if __name__ == "__main__": + from loguru import logger + mapper = RoleMapper() - print(mapper.get("201")) \ No newline at end of file + logger.info(f"Mapped value for role 201 - {mapper.get('201')}") diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/v3.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/v3.py index f66dc69..cdc9981 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/v3.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/v3.py @@ -1,13 +1,8 @@ """Transform raw Unternehmensregister export (*.xml) to processed .json files for loading.""" -import dataclasses -import glob -import json -import os -import re -import sys -import xmltodict -from tqdm import tqdm +import re +import typing +from collections.abc import Sequence from aki_prj23_transparenzregister.models.company import ( Capital, @@ -25,15 +20,14 @@ from aki_prj23_transparenzregister.models.company import ( PersonToCompanyRelationship, RelationshipRoleEnum, ) +from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v3.role_mapper import ( + RoleMapper, +) from aki_prj23_transparenzregister.utils.string_tools import ( remove_traling_and_leading_quotes, transform_date_to_iso, ) -from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v3.role_mapper import ( - RoleMapper, -) - def parse_date_of_birth(data: dict) -> str | None: """Retreives the date of birth from a stakeholder entry if possible. @@ -56,6 +50,14 @@ def parse_date_of_birth(data: dict) -> str | None: def map_role_id_to_enum(role_id: str) -> RelationshipRoleEnum: + """Map Unternehmensregister role ID to RelationshipRoleEnum. + + Args: + role_id (str): Unternehmensregister role ID + + Returns: + RelationshipRoleEnum: Role enum + """ mapper = RoleMapper.mapper() return mapper.get(role_id) @@ -229,10 +231,7 @@ def loc_from_beteiligung(data: dict) -> Location: # "tns:anschrift", ] base = traversal(data, base_path) - if "tns:anschrift" in base: - base = base["tns:anschrift"] - else: - base = base["tns:sitz"] + base = base["tns:anschrift"] if "tns:anschrift" in base else base["tns:sitz"] if isinstance(base, list): base = base[0] @@ -318,7 +317,9 @@ def map_rechtsform(company_name: str, data: dict) -> CompanyTypeEnum | None: return None -def map_capital(data: dict, company_type: CompanyTypeEnum) -> Capital | None: +def map_capital( # noqa: PLR0912 + data: dict, company_type: CompanyTypeEnum +) -> Capital | None: """Extracts the company capital from the given Unternehmensregister export. Args: @@ -332,7 +333,11 @@ def map_capital(data: dict, company_type: CompanyTypeEnum) -> Capital | None: if "tns:auswahl_zusatzangaben" not in data["tns:fachdatenRegister"]: return None capital: dict = {"tns:zahl": 0.0, "tns:waehrung": {"code": None}} - if company_type == CompanyTypeEnum.KG and "tns:personengesellschaft" in data["tns:fachdatenRegister"]["tns:auswahl_zusatzangaben"]: + if ( + company_type == CompanyTypeEnum.KG + and "tns:personengesellschaft" + in data["tns:fachdatenRegister"]["tns:auswahl_zusatzangaben"] + ): capital_type = "Hafteinlage" base = data["tns:fachdatenRegister"]["tns:auswahl_zusatzangaben"][ "tns:personengesellschaft" @@ -475,17 +480,40 @@ def map_founding_date(data: dict) -> str | None: return None -def traversal(data: dict, path: list[str | int]) -> any: +def traversal(data: dict, path: Sequence[str | int | object]) -> typing.Any: + """Traverse a dict using list of keys. + + Args: + data (dict): Data export + path (Sequence[str | int | object]): List of keys + + Raises: + KeyError: If key not found + + Returns: + any: Value at the end of the path + """ current = data for key in path: try: current = current[key] - except: - raise KeyError(f"Key {key} not found") + except KeyError as e: + raise KeyError(f"Key {key} not found") from e return current def map_hr_number(data: dict) -> str: + """Extract the HR number from a given Unternehmensregister export. + + Args: + data (dict): Data export + + Raises: + KeyError: If key not found + + Returns: + str: HR number + """ base = data["tns:grunddaten"]["tns:verfahrensdaten"]["tns:instanzdaten"][ "tns:aktenzeichen" ]["tns:auswahl_aktenzeichen"] @@ -493,12 +521,20 @@ def map_hr_number(data: dict) -> str: hr_prefix = base["tns:aktenzeichen.strukturiert"]["tns:register"]["code"] hr_number = base["tns:aktenzeichen.strukturiert"]["tns:laufendeNummer"] return f"{hr_prefix} {hr_number}" - elif "tns:aktenzeichen.freitext" in base: + if "tns:aktenzeichen.freitext" in base: return base["tns:aktenzeichen.freitext"] - return hr_full + raise KeyError("Could not find HR number") def map_district_court(data: dict) -> DistrictCourt: + """Extract the district court from a given Unternehmensregister export. + + Args: + data (dict): Data export + + Returns: + DistrictCourt: District court + """ base_path = [ "tns:grunddaten", "tns:verfahrensdaten", @@ -525,11 +561,13 @@ def map_company_id(data: dict) -> CompanyID: CompanyID: ID of the company """ try: - return CompanyID( - **{"hr_number": map_hr_number(data), "district_court": map_district_court(data)} - ) + return CompanyID(map_hr_number(data), map_district_court(data)) # type: ignore except KeyError: - hr_number = data["tns:grunddaten"]["tns:verfahrensdaten"]["tns:beteiligung"][0]["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:organisation"]["tns:registereintragung"]["tns:registernummer"] + hr_number = data["tns:grunddaten"]["tns:verfahrensdaten"]["tns:beteiligung"][0][ + "tns:beteiligter" + ]["tns:auswahl_beteiligter"]["tns:organisation"]["tns:registereintragung"][ + "tns:registernummer" + ] district_court = map_district_court(data) return CompanyID(hr_number=hr_number, district_court=district_court) diff --git a/tmp/transformation.ipynb b/tmp/transformation.ipynb deleted file mode 100644 index cd06c34..0000000 --- a/tmp/transformation.ipynb +++ /dev/null @@ -1,1215 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "id: 001\n", - "value: Abwesenheitspfleger(in)\n", - "id: 002\n", - "value: Aliasidentität\n", - "id: 003\n", - "value: Angehörige(r)\n", - "id: 004\n", - "value: Angeklagte(r)\n", - "id: 005\n", - "value: Angeschuldigte(r)\n", - "id: 006\n", - "value: Annehmende(r)\n", - "id: 007\n", - "value: Anschlussberufungsbeklagte(r)\n", - "id: 008\n", - "value: Anschlussberufungskläger(in)\n", - "id: 009\n", - "value: Anschlussbeschwerdeführer(in)\n", - "id: 010\n", - "value: Anschlussbeschwerdegegner(in)\n", - "id: 011\n", - "value: Anschlussrechtsbeschwerdeführer(in)\n", - "id: 012\n", - "value: Anschlussrechtsbeschwerdegegner(in)\n", - "id: 013\n", - "value: Anschlussrevisionsbeklagte(r)\n", - "id: 014\n", - "value: Anschlussrevisionskläger(in)\n", - "id: 015\n", - "value: Antragsgegner(in)\n", - "id: 016\n", - "value: Antragsteller(in)\n", - "id: 017\n", - "value: Anzeigeerstatter(in)\n", - "id: 018\n", - "value: Anzunehmende(r)\n", - "id: 019\n", - "value: Arrestgläubiger(in)\n", - "id: 020\n", - "value: Arrestschuldner(in)\n", - "id: 021\n", - "value: Aufsichtsbehörde\n", - "id: 022\n", - "value: Ausschlagende(r)\n", - "id: 023\n", - "value: Beamter (Beamtin)\n", - "id: 024\n", - "value: Behörde\n", - "id: 025\n", - "value: Beigeladene(r)\n", - "id: 026\n", - "value: Beistand\n", - "id: 027\n", - "value: Bekannte(r)\n", - "id: 028\n", - "value: Beklagte(r)\n", - "id: 029\n", - "value: Berufungsbeklagte(r)\n", - "id: 030\n", - "value: Berufungskläger(in)\n", - "id: 031\n", - "value: Beschuldigte(r)\n", - "id: 032\n", - "value: Beschwerdeführer(in)\n", - "id: 033\n", - "value: Beschwerdegegner(in)\n", - "id: 034\n", - "value: Besucher(in)\n", - "id: 036\n", - "value: Betreibende(r) Gläubige(r)\n", - "id: 037\n", - "value: Betreuer(in)\n", - "id: 038\n", - "value: Betreute(r)\n", - "id: 039\n", - "value: Betreuungsbehörde\n", - "id: 040\n", - "value: Betroffene(r)\n", - "id: 041\n", - "value: Bevollmächtigte(r)\n", - "id: 042\n", - "value: Bewährungshelfer(in)\n", - "id: 043\n", - "value: Beweisanwalt (-anwältin)\n", - "id: 044\n", - "value: Bruder (Schwester)\n", - "id: 045\n", - "value: Bundeswehrdisziplinaranwalt (-anwältin)\n", - "id: 046\n", - "value: Bußgeldempfänger(in)\n", - "id: 047\n", - "value: Cousin(e)\n", - "id: 048\n", - "value: Dienstvorgesetzte(r)\n", - "id: 049\n", - "value: director\n", - "id: 050\n", - "value: Dolmetscher(in)\n", - "id: 051\n", - "value: Dritte(r)\n", - "id: 052\n", - "value: Drittschuldner(in)\n", - "id: 053\n", - "value: Drittwiderbeklagte(r)\n", - "id: 054\n", - "value: Drittwiderkläger(in)\n", - "id: 056\n", - "value: Ehemann (Ehefrau)\n", - "id: 057\n", - "value: Eigentümer(in)\n", - "id: 058\n", - "value: Eingetragene(r) Lebenspartner(in)\n", - "id: 059\n", - "value: Einleitungsbehörde\n", - "id: 060\n", - "value: Eltern\n", - "id: 061\n", - "value: Elternteil\n", - "id: 062\n", - "value: Enkel(in)\n", - "id: 063\n", - "value: Erbe (Erbin)\n", - "id: 064\n", - "value: Erbe (Erbin) (ausschlagend)\n", - "id: 065\n", - "value: Erbe (Erbin) (vorverstorben)\n", - "id: 066\n", - "value: Erblasser(in)\n", - "id: 067\n", - "value: Ergänzungspfleger(in)\n", - "id: 068\n", - "value: Erinnerungsführer(in)\n", - "id: 069\n", - "value: Erinnerungsgegner(in)\n", - "id: 070\n", - "value: Ersatzbetreuer(in)\n", - "id: 071\n", - "value: Ersteher(in)\n", - "id: 072\n", - "value: Erwerber(in)\n", - "id: 073\n", - "value: Erziehungsberechtigte(r)\n", - "id: 074\n", - "value: Frühere(r) Ehegatte (Ehegattin)\n", - "id: 075\n", - "value: Frühere(r) Beklagte(r)\n", - "id: 076\n", - "value: Frühere(r) Beteiligte(r)\n", - "id: 077\n", - "value: Frühere(r) Gläubiger(in)\n", - "id: 078\n", - "value: Frühere(r) Kläger(in)\n", - "id: 079\n", - "value: Frühere(r) Soldat(in)\n", - "id: 080\n", - "value: Gegenvormund\n", - "id: 081\n", - "value: Generalbundesanwalt (-anwältin)\n", - "id: 082\n", - "value: Gericht\n", - "id: 083\n", - "value: Gerichtsvollzieher(in)\n", - "id: 084\n", - "value: Geschädigte(r)\n", - "id: 085\n", - "value: Geschäftsführende(r) Gesellschafter(in)\n", - "id: 086\n", - "value: Geschäftsführer(in)\n", - "id: 087\n", - "value: Gesetzliche(r) Erbe (Erbin)\n", - "id: 088\n", - "value: Gesetzliche(r) Vertreter(in)\n", - "id: 089\n", - "value: Gläubiger(in)\n", - "id: 090\n", - "value: Großeltern\n", - "id: 092\n", - "value: Großvater (Großmutter)\n", - "id: 093\n", - "value: Hauptbevollmächtigte(r)\n", - "id: 094\n", - "value: Hoferbe (Hoferbin)\n", - "id: 095\n", - "value: Inhaber(in) der Firma\n", - "id: 096\n", - "value: Insolvenzverwalter(in)\n", - "id: 097\n", - "value: Jugendamt\n", - "id: 098\n", - "value: Kammer\n", - "id: 099\n", - "value: Kammermitglied\n", - "id: 100\n", - "value: Kind\n", - "id: 101\n", - "value: Kläger(in)\n", - "id: 102\n", - "value: Kontrollbetreuer(in)\n", - "id: 103\n", - "value: Korrespondenzanwalt (-anwältin)\n", - "id: 104\n", - "value: Kostenschuldner(in)\n", - "id: 105\n", - "value: Landwirtschaftsrichter(in)\n", - "id: 106\n", - "value: Lebenspartner(in)\n", - "id: 107\n", - "value: Liquidator(in)\n", - "id: 108\n", - "value: Minderjährige(r)\n", - "id: 109\n", - "value: Mitvormund\n", - "id: 110\n", - "value: Mündel\n", - "id: 112\n", - "value: Nachbesserungsgläubiger(in)\n", - "id: 113\n", - "value: Nachlasspfleger(in)\n", - "id: 114\n", - "value: Nachlassverwalter(in)\n", - "id: 115\n", - "value: Nebenkläger(in)\n", - "id: 116\n", - "value: Neffe (Nichte)\n", - "id: 117\n", - "value: Nicht verwandt\n", - "id: 118\n", - "value: Onkel (Tante)\n", - "id: 119\n", - "value: Opfer\n", - "id: 120\n", - "value: Pächter(in)\n", - "id: 121\n", - "value: Pflegeeltern\n", - "id: 123\n", - "value: Pfleger(in)\n", - "id: 124\n", - "value: Pfleger(in) für das Sammelvermögen\n", - "id: 125\n", - "value: Pfleger(in) für die Leibesfrucht\n", - "id: 126\n", - "value: Pflegevater (Pflegemutter) des Mündels\n", - "id: 127\n", - "value: Pflegling\n", - "id: 128\n", - "value: Pflichtverteidiger(in)\n", - "id: 129\n", - "value: Polizei\n", - "id: 130\n", - "value: Privatbeklagte(r)\n", - "id: 131\n", - "value: Privatkläger(in)\n", - "id: 132\n", - "value: Prozessbevollmächtigte(r)\n", - "id: 133\n", - "value: Prozesskostenhilfe-Anwalt (-Anwältin)\n", - "id: 134\n", - "value: Prozesskostenhilfe-Korrespondenzanwalt (-anwältin)\n", - "id: 135\n", - "value: Rechtsanwalt (-anwältin)\n", - "id: 136\n", - "value: Rechtsbeistand\n", - "id: 137\n", - "value: Rechtsbeschwerdeführer(in)\n", - "id: 138\n", - "value: Rechtsbeschwerdegegner(in)\n", - "id: 139\n", - "value: Revisionsbeklagte(r)\n", - "id: 140\n", - "value: Revisionskläger(in)\n", - "id: 141\n", - "value: Sachbearbeiter(in)\n", - "id: 142\n", - "value: Sachverständige(r)\n", - "id: 143\n", - "value: Schuldner(in)\n", - "id: 144\n", - "value: Schwager (Schwägerin)\n", - "id: 146\n", - "value: Schwiegersohn (Schwiegertochter)\n", - "id: 148\n", - "value: Schwiegervater (Schwiegermutter)\n", - "id: 149\n", - "value: Sohn (Tochter)\n", - "id: 150\n", - "value: Soldat(in)\n", - "id: 151\n", - "value: Sonstige(r) Beteiligte(r)\n", - "id: 152\n", - "value: Sonstige(r) Vertreter(in)\n", - "id: 153\n", - "value: Staatsanwaltschaft\n", - "id: 154\n", - "value: Stiefeltern\n", - "id: 156\n", - "value: Stiefvater (Stiefmutter)\n", - "id: 157\n", - "value: Streithelfer(in) Beklagte(r)\n", - "id: 158\n", - "value: Streithelfer(in) Kläger(in)\n", - "id: 159\n", - "value: Streitverkündete(r) Beklagte(r)\n", - "id: 160\n", - "value: Streitverkündete(r) Kläger(in)\n", - "id: 161\n", - "value: Terminsbevollmächtigte(r)\n", - "id: 162\n", - "value: Testamentsvollstrecker(in)\n", - "id: 163\n", - "value: Testator(in)\n", - "id: 164\n", - "value: Übernehmer(in)\n", - "id: 165\n", - "value: Unterbevollmächtigte(r)\n", - "id: 166\n", - "value: Ur-Enkel(in)\n", - "id: 167\n", - "value: Vater (Mutter)\n", - "id: 168\n", - "value: Veräußerer (Veräußerin)\n", - "id: 169\n", - "value: Verfahrensbevollmächtigte(r)\n", - "id: 170\n", - "value: Verfahrenskostenhilfe-Anwalt(-Anwältin)\n", - "id: 171\n", - "value: Verfahrenskostenhilfe-Korrespondenzanwalt (-anwältin)\n", - "id: 172\n", - "value: Verfahrenspfleger(in)\n", - "id: 173\n", - "value: Verfahrensvertreter(in) (§787 ZPO)\n", - "id: 174\n", - "value: Verfügungsbeklagte(r)\n", - "id: 175\n", - "value: Verfügungskläger(in)\n", - "id: 176\n", - "value: Verkehrsanwalt (-anwältin)\n", - "id: 177\n", - "value: Verlobte(r)\n", - "id: 178\n", - "value: Vermächtnisnehmer(in)\n", - "id: 179\n", - "value: Vermieter(in)\n", - "id: 180\n", - "value: Verpächter(in)\n", - "id: 181\n", - "value: Versorgungsträger(in)\n", - "id: 182\n", - "value: Verteidiger(in)\n", - "id: 183\n", - "value: Vertreter(in) der Interessen des Ausgleichsfonds\n", - "id: 184\n", - "value: Vertreter(in) der Staatskasse\n", - "id: 185\n", - "value: Vertreter(in) des Bundesinteresses beim Bundesverwaltungsgericht\n", - "id: 186\n", - "value: Vertreter(in) des öffentlichen Interesses\n", - "id: 187\n", - "value: Verurteilte(r)\n", - "id: 188\n", - "value: Verwalter(in) der Wohnungseigentümergemeinschaft\n", - "id: 189\n", - "value: Verwaltungsbehörde\n", - "id: 190\n", - "value: Vollstreckungsgläubiger(in)\n", - "id: 191\n", - "value: Vollstreckungsschuldner(in)\n", - "id: 192\n", - "value: Vorläufige(r) Betreuer(in)\n", - "id: 193\n", - "value: Vormund\n", - "id: 194\n", - "value: Vorstand\n", - "id: 195\n", - "value: Vorsorgebevollmächtigte(r)\n", - "id: 196\n", - "value: Wahlverteidiger(in)\n", - "id: 197\n", - "value: Widerbeklagte(r)\n", - "id: 198\n", - "value: Widerkläger(in)\n", - "id: 199\n", - "value: Wiederaufnahmebeklagte(r)\n", - "id: 200\n", - "value: Wiederaufnahmekläger(in)\n", - "id: 201\n", - "value: Zahlungs- und Auflagenempfänger(in)\n", - "id: 202\n", - "value: Zeuge (Zeugin)\n", - "id: 203\n", - "value: Zeugenbeistand\n", - "id: 204\n", - "value: Zulassungsantragsgegner(in)\n", - "id: 205\n", - "value: Zulassungsantragsteller(in)\n", - "id: 206\n", - "value: Zustellungsbevollmächtigte(r)\n", - "id: 207\n", - "value: Zustellungsvertreter(in) (§6 ZVG)\n", - "id: 208\n", - "value: Notar(in)\n", - "id: 209\n", - "value: Auskunftsempfänger(in)\n", - "id: 210\n", - "value: Melder(in)\n", - "id: 211\n", - "value: Verwahrstelle\n", - "id: 212\n", - "value: Aussteller(in)\n", - "id: 213\n", - "value: Berechtigte(r)\n", - "id: 214\n", - "value: Berechtigte(r) an einem Recht\n", - "id: 215\n", - "value: Einreicher(in)\n", - "id: 216\n", - "value: Erbbauberechtigte(r)\n", - "id: 217\n", - "value: Finanzamt\n", - "id: 218\n", - "value: Grundbuchvertreter(in)\n", - "id: 219\n", - "value: Insolvenzgericht\n", - "id: 220\n", - "value: Mitteilungsempfänger(in)\n", - "id: 221\n", - "value: Nacherbe (Nacherbin)\n", - "id: 222\n", - "value: Rechnungsempfänger(in)\n", - "id: 223\n", - "value: Veranlasser(in)\n", - "id: 224\n", - "value: Versteigerungsabteilung\n", - "id: 225\n", - "value: Vertretungsberechtigte(r)\n", - "id: 226\n", - "value: Zweitschuldner(in)\n", - "id: 227\n", - "value: Vertreter(in)\n", - "id: 228\n", - "value: Arbeitgeber(in)\n", - "id: 229\n", - "value: RV-Träger(in)\n", - "id: 230\n", - "value: Vollstreckungsstelle\n", - "id: 231\n", - "value: Abkömmling\n", - "id: 232\n", - "value: Kreditnehmer(in)\n", - "id: 233\n", - "value: Neu vorzutragende(r) Eigentümer(in)\n", - "id: 234\n", - "value: Notariatsverwalter(in)\n", - "id: 235\n", - "value: Notarvertreter(in)\n", - "id: 236\n", - "value: Partei kraft Amtes\n", - "id: 237\n", - "value: Sequester\n", - "id: 238\n", - "value: Treuhänder(in)\n", - "id: 239\n", - "value: Zustimmende(r)\n", - "id: 240\n", - "value: Gläubigervertreter(in)\n", - "id: 241\n", - "value: Schuldnervertreter(in)\n", - "id: 242\n", - "value: Zahlungsempfänger(in)\n", - "id: 243\n", - "value: Anteilsinhaber(in)\n", - "id: 244\n", - "value: Antragsteller(in) -Eröffnung\n", - "id: 245\n", - "value: Debitor(in)\n", - "id: 246\n", - "value: Gesellschafter(in)\n", - "id: 247\n", - "value: Handlungsbevollmächtigte(r)\n", - "id: 248\n", - "value: Mitglied einer Gesamt-Anteilsinhaberschaft\n", - "id: 249\n", - "value: Mitglied einer Gläubigergemeinschaft\n", - "id: 250\n", - "value: Mitreeder(in)\n", - "id: 251\n", - "value: Partner(in)\n", - "id: 252\n", - "value: Persönlich haftende(r) Gesellschafter(in)\n", - "id: 253\n", - "value: Prozesspfleger(in)\n", - "id: 254\n", - "value: Sachwalter(in)\n", - "id: 255\n", - "value: Treuhänder(in) (Wohlverhaltensperiode)\n", - "id: 256\n", - "value: Vermögensträger(in)\n", - "id: 257\n", - "value: Vorläufige(r) Insolvenzverwalter(in)\n", - "id: 258\n", - "value: Vorläufige(r) Treuhänder(in)\n", - "id: 259\n", - "value: Sondersachwalter(in)\n", - "id: 260\n", - "value: Sonderinsolvenzverwalter(in)\n", - "id: 261\n", - "value: Vorläufige(r) Sachwalter(in)\n", - "id: 262\n", - "value: Abwickler(in)\n", - "id: 263\n", - "value: Übernehmender Rechtsträger\n", - "id: 264\n", - "value: Aufsichtsrat (-rätin)\n", - "id: 265\n", - "value: Besondere(r) Vertreter(in) nach § 30 BGB\n", - "id: 266\n", - "value: Betriebsleiter(in)\n", - "id: 267\n", - "value: Empfangsberechtigte(r)\n", - "id: 268\n", - "value: Geschäftsführende(r) Direktor(in)\n", - "id: 269\n", - "value: Geschäftsleiter(in)\n", - "id: 271\n", - "value: Gründer(in)\n", - "id: 272\n", - "value: Gründungsprüfer(in)\n", - "id: 273\n", - "value: Hauptniederlassung\n", - "id: 274\n", - "value: Inhaber(in)\n", - "id: 275\n", - "value: Kommanditist(in)\n", - "id: 276\n", - "value: Konkursverwalter(in)\n", - "id: 277\n", - "value: Mitglied des Leitungsorgans\n", - "id: 278\n", - "value: Mitglied EWIV\n", - "id: 279\n", - "value: Nachgründungsprüfer(in)\n", - "id: 280\n", - "value: Nachtragsabwickler(in)\n", - "id: 281\n", - "value: Nachtragsliquidator(in)\n", - "id: 282\n", - "value: Notgeschäftsführer(in)\n", - "id: 283\n", - "value: Notliquidator(in)\n", - "id: 284\n", - "value: Notvorstand\n", - "id: 285\n", - "value: Prokurist(in)\n", - "id: 287\n", - "value: Rechtsträger(in)\n", - "id: 288\n", - "value: Registergericht\n", - "id: 289\n", - "value: Sacheinlagenprüfer(in)\n", - "id: 290\n", - "value: Ständige(r) Vertreter(in) für die Zweigniederlassung\n", - "id: 291\n", - "value: Übertragender Rechtsträger\n", - "id: 292\n", - "value: Vergleichsverwalter(in)\n", - "id: 293\n", - "value: Verwaltungsrat (-rätin)\n", - "id: 294\n", - "value: Vorstandsvorsitzende(r)\n", - "id: 295\n", - "value: Zweigniederlassung\n", - "id: 296\n", - "value: Vertreter(in) des Klägers/der Klägerin\n", - "id: 297\n", - "value: Vertreter(in) des/der Beklagten\n", - "id: 298\n", - "value: Bewährungshilfe\n", - "id: 299\n", - "value: Gerichtshilfe\n", - "id: 300\n", - "value: Justizvollzug\n", - "id: 301\n", - "value: Pseudoname\n", - "id: 302\n", - "value: Gesetzliche(r) Vertreter(in) des Gläubigers/der Gläubigerin\n", - "id: 303\n", - "value: Gesetzliche(r) Vertreter(in) des Schuldners/der Schuldnerin\n", - "id: 304\n", - "value: Bevollmächtigte(r) des Gläubigers/der Gläubigerin\n", - "id: 305\n", - "value: Bevollmächtigte(r) des Schuldners/der Schuldnerin\n", - "id: 306\n", - "value: Fahrzeughalter(in)\n", - "id: 307\n", - "value: Frühere(r) Lebenspartner(in)\n", - "id: 308\n", - "value: Sicherungsverwalter(in)\n", - "id: 309\n", - "value: Zwangsverwalter(in)\n", - "id: 310\n", - "value: Mieter(in)\n", - "id: 311\n", - "value: Bürge (Bürgin)\n", - "id: 312\n", - "value: Meistbietende(r)\n", - "id: 313\n", - "value: Abschlussprüfer(in)\n", - "id: 314\n", - "value: Antragstellervertreter(in)\n", - "id: 315\n", - "value: Aufsichtsratsvorsitzende(r)\n", - "id: 316\n", - "value: Berufskammer\n", - "id: 317\n", - "value: Betroffenenvertreter(in)\n", - "id: 318\n", - "value: Bürgermeister(in)\n", - "id: 319\n", - "value: Eingliederungsbeteiligte(r)\n", - "id: 320\n", - "value: Formwechselnder Rechtsträger\n", - "id: 321\n", - "value: Gewerbeamt\n", - "id: 322\n", - "value: Inhaber(in) (nicht eingetragen)\n", - "id: 323\n", - "value: Kostenempfänger(in)\n", - "id: 324\n", - "value: Nachlassgericht\n", - "id: 325\n", - "value: Sonderprüfer(in)\n", - "id: 326\n", - "value: Sonstige(r) gerichtlich bestellte(r) Vertreter(in)\n", - "id: 327\n", - "value: Sonstige(r) gesetzliche(r) Vertreter(in) BGB\n", - "id: 328\n", - "value: Sonstige(r) organschaftliche(r) Vertreter(in) HRB\n", - "id: 329\n", - "value: Standardkostenschuldner(in)\n", - "id: 330\n", - "value: Übernahmeschuldner(in)\n", - "id: 331\n", - "value: Unternehmensvertragsbeteiligte(r)\n", - "id: 332\n", - "value: Vertreter(in) des persönlich haftenden Gesellschafters\n", - "id: 333\n", - "value: Werkleiter(in)\n", - "id: 334\n", - "value: Mehrfachsitz\n", - "id: 335\n", - "value: Mitglied VR\n", - "id: 336\n", - "value: Mitglied e. BGB-Gesellschaft als Abwicklerin\n", - "id: 337\n", - "value: Mitglied e. BGB-Gesellschaft als ges. Vertreterin\n", - "id: 338\n", - "value: Mitglied e. BGB-Gesellschaft als Kommanditistin\n", - "id: 339\n", - "value: Mitglied e. BGB-Gesellschaft als Liquidatorin\n", - "id: 340\n", - "value: Mitglied e. BGB-Gesellschaft als Mitglied e. EWIV\n", - "id: 341\n", - "value: Mitglied e. BGB-Gesellschaft als phG\n", - "id: 342\n", - "value: Mitglied e. Erbengemeinschaft als ges. Vertreterin\n", - "id: 343\n", - "value: Mitglied e. Erbengemeinschaft als Inhaberin\n", - "id: 344\n", - "value: Mitglied e. Erbengemeinschaft als Kommanditistin\n", - "id: 345\n", - "value: Mitglied e. Erbengemeinschaft als Mitglied e. EWIV\n", - "id: 346\n", - "value: Nebensitz\n", - "id: 347\n", - "value: Gesamthandsgemeinschaft\n", - "id: 348\n", - "value: Mitglied einer Gesamthandsgemeinschaft\n", - "id: 349\n", - "value: Leitungsperson i.S.v. § 30 Abs. 1 Nr. 1 bis 5 OWiG\n", - "id: 350\n", - "value: Einziehungsbeteiligter\n", - "id: 351\n", - "value: Antragsgegnervertreter(in)\n", - "id: 352\n", - "value: Verbraucher(in)\n", - "id: 353\n", - "value: Vielmelder(in)\n", - "id: 354\n", - "value: Vollmachtgeber(in)\n", - "id: 355\n", - "value: Nebenbeteiligte(r) § 444 StPO\n", - "id: 356\n", - "value: Verfallsbeteiligte(r) § 442 StPO a.F.\n", - "id: 357\n", - "value: Verfolgte(r) § 34 IRG\n", - "id: 358\n", - "value: Rechtsnachfolger(in)\n", - "id: 359\n", - "value: Statuswechselnde(r) Rechtsträger(in)\n", - "id: 360\n", - "value: Haftangehörige\n" - ] - }, - { - "data": { - "text/plain": [ - "{'001': 'Abwesenheitspfleger(in)',\n", - " '002': 'Aliasidentität',\n", - " '003': 'Angehörige(r)',\n", - " '004': 'Angeklagte(r)',\n", - " '005': 'Angeschuldigte(r)',\n", - " '006': 'Annehmende(r)',\n", - " '007': 'Anschlussberufungsbeklagte(r)',\n", - " '008': 'Anschlussberufungskläger(in)',\n", - " '009': 'Anschlussbeschwerdeführer(in)',\n", - " '010': 'Anschlussbeschwerdegegner(in)',\n", - " '011': 'Anschlussrechtsbeschwerdeführer(in)',\n", - " '012': 'Anschlussrechtsbeschwerdegegner(in)',\n", - " '013': 'Anschlussrevisionsbeklagte(r)',\n", - " '014': 'Anschlussrevisionskläger(in)',\n", - " '015': 'Antragsgegner(in)',\n", - " '016': 'Antragsteller(in)',\n", - " '017': 'Anzeigeerstatter(in)',\n", - " '018': 'Anzunehmende(r)',\n", - " '019': 'Arrestgläubiger(in)',\n", - " '020': 'Arrestschuldner(in)',\n", - " '021': 'Aufsichtsbehörde',\n", - " '022': 'Ausschlagende(r)',\n", - " '023': 'Beamter (Beamtin)',\n", - " '024': 'Behörde',\n", - " '025': 'Beigeladene(r)',\n", - " '026': 'Beistand',\n", - " '027': 'Bekannte(r)',\n", - " '028': 'Beklagte(r)',\n", - " '029': 'Berufungsbeklagte(r)',\n", - " '030': 'Berufungskläger(in)',\n", - " '031': 'Beschuldigte(r)',\n", - " '032': 'Beschwerdeführer(in)',\n", - " '033': 'Beschwerdegegner(in)',\n", - " '034': 'Besucher(in)',\n", - " '036': 'Betreibende(r) Gläubige(r)',\n", - " '037': 'Betreuer(in)',\n", - " '038': 'Betreute(r)',\n", - " '039': 'Betreuungsbehörde',\n", - " '040': 'Betroffene(r)',\n", - " '041': 'Bevollmächtigte(r)',\n", - " '042': 'Bewährungshelfer(in)',\n", - " '043': 'Beweisanwalt (-anwältin)',\n", - " '044': 'Bruder (Schwester)',\n", - " '045': 'Bundeswehrdisziplinaranwalt (-anwältin)',\n", - " '046': 'Bußgeldempfänger(in)',\n", - " '047': 'Cousin(e)',\n", - " '048': 'Dienstvorgesetzte(r)',\n", - " '049': 'director',\n", - " '050': 'Dolmetscher(in)',\n", - " '051': 'Dritte(r)',\n", - " '052': 'Drittschuldner(in)',\n", - " '053': 'Drittwiderbeklagte(r)',\n", - " '054': 'Drittwiderkläger(in)',\n", - " '056': 'Ehemann (Ehefrau)',\n", - " '057': 'Eigentümer(in)',\n", - " '058': 'Eingetragene(r) Lebenspartner(in)',\n", - " '059': 'Einleitungsbehörde',\n", - " '060': 'Eltern',\n", - " '061': 'Elternteil',\n", - " '062': 'Enkel(in)',\n", - " '063': 'Erbe (Erbin)',\n", - " '064': 'Erbe (Erbin) (ausschlagend)',\n", - " '065': 'Erbe (Erbin) (vorverstorben)',\n", - " '066': 'Erblasser(in)',\n", - " '067': 'Ergänzungspfleger(in)',\n", - " '068': 'Erinnerungsführer(in)',\n", - " '069': 'Erinnerungsgegner(in)',\n", - " '070': 'Ersatzbetreuer(in)',\n", - " '071': 'Ersteher(in)',\n", - " '072': 'Erwerber(in)',\n", - " '073': 'Erziehungsberechtigte(r)',\n", - " '074': 'Frühere(r) Ehegatte (Ehegattin)',\n", - " '075': 'Frühere(r) Beklagte(r)',\n", - " '076': 'Frühere(r) Beteiligte(r)',\n", - " '077': 'Frühere(r) Gläubiger(in)',\n", - " '078': 'Frühere(r) Kläger(in)',\n", - " '079': 'Frühere(r) Soldat(in)',\n", - " '080': 'Gegenvormund',\n", - " '081': 'Generalbundesanwalt (-anwältin)',\n", - " '082': 'Gericht',\n", - " '083': 'Gerichtsvollzieher(in)',\n", - " '084': 'Geschädigte(r)',\n", - " '085': 'Geschäftsführende(r) Gesellschafter(in)',\n", - " '086': 'Geschäftsführer(in)',\n", - " '087': 'Gesetzliche(r) Erbe (Erbin)',\n", - " '088': 'Gesetzliche(r) Vertreter(in)',\n", - " '089': 'Gläubiger(in)',\n", - " '090': 'Großeltern',\n", - " '092': 'Großvater (Großmutter)',\n", - " '093': 'Hauptbevollmächtigte(r)',\n", - " '094': 'Hoferbe (Hoferbin)',\n", - " '095': 'Inhaber(in) der Firma',\n", - " '096': 'Insolvenzverwalter(in)',\n", - " '097': 'Jugendamt',\n", - " '098': 'Kammer',\n", - " '099': 'Kammermitglied',\n", - " '100': 'Kind',\n", - " '101': 'Kläger(in)',\n", - " '102': 'Kontrollbetreuer(in)',\n", - " '103': 'Korrespondenzanwalt (-anwältin)',\n", - " '104': 'Kostenschuldner(in)',\n", - " '105': 'Landwirtschaftsrichter(in)',\n", - " '106': 'Lebenspartner(in)',\n", - " '107': 'Liquidator(in)',\n", - " '108': 'Minderjährige(r)',\n", - " '109': 'Mitvormund',\n", - " '110': 'Mündel',\n", - " '112': 'Nachbesserungsgläubiger(in)',\n", - " '113': 'Nachlasspfleger(in)',\n", - " '114': 'Nachlassverwalter(in)',\n", - " '115': 'Nebenkläger(in)',\n", - " '116': 'Neffe (Nichte)',\n", - " '117': 'Nicht verwandt',\n", - " '118': 'Onkel (Tante)',\n", - " '119': 'Opfer',\n", - " '120': 'Pächter(in)',\n", - " '121': 'Pflegeeltern',\n", - " '123': 'Pfleger(in)',\n", - " '124': 'Pfleger(in) für das Sammelvermögen',\n", - " '125': 'Pfleger(in) für die Leibesfrucht',\n", - " '126': 'Pflegevater (Pflegemutter) des Mündels',\n", - " '127': 'Pflegling',\n", - " '128': 'Pflichtverteidiger(in)',\n", - " '129': 'Polizei',\n", - " '130': 'Privatbeklagte(r)',\n", - " '131': 'Privatkläger(in)',\n", - " '132': 'Prozessbevollmächtigte(r)',\n", - " '133': 'Prozesskostenhilfe-Anwalt (-Anwältin)',\n", - " '134': 'Prozesskostenhilfe-Korrespondenzanwalt (-anwältin)',\n", - " '135': 'Rechtsanwalt (-anwältin)',\n", - " '136': 'Rechtsbeistand',\n", - " '137': 'Rechtsbeschwerdeführer(in)',\n", - " '138': 'Rechtsbeschwerdegegner(in)',\n", - " '139': 'Revisionsbeklagte(r)',\n", - " '140': 'Revisionskläger(in)',\n", - " '141': 'Sachbearbeiter(in)',\n", - " '142': 'Sachverständige(r)',\n", - " '143': 'Schuldner(in)',\n", - " '144': 'Schwager (Schwägerin)',\n", - " '146': 'Schwiegersohn (Schwiegertochter)',\n", - " '148': 'Schwiegervater (Schwiegermutter)',\n", - " '149': 'Sohn (Tochter)',\n", - " '150': 'Soldat(in)',\n", - " '151': 'Sonstige(r) Beteiligte(r)',\n", - " '152': 'Sonstige(r) Vertreter(in)',\n", - " '153': 'Staatsanwaltschaft',\n", - " '154': 'Stiefeltern',\n", - " '156': 'Stiefvater (Stiefmutter)',\n", - " '157': 'Streithelfer(in) Beklagte(r)',\n", - " '158': 'Streithelfer(in) Kläger(in)',\n", - " '159': 'Streitverkündete(r) Beklagte(r)',\n", - " '160': 'Streitverkündete(r) Kläger(in)',\n", - " '161': 'Terminsbevollmächtigte(r)',\n", - " '162': 'Testamentsvollstrecker(in)',\n", - " '163': 'Testator(in)',\n", - " '164': 'Übernehmer(in)',\n", - " '165': 'Unterbevollmächtigte(r)',\n", - " '166': 'Ur-Enkel(in)',\n", - " '167': 'Vater (Mutter)',\n", - " '168': 'Veräußerer (Veräußerin)',\n", - " '169': 'Verfahrensbevollmächtigte(r)',\n", - " '170': 'Verfahrenskostenhilfe-Anwalt(-Anwältin)',\n", - " '171': 'Verfahrenskostenhilfe-Korrespondenzanwalt (-anwältin)',\n", - " '172': 'Verfahrenspfleger(in)',\n", - " '173': 'Verfahrensvertreter(in) (§787 ZPO)',\n", - " '174': 'Verfügungsbeklagte(r)',\n", - " '175': 'Verfügungskläger(in)',\n", - " '176': 'Verkehrsanwalt (-anwältin)',\n", - " '177': 'Verlobte(r)',\n", - " '178': 'Vermächtnisnehmer(in)',\n", - " '179': 'Vermieter(in)',\n", - " '180': 'Verpächter(in)',\n", - " '181': 'Versorgungsträger(in)',\n", - " '182': 'Verteidiger(in)',\n", - " '183': 'Vertreter(in) der Interessen des Ausgleichsfonds',\n", - " '184': 'Vertreter(in) der Staatskasse',\n", - " '185': 'Vertreter(in) des Bundesinteresses beim Bundesverwaltungsgericht',\n", - " '186': 'Vertreter(in) des öffentlichen Interesses',\n", - " '187': 'Verurteilte(r)',\n", - " '188': 'Verwalter(in) der Wohnungseigentümergemeinschaft',\n", - " '189': 'Verwaltungsbehörde',\n", - " '190': 'Vollstreckungsgläubiger(in)',\n", - " '191': 'Vollstreckungsschuldner(in)',\n", - " '192': 'Vorläufige(r) Betreuer(in)',\n", - " '193': 'Vormund',\n", - " '194': 'Vorstand',\n", - " '195': 'Vorsorgebevollmächtigte(r)',\n", - " '196': 'Wahlverteidiger(in)',\n", - " '197': 'Widerbeklagte(r)',\n", - " '198': 'Widerkläger(in)',\n", - " '199': 'Wiederaufnahmebeklagte(r)',\n", - " '200': 'Wiederaufnahmekläger(in)',\n", - " '201': 'Zahlungs- und Auflagenempfänger(in)',\n", - " '202': 'Zeuge (Zeugin)',\n", - " '203': 'Zeugenbeistand',\n", - " '204': 'Zulassungsantragsgegner(in)',\n", - " '205': 'Zulassungsantragsteller(in)',\n", - " '206': 'Zustellungsbevollmächtigte(r)',\n", - " '207': 'Zustellungsvertreter(in) (§6 ZVG)',\n", - " '208': 'Notar(in)',\n", - " '209': 'Auskunftsempfänger(in)',\n", - " '210': 'Melder(in)',\n", - " '211': 'Verwahrstelle',\n", - " '212': 'Aussteller(in)',\n", - " '213': 'Berechtigte(r)',\n", - " '214': 'Berechtigte(r) an einem Recht',\n", - " '215': 'Einreicher(in)',\n", - " '216': 'Erbbauberechtigte(r)',\n", - " '217': 'Finanzamt',\n", - " '218': 'Grundbuchvertreter(in)',\n", - " '219': 'Insolvenzgericht',\n", - " '220': 'Mitteilungsempfänger(in)',\n", - " '221': 'Nacherbe (Nacherbin)',\n", - " '222': 'Rechnungsempfänger(in)',\n", - " '223': 'Veranlasser(in)',\n", - " '224': 'Versteigerungsabteilung',\n", - " '225': 'Vertretungsberechtigte(r)',\n", - " '226': 'Zweitschuldner(in)',\n", - " '227': 'Vertreter(in)',\n", - " '228': 'Arbeitgeber(in)',\n", - " '229': 'RV-Träger(in)',\n", - " '230': 'Vollstreckungsstelle',\n", - " '231': 'Abkömmling',\n", - " '232': 'Kreditnehmer(in)',\n", - " '233': 'Neu vorzutragende(r) Eigentümer(in)',\n", - " '234': 'Notariatsverwalter(in)',\n", - " '235': 'Notarvertreter(in)',\n", - " '236': 'Partei kraft Amtes',\n", - " '237': 'Sequester',\n", - " '238': 'Treuhänder(in)',\n", - " '239': 'Zustimmende(r)',\n", - " '240': 'Gläubigervertreter(in)',\n", - " '241': 'Schuldnervertreter(in)',\n", - " '242': 'Zahlungsempfänger(in)',\n", - " '243': 'Anteilsinhaber(in)',\n", - " '244': 'Antragsteller(in) -Eröffnung',\n", - " '245': 'Debitor(in)',\n", - " '246': 'Gesellschafter(in)',\n", - " '247': 'Handlungsbevollmächtigte(r)',\n", - " '248': 'Mitglied einer Gesamt-Anteilsinhaberschaft',\n", - " '249': 'Mitglied einer Gläubigergemeinschaft',\n", - " '250': 'Mitreeder(in)',\n", - " '251': 'Partner(in)',\n", - " '252': 'Persönlich haftende(r) Gesellschafter(in)',\n", - " '253': 'Prozesspfleger(in)',\n", - " '254': 'Sachwalter(in)',\n", - " '255': 'Treuhänder(in) (Wohlverhaltensperiode)',\n", - " '256': 'Vermögensträger(in)',\n", - " '257': 'Vorläufige(r) Insolvenzverwalter(in)',\n", - " '258': 'Vorläufige(r) Treuhänder(in)',\n", - " '259': 'Sondersachwalter(in)',\n", - " '260': 'Sonderinsolvenzverwalter(in)',\n", - " '261': 'Vorläufige(r) Sachwalter(in)',\n", - " '262': 'Abwickler(in)',\n", - " '263': 'Übernehmender Rechtsträger',\n", - " '264': 'Aufsichtsrat (-rätin)',\n", - " '265': 'Besondere(r) Vertreter(in) nach § 30 BGB',\n", - " '266': 'Betriebsleiter(in)',\n", - " '267': 'Empfangsberechtigte(r)',\n", - " '268': 'Geschäftsführende(r) Direktor(in)',\n", - " '269': 'Geschäftsleiter(in)',\n", - " '271': 'Gründer(in)',\n", - " '272': 'Gründungsprüfer(in)',\n", - " '273': 'Hauptniederlassung',\n", - " '274': 'Inhaber(in)',\n", - " '275': 'Kommanditist(in)',\n", - " '276': 'Konkursverwalter(in)',\n", - " '277': 'Mitglied des Leitungsorgans',\n", - " '278': 'Mitglied EWIV',\n", - " '279': 'Nachgründungsprüfer(in)',\n", - " '280': 'Nachtragsabwickler(in)',\n", - " '281': 'Nachtragsliquidator(in)',\n", - " '282': 'Notgeschäftsführer(in)',\n", - " '283': 'Notliquidator(in)',\n", - " '284': 'Notvorstand',\n", - " '285': 'Prokurist(in)',\n", - " '287': 'Rechtsträger(in)',\n", - " '288': 'Registergericht',\n", - " '289': 'Sacheinlagenprüfer(in)',\n", - " '290': 'Ständige(r) Vertreter(in) für die Zweigniederlassung',\n", - " '291': 'Übertragender Rechtsträger',\n", - " '292': 'Vergleichsverwalter(in)',\n", - " '293': 'Verwaltungsrat (-rätin)',\n", - " '294': 'Vorstandsvorsitzende(r)',\n", - " '295': 'Zweigniederlassung',\n", - " '296': 'Vertreter(in) des Klägers/der Klägerin',\n", - " '297': 'Vertreter(in) des/der Beklagten',\n", - " '298': 'Bewährungshilfe',\n", - " '299': 'Gerichtshilfe',\n", - " '300': 'Justizvollzug',\n", - " '301': 'Pseudoname',\n", - " '302': 'Gesetzliche(r) Vertreter(in) des Gläubigers/der Gläubigerin',\n", - " '303': 'Gesetzliche(r) Vertreter(in) des Schuldners/der Schuldnerin',\n", - " '304': 'Bevollmächtigte(r) des Gläubigers/der Gläubigerin',\n", - " '305': 'Bevollmächtigte(r) des Schuldners/der Schuldnerin',\n", - " '306': 'Fahrzeughalter(in)',\n", - " '307': 'Frühere(r) Lebenspartner(in)',\n", - " '308': 'Sicherungsverwalter(in)',\n", - " '309': 'Zwangsverwalter(in)',\n", - " '310': 'Mieter(in)',\n", - " '311': 'Bürge (Bürgin)',\n", - " '312': 'Meistbietende(r)',\n", - " '313': 'Abschlussprüfer(in)',\n", - " '314': 'Antragstellervertreter(in)',\n", - " '315': 'Aufsichtsratsvorsitzende(r)',\n", - " '316': 'Berufskammer',\n", - " '317': 'Betroffenenvertreter(in)',\n", - " '318': 'Bürgermeister(in)',\n", - " '319': 'Eingliederungsbeteiligte(r)',\n", - " '320': 'Formwechselnder Rechtsträger',\n", - " '321': 'Gewerbeamt',\n", - " '322': 'Inhaber(in) (nicht eingetragen)',\n", - " '323': 'Kostenempfänger(in)',\n", - " '324': 'Nachlassgericht',\n", - " '325': 'Sonderprüfer(in)',\n", - " '326': 'Sonstige(r) gerichtlich bestellte(r) Vertreter(in)',\n", - " '327': 'Sonstige(r) gesetzliche(r) Vertreter(in) BGB',\n", - " '328': 'Sonstige(r) organschaftliche(r) Vertreter(in) HRB',\n", - " '329': 'Standardkostenschuldner(in)',\n", - " '330': 'Übernahmeschuldner(in)',\n", - " '331': 'Unternehmensvertragsbeteiligte(r)',\n", - " '332': 'Vertreter(in) des persönlich haftenden Gesellschafters',\n", - " '333': 'Werkleiter(in)',\n", - " '334': 'Mehrfachsitz',\n", - " '335': 'Mitglied VR',\n", - " '336': 'Mitglied e. BGB-Gesellschaft als Abwicklerin',\n", - " '337': 'Mitglied e. BGB-Gesellschaft als ges. Vertreterin',\n", - " '338': 'Mitglied e. BGB-Gesellschaft als Kommanditistin',\n", - " '339': 'Mitglied e. BGB-Gesellschaft als Liquidatorin',\n", - " '340': 'Mitglied e. BGB-Gesellschaft als Mitglied e. EWIV',\n", - " '341': 'Mitglied e. BGB-Gesellschaft als phG',\n", - " '342': 'Mitglied e. Erbengemeinschaft als ges. Vertreterin',\n", - " '343': 'Mitglied e. Erbengemeinschaft als Inhaberin',\n", - " '344': 'Mitglied e. Erbengemeinschaft als Kommanditistin',\n", - " '345': 'Mitglied e. Erbengemeinschaft als Mitglied e. EWIV',\n", - " '346': 'Nebensitz',\n", - " '347': 'Gesamthandsgemeinschaft',\n", - " '348': 'Mitglied einer Gesamthandsgemeinschaft',\n", - " '349': 'Leitungsperson i.S.v. § 30 Abs. 1 Nr. 1 bis 5 OWiG',\n", - " '350': 'Einziehungsbeteiligter',\n", - " '351': 'Antragsgegnervertreter(in)',\n", - " '352': 'Verbraucher(in)',\n", - " '353': 'Vielmelder(in)',\n", - " '354': 'Vollmachtgeber(in)',\n", - " '355': 'Nebenbeteiligte(r) § 444 StPO',\n", - " '356': 'Verfallsbeteiligte(r) § 442 StPO a.F.',\n", - " '357': 'Verfolgte(r) § 34 IRG',\n", - " '358': 'Rechtsnachfolger(in)',\n", - " '359': 'Statuswechselnde(r) Rechtsträger(in)',\n", - " '360': 'Haftangehörige'}" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import xmltodict\n", - "\n", - "with open('../src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/xjustiz_0040_cl_rollenbezeichnung_3_3.xsd', encoding=\"utf-8\") as file:\n", - " content = file.read()\n", - " data = xmltodict.parse(content)\n", - "mapping = {}\n", - "for entry in data[\"xs:schema\"][\"xs:simpleType\"][\"xs:restriction\"][\"xs:enumeration\"]:\n", - " mapping[entry['@value']] = entry['xs:annotation']['xs:appinfo']['wert']\n", - " print(f\"id: {entry['@value']}\")\n", - " print(f\"value: {entry['xs:annotation']['xs:appinfo']['wert']}\")\n", - "mapping" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'Geschäftsführer(in)'" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - }, - { - "ename": "", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click here for more info. View Jupyter log for further details." - ] - } - ], - "source": [ - "mapping[\"086\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Company(id=CompanyID(district_court=DistrictCourt(name='Amtsgericht Hamm', city='Hamm'), hr_number='HRB 5363'), location=Location(city='Bönen', street='Siemensstraße', house_number='25-27', zip_code='59199'), name='GEA Farm Technologies GmbH', last_update='2023-10-27', relationships=[PersonToCompanyRelationship(role=, location=Location(city='Oelde', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Reinhard', lastname='Gebing'), date_of_birth='1964-04-26'), PersonToCompanyRelationship(role=, location=Location(city='Wetter', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Markus', lastname='Kreft'), date_of_birth='1966-04-03'), PersonToCompanyRelationship(role=, location=Location(city='Holzminden', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Kai', lastname='Luntz'), date_of_birth='1970-12-04'), PersonToCompanyRelationship(role=, location=Location(city='Rheda-Wiedenbrück', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Thomas', lastname='Mader'), date_of_birth='1972-05-24'), PersonToCompanyRelationship(role=, location=Location(city='Düsseldorf', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Peter', lastname='Lauwers'), date_of_birth='1970-03-26'), PersonToCompanyRelationship(role=, location=Location(city='Erkrath', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Erkul', lastname='Basaran'), date_of_birth='1977-05-06'), PersonToCompanyRelationship(role=, location=Location(city='Bochum', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Henrik', lastname='Böttner'), date_of_birth='1982-11-07'), PersonToCompanyRelationship(role=, location=Location(city='Oelde', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Ulrich', lastname='Raßenhövel'), date_of_birth='1969-04-16'), PersonToCompanyRelationship(role=, location=Location(city='Herdecke', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Andreas', lastname='Naroska'), date_of_birth='1967-03-23'), PersonToCompanyRelationship(role=, location=Location(city='Witten', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Mark', lastname='Kramps'), date_of_birth='1967-09-04'), PersonToCompanyRelationship(role=, location=Location(city='Dortmund', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Ralf', lastname='Barkmeyer'), date_of_birth='1974-02-28'), PersonToCompanyRelationship(role=, location=Location(city='Tönnisvorst', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Holger', lastname='Siegwarth'), date_of_birth='1967-05-13'), PersonToCompanyRelationship(role=, location=Location(city='Herne', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Oliver', lastname='Liß'), date_of_birth='1981-04-13'), PersonToCompanyRelationship(role=, location=Location(city='Göppingen', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Liang', lastname='Cheng'), date_of_birth='1980-12-29'), PersonToCompanyRelationship(role=, location=Location(city='Beckum', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Astrid', lastname='Dörner-Rodeheger'), date_of_birth='1968-12-24'), PersonToCompanyRelationship(role=, location=Location(city='Dortmund', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Jon', lastname='Lange'), date_of_birth='1978-04-25'), PersonToCompanyRelationship(role=, location=Location(city='Werne', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Ralf', lastname='Frombach'), date_of_birth='1977-01-25'), PersonToCompanyRelationship(role=, location=Location(city='Berlin', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Sven', lastname='Hommel'), date_of_birth='1979-04-22'), PersonToCompanyRelationship(role=, location=Location(city='Oberhausen', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Matthias', lastname='Peters'), date_of_birth='1973-08-28')], company_type=, capital=Capital(value=5115000.0, currency=, type=), business_purpose='Entwicklung, Herstellung und der Vertrieb von Landtechnik, insbesondere\\n von Komponenten und Anlagen (a) zur Gewinnung, Kühlung, Behandlung und Lagerung von\\n Milch; (b) für das Milchvieh-Herdenmanagement; (c) zur Tierhygiene und Sicherung der\\n Milchqualität und (d) zur Aufstallung von Tieren; sowie die Herstellung und der\\n Vertrieb von Anlagen und Fahrzeugen zur Aufbereitung und zum Transport von Gülle.', founding_date='1995-04-25')\n" - ] - } - ], - "source": [ - "import json\n", - "import dataclasses\n", - "from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform import map_unternehmensregister_json\n", - "\n", - "with open('../tmp/json/GEAFarmTechnologiesGmbH.json', \"r\") as file:\n", - " content = json.load(file)\n", - " company_data = map_unternehmensregister_json(content)\n", - " print(company_data)\n", - " with open('../tmp/transformed/GEAFarmTechnologiesGmbH.json', \"w+\", encoding=\"utf-8\") as file:\n", - " json.dump(dataclasses.asdict(company_data), file, ensure_ascii=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Company(id=CompanyID(district_court=DistrictCourt(name='Amtsgericht Charlottenburg', city='Berlin'), hr_number='HRB 153385 B'), location=Location(city='Berlin', street='Valeska-Gert-Straße', house_number='5', zip_code='10243'), name='Zalando Lounge Service GmbH', last_update='2022-10-05', relationships=[PersonToCompanyRelationship(role=, location=Location(city='Berlin', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Martin', lastname='Rost'), date_of_birth='1982-09-24'), PersonToCompanyRelationship(role=, location=Location(city='Berlin', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Karen', lastname='Kennes'), date_of_birth='1979-06-22'), PersonToCompanyRelationship(role=, location=Location(city='Berlin', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Andre', lastname='Hörschel'), date_of_birth='1973-06-15')], company_type=, capital=Capital(value=25000.0, currency=, type=), business_purpose='Die Erbringung von Dienstleistungen für e-Commerce Unternehmen im Bereich Kundenservice und Logistik.', founding_date='2014-02-05')\n" - ] - } - ], - "source": [ - "with open('../tmp/json/ZalandoLoungeServiceGmbH.json', \"r\") as file:\n", - " content = json.load(file)\n", - " company_data = map_unternehmensregister_json(content)\n", - " print(company_data)\n", - " with open('../tmp/transformed/ZalandoLoungeServiceGmbH.json', \"w+\", encoding=\"utf-8\") as file:\n", - " json.dump(dataclasses.asdict(company_data), file, ensure_ascii=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Company(id=CompanyID(district_court=DistrictCourt(name='Amtsgericht Charlottenburg', city='Berlin'), hr_number='HRB 158855 B'), location=Location(city='Berlin', street='Valeska-Gert-Straße', house_number='5', zip_code='10243'), name='Zalando SE', last_update='2023-07-04', relationships=[PersonToCompanyRelationship(role=, location=Location(city='Berlin', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Robert', lastname='Gentz'), date_of_birth='1983-09-24'), PersonToCompanyRelationship(role=, location=Location(city='Berlin', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='David', lastname='Schneider'), date_of_birth='1982-07-29'), PersonToCompanyRelationship(role=, location=Location(city='Berlin', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='David', lastname='Schröder'), date_of_birth='1982-11-27'), PersonToCompanyRelationship(role=, location=Location(city='Berlin', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Astrid', lastname='Arndt'), date_of_birth='1971-11-05'), PersonToCompanyRelationship(role=, location=Location(city='Passau', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Sandra', lastname='Dembeck'), date_of_birth='1974-03-06'), PersonToCompanyRelationship(role=, location=Location(city='Berlin', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Martin', lastname='Rost'), date_of_birth='1982-09-24'), PersonToCompanyRelationship(role=, location=Location(city='Berlin', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Jan-Hendrik', lastname='Bartels'), date_of_birth='1980-11-05'), PersonToCompanyRelationship(role=, location=Location(city='Berlin', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Ulrich', lastname='Kalk'), date_of_birth='1978-05-27'), PersonToCompanyRelationship(role=, location=Location(city='Berlin', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Anne', lastname='Pascual'), date_of_birth='1976-01-03'), PersonToCompanyRelationship(role=, location=Location(city='Knokke/Belgien', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Bruno', lastname='Vanhoorickx'), date_of_birth='1981-08-25'), PersonToCompanyRelationship(role=, location=Location(city='Berlin', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Andreas', lastname='Antrup'), date_of_birth='1983-06-27'), PersonToCompanyRelationship(role=, location=Location(city='Berlin', street=None, house_number=None, zip_code=None), type=, name=PersonName(firstname='Lena', lastname='Wallenhorst'), date_of_birth='1978-07-20')], company_type=, capital=Capital(value=263531672.0, currency=, type=), business_purpose='Die Entwicklung, Vermarktung und Erbringung von Internetdienstleistungen (E-Commerce-Handel mit Waren verschiedener Art, insbesondere Bekleidung und Schuhe), die Entwicklung, Herstellung, Vermarktung und der Handel mit solchen Waren, insbesondere Bekleidung und Schuhe, die Erbringung von Logistikdienstleistungen, digitalen Dienstleistungen und alle mit dem vorgenannten Unternehmensgegenstand zusammenhängenden Dienstleistungen.', founding_date='2023-05-24')\n" - ] - } - ], - "source": [ - "with open('../tmp/json/ZalandoSE.json', \"r\") as file:\n", - " content = json.load(file)\n", - " company_data = map_unternehmensregister_json(content)\n", - " print(company_data)\n", - " with open('../tmp/transformed/ZalandoSE.json', \"w+\", encoding=\"utf-8\") as file:\n", - " json.dump(dataclasses.asdict(company_data), file, ensure_ascii=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Test" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "aki-prj23-transparenzregister-jVJfu35g-py3.11", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From 7605858234d80882771b7c0f67d10fd04c53f4a8 Mon Sep 17 00:00:00 2001 From: TrisNol Date: Sat, 4 Nov 2023 09:18:05 +0100 Subject: [PATCH 09/14] checkpoint: Apply missing docstrings --- .../data_extraction/unternehmensregister/transform/common.py | 1 + .../unternehmensregister/transform/v1/__init__.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/common.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/common.py index e69de29..25f54b1 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/common.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/common.py @@ -0,0 +1 @@ +"""Common functions for data transformation.""" diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v1/__init__.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v1/__init__.py index e69de29..a172906 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v1/__init__.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v1/__init__.py @@ -0,0 +1 @@ +"""Module for transforming Unternehmensregister data from v1 to Transparenzregister API data model.""" From 61f94fa3b958873ac2c380b7d86a853550846a86 Mon Sep 17 00:00:00 2001 From: TrisNol Date: Sat, 4 Nov 2023 10:32:35 +0100 Subject: [PATCH 10/14] test: Unit tests --- .../unternehmensregister/transform/common.py | 122 +++++++++++++++ .../unternehmensregister/transform/v1/v1.py | 92 +---------- .../unternehmensregister/transform/v3/v3.py | 119 +-------------- .../transform/common_test.py | 144 ++++++++++++++++++ .../unternehmensregister/transform/v1_test.py | 115 -------------- 5 files changed, 277 insertions(+), 315 deletions(-) create mode 100644 tests/utils/data_extraction/unternehmensregister/transform/common_test.py diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/common.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/common.py index 25f54b1..6d30ad6 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/common.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/common.py @@ -1 +1,123 @@ """Common functions for data transformation.""" +import re +import typing +from collections.abc import Sequence + +from aki_prj23_transparenzregister.models.company import ( + CompanyRelationshipEnum, + CompanyToCompanyRelationship, + Location, + RelationshipRoleEnum, +) +from aki_prj23_transparenzregister.utils.string_tools import ( + transform_date_to_iso, +) + + +def traversal(data: dict, path: Sequence[str | int | object]) -> typing.Any: + """Traverse a dict using list of keys. + + Args: + data (dict): Data export + path (Sequence[str | int | object]): List of keys + + Raises: + KeyError: If key not found + + Returns: + any: Value at the end of the path + """ + current = data + for key in path: + try: + current = current[key] + except KeyError as e: + raise KeyError(f"Key {key} not found") from e + return current + + +def normalize_street(street: str) -> str: + """Normalize street names by extending them to `Straße` or `straße`. + + Args: + street (str): Name of street + + Returns: + str: Normalized street name + """ + if street is None: + return None + regex = r"(Str\.|Strasse)" + street = re.sub(regex, "Straße", street) + regex = r"(str\.|strasse)" + street = re.sub(regex, "straße", street) + return street.strip() + + +def extract_date_from_string(value: str) -> str | None: + """Extract a date in ISO format from the given string if possible. + + Args: + value (str): Input text + + Returns: + str | None: Date in ISO format, None if not found + """ + date_regex = [ # type: ignore + {"regex": r"\d{1,2}\.\d{1,2}\.\d{4}", "mapper": transform_date_to_iso}, + {"regex": r"\d{4}-\d{1,2}-\d{1,2}", "mapper": None}, + ] + results = [] + for regex in date_regex: + result = re.findall(regex["regex"], value) # type: ignore + if len(result) == 1: + relevant_data = result[0] + if regex["mapper"] is not None: # type: ignore + results.append(regex["mapper"](relevant_data)) # type: ignore + else: + results.append(relevant_data) + if len(results) != 1: + return None + return results[0] + + +def map_co_relation(data: dict) -> dict: + """Search for and map the c/o relation from location.street if possible. + + Args: + data (dict): Company dict + + Returns: + dict: Modified Company dict + """ + street = data["location"].street + if street is None: + return data + parts = street.split(",") + co_company = None + co_company_index = None + for index, part in enumerate(parts): + trimmed_part = part.strip() + result = re.findall(r"^c\/o(.*)$", trimmed_part) + if len(result) == 1: + co_company = result[0].strip() + co_company_index = index + if co_company_index is not None: + del parts[co_company_index] + street = "".join(parts).strip() + data["location"].street = street + + if co_company is not None and co_company != "": + relation = CompanyToCompanyRelationship( + RelationshipRoleEnum.CARE_OF, # type: ignore + Location( + data["location"].city, + street, + data["location"].house_number, + data["location"].zip_code, + ), + CompanyRelationshipEnum.COMPANY, # type: ignore + co_company, + ) + data["relationships"].append(relation) + return data diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v1/v1.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v1/v1.py index d9e8868..834b1e5 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v1/v1.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v1/v1.py @@ -17,6 +17,11 @@ from aki_prj23_transparenzregister.models.company import ( PersonToCompanyRelationship, RelationshipRoleEnum, ) +from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.common import ( + extract_date_from_string, + map_co_relation, + normalize_street, +) from aki_prj23_transparenzregister.utils.string_tools import ( remove_traling_and_leading_quotes, transform_date_to_iso, @@ -149,24 +154,6 @@ def parse_stakeholder(data: dict) -> CompanyRelationship | None: return None -def normalize_street(street: str) -> str: - """Normalize street names by extending them to `Straße` or `straße`. - - Args: - street (str): Name of street - - Returns: - str: Normalized street name - """ - if street is None: - return None - regex = r"(Str\.|Strasse)" - street = re.sub(regex, "Straße", street) - regex = r"(str\.|strasse)" - street = re.sub(regex, "straße", street) - return street.strip() - - def loc_from_beteiligung(data: dict) -> Location: """Extract the company location from the first relationship in the export. @@ -338,33 +325,6 @@ def map_business_purpose(data: dict) -> str | None: return None -def extract_date_from_string(value: str) -> str | None: - """Extract a date in ISO format from the given string if possible. - - Args: - value (str): Input text - - Returns: - str | None: Date in ISO format, None if not found - """ - date_regex = [ # type: ignore - {"regex": r"\d{1,2}\.\d{1,2}\.\d{4}", "mapper": transform_date_to_iso}, - {"regex": r"\d{4}-\d{1,2}-\d{1,2}", "mapper": None}, - ] - results = [] - for regex in date_regex: - result = re.findall(regex["regex"], value) # type: ignore - if len(result) == 1: - relevant_data = result[0] - if regex["mapper"] is not None: # type: ignore - results.append(regex["mapper"](relevant_data)) # type: ignore - else: - results.append(relevant_data) - if len(results) != 1: - return None - return results[0] - - def map_founding_date(data: dict) -> str | None: """Extracts the founding date from a given Unternehmensregister export. @@ -457,48 +417,6 @@ def map_last_update(data: dict) -> str: return data["XJustiz_Daten"]["Fachdaten_Register"]["Auszug"]["letzte_Eintragung"] -def map_co_relation(data: dict) -> dict: - """Search for and map the c/o relation from location.street if possible. - - Args: - data (dict): Company dict - - Returns: - dict: Modified Company dict - """ - street = data["location"].street - if street is None: - return data - parts = street.split(",") - co_company = None - co_company_index = None - for index, part in enumerate(parts): - trimmed_part = part.strip() - result = re.findall(r"^c\/o(.*)$", trimmed_part) - if len(result) == 1: - co_company = result[0].strip() - co_company_index = index - if co_company_index is not None: - del parts[co_company_index] - street = "".join(parts).strip() - data["location"].street = street - - if co_company is not None and co_company != "": - relation = CompanyToCompanyRelationship( - RelationshipRoleEnum.CARE_OF, # type: ignore - Location( - data["location"].city, - street, - data["location"].house_number, - data["location"].zip_code, - ), - CompanyRelationshipEnum.COMPANY, # type: ignore - co_company, - ) - data["relationships"].append(relation) - return data - - def map_unternehmensregister_json(data: dict) -> Company: """Processes the Unternehmensregister structured export to a Company by using several helper methods. diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/v3.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/v3.py index cdc9981..240231a 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/v3.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/v3.py @@ -1,8 +1,6 @@ """Transform raw Unternehmensregister export (*.xml) to processed .json files for loading.""" import re -import typing -from collections.abc import Sequence from aki_prj23_transparenzregister.models.company import ( Capital, @@ -20,6 +18,11 @@ from aki_prj23_transparenzregister.models.company import ( PersonToCompanyRelationship, RelationshipRoleEnum, ) +from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.common import ( + map_co_relation, + normalize_street, + traversal, +) from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v3.role_mapper import ( RoleMapper, ) @@ -193,24 +196,6 @@ def parse_stakeholder(data: dict) -> CompanyRelationship | None: return None -def normalize_street(street: str) -> str: - """Normalize street names by extending them to `Straße` or `straße`. - - Args: - street (str): Name of street - - Returns: - str: Normalized street name - """ - if street is None: - return None - regex = r"(Str\.|Strasse)" - street = re.sub(regex, "Straße", street) - regex = r"(str\.|strasse)" - street = re.sub(regex, "straße", street) - return street.strip() - - def loc_from_beteiligung(data: dict) -> Location: """Extract the company location from the first relationship in the export. @@ -228,7 +213,6 @@ def loc_from_beteiligung(data: dict) -> Location: "tns:beteiligter", "tns:auswahl_beteiligter", "tns:organisation", - # "tns:anschrift", ] base = traversal(data, base_path) base = base["tns:anschrift"] if "tns:anschrift" in base else base["tns:sitz"] @@ -419,33 +403,6 @@ def map_business_purpose(data: dict) -> str | None: return None -def extract_date_from_string(value: str) -> str | None: - """Extract a date in ISO format from the given string if possible. - - Args: - value (str): Input text - - Returns: - str | None: Date in ISO format, None if not found - """ - date_regex = [ # type: ignore - {"regex": r"\d{1,2}\.\d{1,2}\.\d{4}", "mapper": transform_date_to_iso}, - {"regex": r"\d{4}-\d{1,2}-\d{1,2}", "mapper": None}, - ] - results = [] - for regex in date_regex: - result = re.findall(regex["regex"], value) # type: ignore - if len(result) == 1: - relevant_data = result[0] - if regex["mapper"] is not None: # type: ignore - results.append(regex["mapper"](relevant_data)) # type: ignore - else: - results.append(relevant_data) - if len(results) != 1: - return None - return results[0] - - def map_founding_date(data: dict) -> str | None: """Extracts the founding date from a given Unternehmensregister export. @@ -480,28 +437,6 @@ def map_founding_date(data: dict) -> str | None: return None -def traversal(data: dict, path: Sequence[str | int | object]) -> typing.Any: - """Traverse a dict using list of keys. - - Args: - data (dict): Data export - path (Sequence[str | int | object]): List of keys - - Raises: - KeyError: If key not found - - Returns: - any: Value at the end of the path - """ - current = data - for key in path: - try: - current = current[key] - except KeyError as e: - raise KeyError(f"Key {key} not found") from e - return current - - def map_hr_number(data: dict) -> str: """Extract the HR number from a given Unternehmensregister export. @@ -585,48 +520,7 @@ def map_last_update(data: dict) -> str: return traversal(data, path) -def map_co_relation(data: dict) -> dict: - """Search for and map the c/o relation from location.street if possible. - - Args: - data (dict): Company dict - - Returns: - dict: Modified Company dict - """ - street = data["location"].street - if street is None: - return data - parts = street.split(",") - co_company = None - co_company_index = None - for index, part in enumerate(parts): - trimmed_part = part.strip() - result = re.findall(r"^c\/o(.*)$", trimmed_part) - if len(result) == 1: - co_company = result[0].strip() - co_company_index = index - if co_company_index is not None: - del parts[co_company_index] - street = "".join(parts).strip() - data["location"].street = street - - if co_company is not None and co_company != "": - relation = CompanyToCompanyRelationship( - RelationshipRoleEnum.CARE_OF, # type: ignore - Location( - data["location"].city, - street, - data["location"].house_number, - data["location"].zip_code, - ), - CompanyRelationshipEnum.COMPANY, # type: ignore - co_company, - ) - data["relationships"].append(relation) - return data - - +# TODO class model with inheritance - only difference: Determine root in __init__ def map_unternehmensregister_json(data: dict) -> Company: """Processes the Unternehmensregister structured export to a Company by using several helper methods. @@ -651,7 +545,6 @@ def map_unternehmensregister_json(data: dict) -> Company: result["business_purpose"] = map_business_purpose(data) result["founding_date"] = map_founding_date(data) - # TODO adapt... for i in range( 2, len(data["tns:grunddaten"]["tns:verfahrensdaten"]["tns:beteiligung"]) ): diff --git a/tests/utils/data_extraction/unternehmensregister/transform/common_test.py b/tests/utils/data_extraction/unternehmensregister/transform/common_test.py new file mode 100644 index 0000000..8a4c5b5 --- /dev/null +++ b/tests/utils/data_extraction/unternehmensregister/transform/common_test.py @@ -0,0 +1,144 @@ +"""Testing data_extraction/unternehmensregister/transform/common.py.""" +import pytest + +from aki_prj23_transparenzregister.models.company import ( + CompanyRelationshipEnum, + CompanyToCompanyRelationship, + Location, + RelationshipRoleEnum, +) +from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform import ( + common, +) + + +def test_import_common() -> None: + assert common + + +def test_traversal() -> None: + data = {"a": {"b": {"c": "d"}}} + assert common.traversal(data, ["a", "b", "c"]) == "d" + + +# def test_traversal_raises_key_error(): +# data = {"a": {"b": {"c": "d"}}} +# try: +# common.traversal(data, ["a", "b", "d"]) +# except KeyError: +# assert True +# else: +# assert False + + +@pytest.mark.parametrize( + ("value", "expected_result"), + [ + (None, None), + ("Ludwig-Ganghofer-Str.", "Ludwig-Ganghofer-Straße"), + ("Ludwig-Ganghofer-Strasse", "Ludwig-Ganghofer-Straße"), + ("Str. des Tests", "Straße des Tests"), + ], +) +def test_normalize_street(value: str, expected_result: str) -> None: + result = common.normalize_street(value) + assert result == expected_result + + +@pytest.mark.parametrize( + ("value", "expected_result"), + [ + ("", None), + ("Tag der ersten Eintragung: 01.05.2004", "2004-05-01"), + ("Tag der ersten Eintragung: 1.05.2004", "2004-05-01"), + ("Tag der ersten Eintragung: 1.5.2004", "2004-05-01"), + ("Tag der ersten Eintragung: 01.5.2004", "2004-05-01"), + ("Gesellschaftsvertrag vom 06.04.2016 Hallo Welt", "2016-04-06"), + ("Str. des Tests vom 1999-04-05", "1999-04-05"), + ("Once upon a midnight dreary while I pondered weak and weary...", None), + ( + "This company was first founded in 2016-06-10 and then again on 1.5.2004", + None, + ), + ], +) +def test_extract_date_from_string(value: str, expected_result: str) -> None: + result = common.extract_date_from_string(value) + assert result == expected_result + + +@pytest.mark.parametrize( + ("value", "expected_result"), + [ + ( + { + "location": Location( + "", "c/o Youco24 Business Center, Abc ffda", None, None + ), + "relationships": [], + }, + { + "location": Location("", "Abc ffda", None, None), + "relationships": [ + CompanyToCompanyRelationship( + RelationshipRoleEnum.CARE_OF, # type: ignore + Location("", "Abc ffda", None, None), + CompanyRelationshipEnum.COMPANY, + "Youco24 Business Center", + ) + ], + }, + ), + ( + { + "location": Location( + "Iserlohn", "c/o Youco24 Business Center, Abc Str.", "42", "58644" + ), + "relationships": [], + }, + { + "location": Location("Iserlohn", "Abc Str.", "42", "58644"), + "relationships": [ + CompanyToCompanyRelationship( + RelationshipRoleEnum.CARE_OF, # type: ignore + Location("Iserlohn", "Abc Str.", "42", "58644"), + CompanyRelationshipEnum.COMPANY, + "Youco24 Business Center", + ) + ], + }, + ), + ( + { + "location": Location( + "Iserlohn", "Abc Str., c/o Youco24 Business Center", "42", "58644" + ), + "relationships": [], + }, + { + "location": Location("Iserlohn", "Abc Str.", "42", "58644"), + "relationships": [ + CompanyToCompanyRelationship( + RelationshipRoleEnum.CARE_OF, # type: ignore + Location("Iserlohn", "Abc Str.", "42", "58644"), + CompanyRelationshipEnum.COMPANY, + "Youco24 Business Center", + ) + ], + }, + ), + ( + { + "location": Location("Iserlohn", "Abc Str., c/o", "42", "58644"), + "relationships": [], + }, + { + "location": Location("Iserlohn", "Abc Str.", "42", "58644"), + "relationships": [], + }, + ), + ], +) +def test_map_co_relation(value: dict, expected_result: dict) -> None: + result = common.map_co_relation(value) + assert result == expected_result diff --git a/tests/utils/data_extraction/unternehmensregister/transform/v1_test.py b/tests/utils/data_extraction/unternehmensregister/transform/v1_test.py index 47c525c..34b8ead 100644 --- a/tests/utils/data_extraction/unternehmensregister/transform/v1_test.py +++ b/tests/utils/data_extraction/unternehmensregister/transform/v1_test.py @@ -1,8 +1,6 @@ """Testing utils/data_extraction/unternehmensregister/transform.py.""" from unittest.mock import Mock, patch -import pytest - from aki_prj23_transparenzregister.models.company import ( Capital, CapitalTypeEnum, @@ -266,20 +264,6 @@ def test_loc_from_beteiligung_combine() -> None: assert transform.loc_from_beteiligung(data) == expected_result -@pytest.mark.parametrize( - ("value", "expected_result"), - [ - (None, None), - ("Ludwig-Ganghofer-Str.", "Ludwig-Ganghofer-Straße"), - ("Ludwig-Ganghofer-Strasse", "Ludwig-Ganghofer-Straße"), - ("Str. des Tests", "Straße des Tests"), - ], -) -def test_normalize_street(value: str, expected_result: str) -> None: - result = transform.normalize_street(value) - assert result == expected_result - - def test_name_from_beteiligung() -> None: data = { "XJustiz_Daten": { @@ -582,28 +566,6 @@ def test_map_business_purpose_no_result() -> None: assert result is None -@pytest.mark.parametrize( - ("value", "expected_result"), - [ - ("", None), - ("Tag der ersten Eintragung: 01.05.2004", "2004-05-01"), - ("Tag der ersten Eintragung: 1.05.2004", "2004-05-01"), - ("Tag der ersten Eintragung: 1.5.2004", "2004-05-01"), - ("Tag der ersten Eintragung: 01.5.2004", "2004-05-01"), - ("Gesellschaftsvertrag vom 06.04.2016 Hallo Welt", "2016-04-06"), - ("Str. des Tests vom 1999-04-05", "1999-04-05"), - ("Once upon a midnight dreary while I pondered weak and weary...", None), - ( - "This company was first founded in 2016-06-10 and then again on 1.5.2004", - None, - ), - ], -) -def test_extract_date_from_string(value: str, expected_result: str) -> None: - result = transform.extract_date_from_string(value) - assert result == expected_result - - def test_map_founding_date_from_tag_der_ersten_eintragung() -> None: data = { "some entry": "Tag der ersten Eintragung: 01.05.2004", @@ -690,83 +652,6 @@ def test_map_last_update() -> None: assert result == date -@pytest.mark.parametrize( - ("value", "expected_result"), - [ - ( - { - "location": Location( - "", "c/o Youco24 Business Center, Abc ffda", None, None - ), - "relationships": [], - }, - { - "location": Location("", "Abc ffda", None, None), - "relationships": [ - CompanyToCompanyRelationship( - RelationshipRoleEnum.CARE_OF, # type: ignore - Location("", "Abc ffda", None, None), - CompanyRelationshipEnum.COMPANY, - "Youco24 Business Center", - ) - ], - }, - ), - ( - { - "location": Location( - "Iserlohn", "c/o Youco24 Business Center, Abc Str.", "42", "58644" - ), - "relationships": [], - }, - { - "location": Location("Iserlohn", "Abc Str.", "42", "58644"), - "relationships": [ - CompanyToCompanyRelationship( - RelationshipRoleEnum.CARE_OF, # type: ignore - Location("Iserlohn", "Abc Str.", "42", "58644"), - CompanyRelationshipEnum.COMPANY, - "Youco24 Business Center", - ) - ], - }, - ), - ( - { - "location": Location( - "Iserlohn", "Abc Str., c/o Youco24 Business Center", "42", "58644" - ), - "relationships": [], - }, - { - "location": Location("Iserlohn", "Abc Str.", "42", "58644"), - "relationships": [ - CompanyToCompanyRelationship( - RelationshipRoleEnum.CARE_OF, # type: ignore - Location("Iserlohn", "Abc Str.", "42", "58644"), - CompanyRelationshipEnum.COMPANY, - "Youco24 Business Center", - ) - ], - }, - ), - ( - { - "location": Location("Iserlohn", "Abc Str., c/o", "42", "58644"), - "relationships": [], - }, - { - "location": Location("Iserlohn", "Abc Str.", "42", "58644"), - "relationships": [], - }, - ), - ], -) -def test_map_co_relation(value: dict, expected_result: dict) -> None: - result = transform.map_co_relation(value) - assert result == expected_result - - @patch( "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.map_co_relation" ) From e8d1a37cff6aca70cbfea0a74bd242353b793b15 Mon Sep 17 00:00:00 2001 From: TrisNol Date: Sat, 4 Nov 2023 14:19:41 +0100 Subject: [PATCH 11/14] test: Extend unit tests --- .../unternehmensregister/load_test.py | 38 +++++++++++++++++++ .../transform/common_test.py | 12 ++---- .../transform/role_mapper_test.py | 13 +++++++ 3 files changed, 55 insertions(+), 8 deletions(-) create mode 100644 tests/utils/data_extraction/unternehmensregister/transform/role_mapper_test.py diff --git a/tests/utils/data_extraction/unternehmensregister/load_test.py b/tests/utils/data_extraction/unternehmensregister/load_test.py index 6f6b58b..dd71859 100644 --- a/tests/utils/data_extraction/unternehmensregister/load_test.py +++ b/tests/utils/data_extraction/unternehmensregister/load_test.py @@ -1,4 +1,8 @@ """Test load utils from Unternehmensregister.""" +import json +import tempfile +from unittest.mock import Mock, patch + from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister import ( load, ) @@ -6,3 +10,37 @@ from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister im def test_smoke() -> None: assert load + + +@patch( + "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.load.CompanyMongoService" +) +def test_load_directory_to_mongo(mock_company_service: Mock) -> None: + mock_company_service.migration_of_base_data.return_value = None + with tempfile.TemporaryDirectory() as tmp_dir: + with open(f"{tmp_dir}/test.json", "w") as f: + mock_company = { + "id": { + "district_court": { + "name": "Amtsgericht Hamburg", + "city": "Hamburg", + }, + "hr_number": "HRB 47899", + }, + "location": { + "city": "Hamburg", + "street": "Heußweg", + "house_number": "35", + "zip_code": "20255", + }, + "name": "Aurelius Immo GmbH", + "last_update": "2021-07-05", + "relationships": [], + "business_purpose": "Erwerb und Verwaltung von Immobilien; Geschäftsführung von Immobilienfonds und anderen Gesellschaften; Dienstleistungen in diesem Zusammenhang.", + "capital": {"value": 50000, "currency": "DM", "type": "Stammkapital"}, + "company_type": "Gesellschaft mit beschränkter Haftung", + "founding_date": "1977-03-03", + } + json.dump(mock_company, f) + result = load.load_directory_to_mongo(tmp_dir, mock_company_service) + assert result == 1 diff --git a/tests/utils/data_extraction/unternehmensregister/transform/common_test.py b/tests/utils/data_extraction/unternehmensregister/transform/common_test.py index 8a4c5b5..3c62864 100644 --- a/tests/utils/data_extraction/unternehmensregister/transform/common_test.py +++ b/tests/utils/data_extraction/unternehmensregister/transform/common_test.py @@ -21,14 +21,10 @@ def test_traversal() -> None: assert common.traversal(data, ["a", "b", "c"]) == "d" -# def test_traversal_raises_key_error(): -# data = {"a": {"b": {"c": "d"}}} -# try: -# common.traversal(data, ["a", "b", "d"]) -# except KeyError: -# assert True -# else: -# assert False +def test_traversal_raises_key_error() -> None: + data = {"a": {"b": {"c": "d"}}} + with pytest.raises(KeyError): + common.traversal(data, ["a", "b", "d"]) @pytest.mark.parametrize( diff --git a/tests/utils/data_extraction/unternehmensregister/transform/role_mapper_test.py b/tests/utils/data_extraction/unternehmensregister/transform/role_mapper_test.py new file mode 100644 index 0000000..f94f205 --- /dev/null +++ b/tests/utils/data_extraction/unternehmensregister/transform/role_mapper_test.py @@ -0,0 +1,13 @@ +"""Test role_mapper.py.""" +from aki_prj23_transparenzregister.models.company import RelationshipRoleEnum +from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v3.role_mapper import ( + RoleMapper, +) + + +def test_init() -> None: + assert isinstance(RoleMapper.mapper(), RoleMapper) + + +def test_map_role() -> None: + assert RoleMapper.mapper().get("285") == RelationshipRoleEnum.PROKURIST From f7ec3eaf249dc3a84a44032a5a745c81b0e6b380 Mon Sep 17 00:00:00 2001 From: TrisNol Date: Sun, 5 Nov 2023 12:55:47 +0100 Subject: [PATCH 12/14] test: Increase test coverage and refactor v3 --- .../unternehmensregister/transform/common.py | 133 +++ .../unternehmensregister/transform/main.py | 11 +- .../unternehmensregister/transform/v1/v1.py | 750 +++++++------- .../unternehmensregister/transform/v3/v3.py | 970 +++++++++--------- .../unternehmensregister/transform/v1_test.py | 24 +- .../unternehmensregister/transform/v3_test.py | 731 +++++++++++++ 6 files changed, 1751 insertions(+), 868 deletions(-) create mode 100644 tests/utils/data_extraction/unternehmensregister/transform/v3_test.py diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/common.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/common.py index 6d30ad6..8a75843 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/common.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/common.py @@ -1,11 +1,17 @@ """Common functions for data transformation.""" +import abc import re import typing from collections.abc import Sequence from aki_prj23_transparenzregister.models.company import ( + Capital, + Company, + CompanyID, + CompanyRelationship, CompanyRelationshipEnum, CompanyToCompanyRelationship, + CompanyTypeEnum, Location, RelationshipRoleEnum, ) @@ -121,3 +127,130 @@ def map_co_relation(data: dict) -> dict: ) data["relationships"].append(relation) return data + + +class BaseTransformer(metaclass=abc.ABCMeta): + """Generic abstract class for data transformation between Unternehmensregister and Transparenzregister API.""" + + @abc.abstractmethod + def parse_date_of_birth(self, data: dict) -> str | None: + """Retreives the date of birth from a stakeholder entry if possible. + + Args: + data (dict): Stakeholder data + + Returns: + str | None: date of birth or None if not found + """ + + @abc.abstractmethod + def parse_stakeholder(self, data: dict) -> CompanyRelationship | None: + """Extract the company stakeholder/relation from a single "Beteiligung". + + Args: + data (dict): Data export + + Returns: + CompanyRelationship | None: Relationship if it could be processed + """ + + @abc.abstractmethod + def loc_from_beteiligung(self, data: dict) -> Location: + """Extract the company location from the first relationship in the export. + + Args: + data (dict): Data export + + Returns: + Location: location + """ + + @abc.abstractmethod + def name_from_beteiligung(self, data: dict) -> str: + """Extract the Company name from an Unternehmensregister export by using the first relationship found. + + Args: + data (dict): Data export + + Returns: + str: Company name + """ + + @abc.abstractmethod + def map_rechtsform(self, company_name: str, data: dict) -> CompanyTypeEnum | None: + """Extracts the company type from a given Unternehmensregister export. + + Args: + company_name (str): Name of the company as a fallback solution + data (dict): Data export + + Returns: + CompanyTypeEnum | None: Company type if found + """ + + @abc.abstractmethod + def map_capital(self, data: dict, company_type: CompanyTypeEnum) -> Capital | None: + """Extracts the company capital from the given Unternehmensregister export. + + Args: + data (dict): Data export + company_type (CompanyTypeEnum): Type of company (e.g., 'Gesellschaft mit beschränkter Haftung') + + Returns: + Capital | None: Company Capital if found + """ + + @abc.abstractmethod + def map_business_purpose(self, data: dict) -> str | None: + """Extracts the "Geschäftszweck" from a given Unternehmensregister export. + + Args: + data (dict): Data export + + Returns: + str | None: Business purpose if found + """ + + @abc.abstractmethod + def map_founding_date(self, data: dict) -> str | None: + """Extracts the founding date from a given Unternehmensregister export. + + Args: + data (dict): Data export + + Returns: + str | None: Founding date if found + """ + + @abc.abstractmethod + def map_company_id(self, data: dict) -> CompanyID: + """Retrieve Company ID from export. + + Args: + data (dict): Data export + + Returns: + CompanyID: ID of the company + """ + + @abc.abstractmethod + def map_last_update(self, data: dict) -> str: + """Extract last update date from export. + + Args: + data (dict): Unternehmensregister export + + Returns: + str: Last update date + """ + + @abc.abstractmethod + def map_unternehmensregister_json(self, data: dict) -> Company: + """Processes the Unternehmensregister structured export to a Company by using several helper methods. + + Args: + data (dict): Data export + + Returns: + Company: Transformed data + """ diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/main.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/main.py index 452e620..6459311 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/main.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/main.py @@ -4,13 +4,15 @@ import glob import json import os import sys -import typing import xmltodict from loguru import logger from tqdm import tqdm from aki_prj23_transparenzregister.models.company import Company +from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.common import ( + BaseTransformer, +) from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1 import ( v1, ) @@ -42,7 +44,7 @@ def transform_xml_to_json(source_dir: str, target_dir: str) -> None: logger.error(e) -def determine_version(data: dict) -> typing.Any: +def determine_version(data: dict) -> BaseTransformer: """Determine Unternehmensregister data API version of given entry. Args: @@ -56,9 +58,9 @@ def determine_version(data: dict) -> typing.Any: """ if "XJustiz_Daten" in data: # TODO consider class inheritance for version modules - return v1 + return v1.V1_Transformer() if "tns:nachrichtenkopf" in data[list(data.keys())[0]]: - return v3 + return v3.V3_Transformer() raise ValueError("Could not determine Unternehmensregister version.") @@ -77,6 +79,7 @@ def map_unternehmensregister_json(data: dict) -> Company: if __name__ == "__main__": base_path = "./Jupyter/API-tests/Unternehmensregister/data/Unternehmensregister" + # TODO Adapt to new structure with different versions for file in tqdm(glob.glob1(f"{base_path}/export", "*.json")): path = os.path.join(f"{base_path}/export", file) with open(path, encoding="utf-8") as file_object: diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v1/v1.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v1/v1.py index 834b1e5..77993d2 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v1/v1.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v1/v1.py @@ -18,6 +18,7 @@ from aki_prj23_transparenzregister.models.company import ( RelationshipRoleEnum, ) from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.common import ( + BaseTransformer, extract_date_from_string, map_co_relation, normalize_street, @@ -28,41 +29,81 @@ from aki_prj23_transparenzregister.utils.string_tools import ( ) -def parse_date_of_birth(data: dict) -> str | None: - """Retreives the date of birth from a stakeholder entry if possible. +class V1_Transformer(BaseTransformer): # noqa: N801 + """Transformer for data exports from Unternehmensregister (v1).""" - Args: - data (dict): Stakeholder data + def parse_date_of_birth(self, data: dict) -> str | None: + """Retreives the date of birth from a stakeholder entry if possible. - Returns: - str | None: date of birth or None if not found - """ - if "Geburt" in (base := data["Beteiligter"]["Natuerliche_Person"]): - base = base["Geburt"]["Geburtsdatum"] - if isinstance(base, str): - return base - return None + Args: + data (dict): Stakeholder data + Returns: + str | None: date of birth or None if not found + """ + if "Geburt" in (base := data["Beteiligter"]["Natuerliche_Person"]): + base = base["Geburt"]["Geburtsdatum"] + if isinstance(base, str): + return base + return None -def parse_stakeholder(data: dict) -> CompanyRelationship | None: - """Extract the company stakeholder/relation from a single "Beteiligung". + def parse_stakeholder(self, data: dict) -> CompanyRelationship | None: + """Extract the company stakeholder/relation from a single "Beteiligung". - Args: - data (dict): Data export + Args: + data (dict): Data export - Returns: - CompanyRelationship | None: Relationship if it could be processed - """ - if "Natuerliche_Person" in data["Beteiligter"]: - # It's a Company serving as a "Kommanditist" or similar - if data["Beteiligter"]["Natuerliche_Person"]["Voller_Name"]["Vorname"] is None: - return CompanyToCompanyRelationship( + Returns: + CompanyRelationship | None: Relationship if it could be processed + """ + if "Natuerliche_Person" in data["Beteiligter"]: + # It's a Company serving as a "Kommanditist" or similar + if ( + data["Beteiligter"]["Natuerliche_Person"]["Voller_Name"]["Vorname"] + is None + ): + return CompanyToCompanyRelationship( + **{ # type: ignore + "name": remove_traling_and_leading_quotes( + data["Beteiligter"]["Natuerliche_Person"]["Voller_Name"][ + "Nachname" + ] + ), + "location": Location( + **{ + "city": data["Beteiligter"]["Natuerliche_Person"][ + "Anschrift" + ][-1]["Ort"] + if isinstance( + data["Beteiligter"]["Natuerliche_Person"][ + "Anschrift" + ], + list, + ) + else data["Beteiligter"]["Natuerliche_Person"][ + "Anschrift" + ]["Ort"] + } + ), + "role": RelationshipRoleEnum( + data["Rolle"]["Rollenbezeichnung"]["content"] + ), + "type": CompanyRelationshipEnum.COMPANY, + } + ) + return PersonToCompanyRelationship( **{ # type: ignore - "name": remove_traling_and_leading_quotes( - data["Beteiligter"]["Natuerliche_Person"]["Voller_Name"][ - "Nachname" - ] + "name": PersonName( + **{ + "firstname": data["Beteiligter"]["Natuerliche_Person"][ + "Voller_Name" + ]["Vorname"], + "lastname": data["Beteiligter"]["Natuerliche_Person"][ + "Voller_Name" + ]["Nachname"], + } ), + "date_of_birth": self.parse_date_of_birth(data), "location": Location( **{ "city": data["Beteiligter"]["Natuerliche_Person"][ @@ -80,372 +121,339 @@ def parse_stakeholder(data: dict) -> CompanyRelationship | None: "role": RelationshipRoleEnum( data["Rolle"]["Rollenbezeichnung"]["content"] ), + "type": CompanyRelationshipEnum.PERSON, + } + ) + if "Organisation" in data["Beteiligter"]: + return CompanyToCompanyRelationship( + **{ # type: ignore + "role": RelationshipRoleEnum( + data["Rolle"]["Rollenbezeichnung"]["content"] + ), + "name": remove_traling_and_leading_quotes( + data["Beteiligter"]["Organisation"]["Bezeichnung"][ + "Bezeichnung_Aktuell" + ] + ), + "location": Location( + **{ + "city": data["Beteiligter"]["Organisation"]["Anschrift"][ + "Ort" + ], + "street": data["Beteiligter"]["Organisation"]["Anschrift"][ + "Strasse" + ] + if "Strasse" + in data["Beteiligter"]["Organisation"]["Anschrift"] + else None, + "house_number": data["Beteiligter"]["Organisation"][ + "Anschrift" + ]["Hausnummer"] + if "Hausnummer" + in data["Beteiligter"]["Organisation"]["Anschrift"] + else None, + "zip_code": data["Beteiligter"]["Organisation"][ + "Anschrift" + ]["Postleitzahl"] + if "Postleitzahl" + in data["Beteiligter"]["Organisation"]["Anschrift"] + else None, + } + ), "type": CompanyRelationshipEnum.COMPANY, } ) - return PersonToCompanyRelationship( - **{ # type: ignore - "name": PersonName( - **{ - "firstname": data["Beteiligter"]["Natuerliche_Person"][ - "Voller_Name" - ]["Vorname"], - "lastname": data["Beteiligter"]["Natuerliche_Person"][ - "Voller_Name" - ]["Nachname"], - } - ), - "date_of_birth": parse_date_of_birth(data), - "location": Location( - **{ - "city": data["Beteiligter"]["Natuerliche_Person"]["Anschrift"][ - -1 - ]["Ort"] - if isinstance( - data["Beteiligter"]["Natuerliche_Person"]["Anschrift"], list - ) - else data["Beteiligter"]["Natuerliche_Person"]["Anschrift"][ - "Ort" - ] - } - ), - "role": RelationshipRoleEnum( - data["Rolle"]["Rollenbezeichnung"]["content"] - ), - "type": CompanyRelationshipEnum.PERSON, - } - ) - if "Organisation" in data["Beteiligter"]: - return CompanyToCompanyRelationship( - **{ # type: ignore - "role": RelationshipRoleEnum( - data["Rolle"]["Rollenbezeichnung"]["content"] - ), - "name": remove_traling_and_leading_quotes( - data["Beteiligter"]["Organisation"]["Bezeichnung"][ - "Bezeichnung_Aktuell" - ] - ), - "location": Location( - **{ - "city": data["Beteiligter"]["Organisation"]["Anschrift"]["Ort"], - "street": data["Beteiligter"]["Organisation"]["Anschrift"][ - "Strasse" - ] - if "Strasse" in data["Beteiligter"]["Organisation"]["Anschrift"] - else None, - "house_number": data["Beteiligter"]["Organisation"][ - "Anschrift" - ]["Hausnummer"] - if "Hausnummer" - in data["Beteiligter"]["Organisation"]["Anschrift"] - else None, - "zip_code": data["Beteiligter"]["Organisation"]["Anschrift"][ - "Postleitzahl" - ] - if "Postleitzahl" - in data["Beteiligter"]["Organisation"]["Anschrift"] - else None, - } - ), - "type": CompanyRelationshipEnum.COMPANY, - } - ) - return None - - -def loc_from_beteiligung(data: dict) -> Location: - """Extract the company location from the first relationship in the export. - - Args: - data (dict): Data export - - Returns: - Location: location - """ - base = data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"]["Beteiligung"][0][ - "Beteiligter" - ]["Organisation"]["Anschrift"] - - house_number = None - street = None - if "Strasse" in base: - regex = r".(\d+)$" - hits = re.findall(regex, base["Strasse"]) - if len(hits) == 1: - house_number = hits[0] - street = base["Strasse"][: (-1 * len(house_number))] - if "Hausnummer" in base: - house_number = house_number + base["Hausnummer"] - else: - if "Hausnummer" in base: - house_number = base["Hausnummer"] - street = base["Strasse"] - return Location( - **{ - "city": base["Ort"], - "zip_code": base["Postleitzahl"], - "street": normalize_street(street), # type: ignore - "house_number": house_number, - } - ) - - -def name_from_beteiligung(data: dict) -> str: - """Extract the Company name from an Unternehmensregister export by using the first relationship found. - - Args: - data (dict): Data export - - Returns: - str: Company name - """ - name = data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"]["Beteiligung"][0][ - "Beteiligter" - ]["Organisation"]["Bezeichnung"]["Bezeichnung_Aktuell"] - return remove_traling_and_leading_quotes(name) - - -def map_rechtsform(company_name: str, data: dict) -> CompanyTypeEnum | None: - """Extracts the company type from a given Unternehmensregister export. - - Args: - company_name (str): Name of the company as a fallback solution - data (dict): Data export - - Returns: - CompanyTypeEnum | None: Company type if found - """ - try: - return CompanyTypeEnum( - data["XJustiz_Daten"]["Fachdaten_Register"]["Basisdaten_Register"][ - "Rechtstraeger" - ]["Rechtsform"]["content"] - ) - except KeyError: - if ( - company_name.endswith("GmbH") - or company_name.endswith("UG") - or company_name.endswith("UG (haftungsbeschränkt)") - ): - return CompanyTypeEnum("Gesellschaft mit beschränkter Haftung") - if company_name.endswith("SE"): - return CompanyTypeEnum("Europäische Aktiengesellschaft (SE)") - if company_name.endswith("KG"): - return CompanyTypeEnum("Kommanditgesellschaft") return None + def loc_from_beteiligung(self, data: dict) -> Location: + """Extract the company location from the first relationship in the export. -def map_capital(data: dict, company_type: CompanyTypeEnum) -> Capital | None: - """Extracts the company capital from the given Unternehmensregister export. + Args: + data (dict): Data export - Args: - data (dict): Data export - company_type (CompanyTypeEnum): Type of company (e.g., 'Gesellschaft mit beschränkter Haftung') + Returns: + Location: location + """ + base = data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"]["Beteiligung"][0][ + "Beteiligter" + ]["Organisation"]["Anschrift"] - Returns: - Capital | None: Company Capital if found - """ - # Early return - if "Zusatzangaben" not in data["XJustiz_Daten"]["Fachdaten_Register"]: - return None - capital: dict = {"Zahl": 0.0, "Waehrung": ""} - if company_type == CompanyTypeEnum.KG: - capital_type = "Hafteinlage" - base = data["XJustiz_Daten"]["Fachdaten_Register"]["Zusatzangaben"][ - "Personengesellschaft" - ]["Zusatz_KG"]["Daten_Kommanditist"] - if isinstance(base, list): - for entry in base: - # TODO link to persons using Ref_Rollennummer then extract ["Hafteinlage"] as below - capital["Zahl"] = capital["Zahl"] + float(entry["Hafteinlage"]["Zahl"]) - capital["Waehrung"] = entry["Hafteinlage"]["Waehrung"] - elif isinstance(base, dict): - capital = base["Hafteinlage"] - elif company_type in [ - CompanyTypeEnum.GMBH, - CompanyTypeEnum.SE, - CompanyTypeEnum.AG, - CompanyTypeEnum.KGaA, - CompanyTypeEnum.AUSLAENDISCHE_RECHTSFORM, - CompanyTypeEnum.OHG, - ]: - if ( - "Kapitalgesellschaft" - not in data["XJustiz_Daten"]["Fachdaten_Register"]["Zusatzangaben"] - ): + house_number = None + street = None + if "Strasse" in base: + regex = r".(\d+)$" + hits = re.findall(regex, base["Strasse"]) + if len(hits) == 1: + house_number = hits[0] + street = base["Strasse"][: (-1 * len(house_number))] + if "Hausnummer" in base: + house_number = house_number + base["Hausnummer"] + else: + if "Hausnummer" in base: + house_number = base["Hausnummer"] + street = base["Strasse"] + return Location( + **{ + "city": base["Ort"], + "zip_code": base["Postleitzahl"], + "street": normalize_street(street), # type: ignore + "house_number": house_number, + } + ) + + def name_from_beteiligung(self, data: dict) -> str: + """Extract the Company name from an Unternehmensregister export by using the first relationship found. + + Args: + data (dict): Data export + + Returns: + str: Company name + """ + name = data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"]["Beteiligung"][0][ + "Beteiligter" + ]["Organisation"]["Bezeichnung"]["Bezeichnung_Aktuell"] + return remove_traling_and_leading_quotes(name) + + def map_rechtsform(self, company_name: str, data: dict) -> CompanyTypeEnum | None: + """Extracts the company type from a given Unternehmensregister export. + + Args: + company_name (str): Name of the company as a fallback solution + data (dict): Data export + + Returns: + CompanyTypeEnum | None: Company type if found + """ + try: + return CompanyTypeEnum( + data["XJustiz_Daten"]["Fachdaten_Register"]["Basisdaten_Register"][ + "Rechtstraeger" + ]["Rechtsform"]["content"] + ) + except KeyError: + if ( + company_name.endswith("GmbH") + or company_name.endswith("UG") + or company_name.endswith("UG (haftungsbeschränkt)") + ): + return CompanyTypeEnum("Gesellschaft mit beschränkter Haftung") + if company_name.endswith("SE"): + return CompanyTypeEnum("Europäische Aktiengesellschaft (SE)") + if company_name.endswith("KG"): + return CompanyTypeEnum("Kommanditgesellschaft") + return None + + def map_capital(self, data: dict, company_type: CompanyTypeEnum) -> Capital | None: + """Extracts the company capital from the given Unternehmensregister export. + + Args: + data (dict): Data export + company_type (CompanyTypeEnum): Type of company (e.g., 'Gesellschaft mit beschränkter Haftung') + + Returns: + Capital | None: Company Capital if found + """ + # Early return + if "Zusatzangaben" not in data["XJustiz_Daten"]["Fachdaten_Register"]: + return None + capital: dict = {"Zahl": 0.0, "Waehrung": ""} + if company_type == CompanyTypeEnum.KG: + capital_type = "Hafteinlage" base = data["XJustiz_Daten"]["Fachdaten_Register"]["Zusatzangaben"][ "Personengesellschaft" - ] - else: - base = data["XJustiz_Daten"]["Fachdaten_Register"]["Zusatzangaben"][ + ]["Zusatz_KG"]["Daten_Kommanditist"] + if isinstance(base, list): + for entry in base: + # TODO link to persons using Ref_Rollennummer then extract ["Hafteinlage"] as below + capital["Zahl"] = capital["Zahl"] + float( + entry["Hafteinlage"]["Zahl"] + ) + capital["Waehrung"] = entry["Hafteinlage"]["Waehrung"] + elif isinstance(base, dict): + capital = base["Hafteinlage"] + elif company_type in [ + CompanyTypeEnum.GMBH, + CompanyTypeEnum.SE, + CompanyTypeEnum.AG, + CompanyTypeEnum.KGaA, + CompanyTypeEnum.AUSLAENDISCHE_RECHTSFORM, + CompanyTypeEnum.OHG, + ]: + if ( "Kapitalgesellschaft" + not in data["XJustiz_Daten"]["Fachdaten_Register"]["Zusatzangaben"] + ): + base = data["XJustiz_Daten"]["Fachdaten_Register"]["Zusatzangaben"][ + "Personengesellschaft" + ] + else: + base = data["XJustiz_Daten"]["Fachdaten_Register"]["Zusatzangaben"][ + "Kapitalgesellschaft" + ] + if "Zusatz_GmbH" in base: + capital_type = "Stammkapital" + capital = base["Zusatz_GmbH"]["Stammkapital"] + elif "Zusatz_Aktiengesellschaft" in base: + capital_type = "Grundkapital" + capital = base["Zusatz_Aktiengesellschaft"]["Grundkapital"]["Hoehe"] + elif company_type in [ + CompanyTypeEnum.EINZELKAUFMANN, + CompanyTypeEnum.EG, + CompanyTypeEnum.PARTNERSCHAFT, + CompanyTypeEnum.PARTNERGESELLSCHAFT, + CompanyTypeEnum.PARTNERSCHAFTSGESELLSCHAFT, + None, + ]: + return None + # Catch entries having the dict but with null values + if not all(capital.values()): + return None + return Capital( + **{ # type: ignore + "value": float(capital["Zahl"]), + "currency": CurrencyEnum(capital["Waehrung"]), + "type": CapitalTypeEnum(capital_type), + } + ) + + def map_business_purpose(self, data: dict) -> str | None: + """Extracts the "Geschäftszweck" from a given Unternehmensregister export. + + Args: + data (dict): Data export + + Returns: + str | None: Business purpose if found + """ + try: + return data["XJustiz_Daten"]["Fachdaten_Register"]["Basisdaten_Register"][ + "Gegenstand_oder_Geschaeftszweck" ] - if "Zusatz_GmbH" in base: - capital_type = "Stammkapital" - capital = base["Zusatz_GmbH"]["Stammkapital"] - elif "Zusatz_Aktiengesellschaft" in base: - capital_type = "Grundkapital" - capital = base["Zusatz_Aktiengesellschaft"]["Grundkapital"]["Hoehe"] - elif company_type in [ - CompanyTypeEnum.EINZELKAUFMANN, - CompanyTypeEnum.EG, - CompanyTypeEnum.PARTNERSCHAFT, - CompanyTypeEnum.PARTNERGESELLSCHAFT, - CompanyTypeEnum.PARTNERSCHAFTSGESELLSCHAFT, - None, - ]: + except KeyError: + return None + + def map_founding_date(self, data: dict) -> str | None: + """Extracts the founding date from a given Unternehmensregister export. + + Args: + data (dict): Data export + + Returns: + str | None: Founding date if found + """ + text = str(data) + entry_date = re.findall( + r".Tag der ersten Eintragung:(\\n| )?(\d{1,2}\.\d{1,2}\.\d{2,4})", text + ) + if len(entry_date) == 1: + return transform_date_to_iso(entry_date[0][1]) + + entry_date = re.findall( + r".Gesellschaftsvertrag vom (\d{1,2}\.\d{1,2}\.\d{2,4})", text + ) + if len(entry_date) == 1: + return transform_date_to_iso(entry_date[0]) + if ( + "Gruendungsmetadaten" + in data["XJustiz_Daten"]["Fachdaten_Register"]["Basisdaten_Register"] + ): + return extract_date_from_string( + data["XJustiz_Daten"]["Fachdaten_Register"]["Basisdaten_Register"][ + "Gruendungsmetadaten" + ]["Gruendungsdatum"] + ) + # No reliable answer return None - # Catch entries having the dict but with null values - if not all(capital.values()): - return None - return Capital( - **{ # type: ignore - "value": float(capital["Zahl"]), - "currency": CurrencyEnum(capital["Waehrung"]), - "type": CapitalTypeEnum(capital_type), - } - ) + def map_company_id(self, data: dict) -> CompanyID: + """Retrieve Company ID from export. -def map_business_purpose(data: dict) -> str | None: - """Extracts the "Geschäftszweck" from a given Unternehmensregister export. + Args: + data (dict): Data export - Args: - data (dict): Data export + Returns: + CompanyID: ID of the company + """ + return CompanyID( + **{ + "hr_number": data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"][ + "Instanzdaten" + ]["Aktenzeichen"], + "district_court": DistrictCourt( + **{ + "name": data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"][ + "Beteiligung" + ][1]["Beteiligter"]["Organisation"]["Bezeichnung"][ + "Bezeichnung_Aktuell" + ] + if "Organisation" + in data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"][ + "Beteiligung" + ][1]["Beteiligter"] + else data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"][ + "Beteiligung" + ][1]["Beteiligter"]["Natuerliche_Person"]["Voller_Name"][ + "Nachname" + ], + "city": data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"][ + "Beteiligung" + ][1]["Beteiligter"]["Organisation"]["Sitz"]["Ort"] + if "Organisation" + in data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"][ + "Beteiligung" + ][1]["Beteiligter"] + else data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"][ + "Beteiligung" + ][1]["Beteiligter"]["Natuerliche_Person"]["Anschrift"]["Ort"], + } + ), + } + ) - Returns: - str | None: Business purpose if found - """ - try: - return data["XJustiz_Daten"]["Fachdaten_Register"]["Basisdaten_Register"][ - "Gegenstand_oder_Geschaeftszweck" + def map_last_update(self, data: dict) -> str: + """Extract last update date from export. + + Args: + data (dict): Unternehmensregister export + + Returns: + str: Last update date + """ + return data["XJustiz_Daten"]["Fachdaten_Register"]["Auszug"][ + "letzte_Eintragung" ] - except KeyError: - return None + def map_unternehmensregister_json(self, data: dict) -> Company: + """Processes the Unternehmensregister structured export to a Company by using several helper methods. -def map_founding_date(data: dict) -> str | None: - """Extracts the founding date from a given Unternehmensregister export. + Args: + data (dict): Data export - Args: - data (dict): Data export + Returns: + Company: Transformed data + """ + result: dict = {"relationships": []} - Returns: - str | None: Founding date if found - """ - text = str(data) - entry_date = re.findall( - r".Tag der ersten Eintragung:(\\n| )?(\d{1,2}\.\d{1,2}\.\d{2,4})", text - ) - if len(entry_date) == 1: - return transform_date_to_iso(entry_date[0][1]) + # TODO Refactor mapping - this is a nightmare... + result["id"] = self.map_company_id(data) + result["name"] = self.name_from_beteiligung(data) - entry_date = re.findall( - r".Gesellschaftsvertrag vom (\d{1,2}\.\d{1,2}\.\d{2,4})", text - ) - if len(entry_date) == 1: - return transform_date_to_iso(entry_date[0]) - if ( - "Gruendungsmetadaten" - in data["XJustiz_Daten"]["Fachdaten_Register"]["Basisdaten_Register"] - ): - return extract_date_from_string( - data["XJustiz_Daten"]["Fachdaten_Register"]["Basisdaten_Register"][ - "Gruendungsmetadaten" - ]["Gruendungsdatum"] - ) - # No reliable answer - return None + result["location"] = self.loc_from_beteiligung(data) + result["last_update"] = self.map_last_update(data) + result["company_type"] = self.map_rechtsform(result["name"], data) + result["capital"] = self.map_capital(data, result["company_type"]) + result["business_purpose"] = self.map_business_purpose(data) + result["founding_date"] = self.map_founding_date(data) -def map_company_id(data: dict) -> CompanyID: - """Retrieve Company ID from export. - - Args: - data (dict): Data export - - Returns: - CompanyID: ID of the company - """ - return CompanyID( - **{ - "hr_number": data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"][ - "Instanzdaten" - ]["Aktenzeichen"], - "district_court": DistrictCourt( - **{ - "name": data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"][ - "Beteiligung" - ][1]["Beteiligter"]["Organisation"]["Bezeichnung"][ - "Bezeichnung_Aktuell" - ] - if "Organisation" - in data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"][ - "Beteiligung" - ][1]["Beteiligter"] - else data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"][ - "Beteiligung" - ][1]["Beteiligter"]["Natuerliche_Person"]["Voller_Name"][ - "Nachname" - ], - "city": data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"][ - "Beteiligung" - ][1]["Beteiligter"]["Organisation"]["Sitz"]["Ort"] - if "Organisation" - in data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"][ - "Beteiligung" - ][1]["Beteiligter"] - else data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"][ - "Beteiligung" - ][1]["Beteiligter"]["Natuerliche_Person"]["Anschrift"]["Ort"], - } - ), - } - ) - - -def map_last_update(data: dict) -> str: - """Extract last update date from export. - - Args: - data (dict): Unternehmensregister export - - Returns: - str: Last update date - """ - return data["XJustiz_Daten"]["Fachdaten_Register"]["Auszug"]["letzte_Eintragung"] - - -def map_unternehmensregister_json(data: dict) -> Company: - """Processes the Unternehmensregister structured export to a Company by using several helper methods. - - Args: - data (dict): Data export - - Returns: - Company: Transformed data - """ - result: dict = {"relationships": []} - - # TODO Refactor mapping - this is a nightmare... - result["id"] = map_company_id(data) - result["name"] = name_from_beteiligung(data) - - result["location"] = loc_from_beteiligung(data) - result["last_update"] = map_last_update(data) - - result["company_type"] = map_rechtsform(result["name"], data) - result["capital"] = map_capital(data, result["company_type"]) - result["business_purpose"] = map_business_purpose(data) - result["founding_date"] = map_founding_date(data) - - for i in range( - 2, len(data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"]["Beteiligung"]) - ): - people = parse_stakeholder( - data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"]["Beteiligung"][i] - ) - result["relationships"].append(people) - result = map_co_relation(result) - return Company(**result) + for i in range( + 2, + len(data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"]["Beteiligung"]), + ): + people = self.parse_stakeholder( + data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"]["Beteiligung"][i] + ) + result["relationships"].append(people) + result = map_co_relation(result) + return Company(**result) diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/v3.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/v3.py index 240231a..70f97cb 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/v3.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/v3.py @@ -19,6 +19,7 @@ from aki_prj23_transparenzregister.models.company import ( RelationshipRoleEnum, ) from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.common import ( + BaseTransformer, map_co_relation, normalize_street, traversal, @@ -32,63 +33,106 @@ from aki_prj23_transparenzregister.utils.string_tools import ( ) -def parse_date_of_birth(data: dict) -> str | None: - """Retreives the date of birth from a stakeholder entry if possible. +class V3_Transformer(BaseTransformer): # noqa: N801 + """Transformer for data exports from Unternehmensregister (v3).""" - Args: - data (dict): Stakeholder data + def parse_date_of_birth(self, data: dict) -> str | None: + """Retreives the date of birth from a stakeholder entry if possible. - Returns: - str | None: date of birth or None if not found - """ - if "tns:geburt" in ( - base := data["tns:beteiligter"]["tns:auswahl_beteiligter"][ - "tns:natuerlichePerson" - ] - ): - base = base["tns:geburt"]["tns:geburtsdatum"] - if isinstance(base, str): - return base - return None + Args: + data (dict): Stakeholder data - -def map_role_id_to_enum(role_id: str) -> RelationshipRoleEnum: - """Map Unternehmensregister role ID to RelationshipRoleEnum. - - Args: - role_id (str): Unternehmensregister role ID - - Returns: - RelationshipRoleEnum: Role enum - """ - mapper = RoleMapper.mapper() - return mapper.get(role_id) - - -def parse_stakeholder(data: dict) -> CompanyRelationship | None: - """Extract the company stakeholder/relation from a single "Beteiligung". - - Args: - data (dict): Data export - - Returns: - CompanyRelationship | None: Relationship if it could be processed - """ - if "tns:natuerlichePerson" in data["tns:beteiligter"]["tns:auswahl_beteiligter"]: - # It's a Company serving as a "Kommanditist" or similar - if ( - "tns:vorname" - not in data["tns:beteiligter"]["tns:auswahl_beteiligter"][ + Returns: + str | None: date of birth or None if not found + """ + if "tns:geburt" in ( + base := data["tns:beteiligter"]["tns:auswahl_beteiligter"][ "tns:natuerlichePerson" - ]["tns:vollerName"] + ] ): - return CompanyToCompanyRelationship( + base = base["tns:geburt"]["tns:geburtsdatum"] + if isinstance(base, str): + return base + return None + + def map_role_id_to_enum(self, role_id: str) -> RelationshipRoleEnum: + """Map Unternehmensregister role ID to RelationshipRoleEnum. + + Args: + role_id (str): Unternehmensregister role ID + + Returns: + RelationshipRoleEnum: Role enum + """ + mapper = RoleMapper.mapper() + return mapper.get(role_id) + + def parse_stakeholder(self, data: dict) -> CompanyRelationship | None: + """Extract the company stakeholder/relation from a single "Beteiligung". + + Args: + data (dict): Data export + + Returns: + CompanyRelationship | None: Relationship if it could be processed + """ + if ( + "tns:natuerlichePerson" + in data["tns:beteiligter"]["tns:auswahl_beteiligter"] + ): + # It's a Company serving as a "Kommanditist" or similar + if ( + "tns:vorname" + not in data["tns:beteiligter"]["tns:auswahl_beteiligter"][ + "tns:natuerlichePerson" + ]["tns:vollerName"] + ): + return CompanyToCompanyRelationship( + **{ # type: ignore + "name": remove_traling_and_leading_quotes( + data["tns:beteiligter"]["tns:auswahl_beteiligter"][ + "tns:natuerlichePerson" + ]["tns:vollerName"]["tns:nachname"] + ), + "location": Location( + **{ + "city": data["tns:beteiligter"][ + "tns:auswahl_beteiligter" + ]["tns:natuerlichePerson"]["tns:anschrift"][-1][ + "tns:ort" + ] + if isinstance( + data["tns:beteiligter"]["tns:auswahl_beteiligter"][ + "tns:natuerlichePerson" + ]["tns:anschrift"], + list, + ) + else data["tns:beteiligter"]["tns:auswahl_beteiligter"][ + "tns:natuerlichePerson" + ]["tns:anschrift"]["tns:ort"] + } + ), + "role": self.map_role_id_to_enum( + data["tns:rolle"]["tns:rollenbezeichnung"]["code"] + ), + "type": CompanyRelationshipEnum.COMPANY, + } + ) + return PersonToCompanyRelationship( **{ # type: ignore - "name": remove_traling_and_leading_quotes( - data["tns:beteiligter"]["tns:auswahl_beteiligter"][ - "tns:natuerlichePerson" - ]["tns:vollerName"]["tns:nachname"] + "name": PersonName( + **{ + "firstname": data["tns:beteiligter"][ + "tns:auswahl_beteiligter" + ]["tns:natuerlichePerson"]["tns:vollerName"]["tns:vorname"], + "lastname": data["tns:beteiligter"][ + "tns:auswahl_beteiligter" + ]["tns:natuerlichePerson"]["tns:vollerName"][ + "tns:nachname" + ], + } ), + "date_of_birth": self.parse_date_of_birth(data), "location": Location( **{ "city": data["tns:beteiligter"]["tns:auswahl_beteiligter"][ @@ -105,452 +149,414 @@ def parse_stakeholder(data: dict) -> CompanyRelationship | None: ]["tns:anschrift"]["tns:ort"] } ), - "role": map_role_id_to_enum( + "role": self.map_role_id_to_enum( data["tns:rolle"]["tns:rollenbezeichnung"]["code"] ), + "type": CompanyRelationshipEnum.PERSON, + } + ) + if "tns:organisation" in data["tns:beteiligter"]["tns:auswahl_beteiligter"]: + base = data["tns:beteiligter"]["tns:auswahl_beteiligter"][ + "tns:organisation" + ] + + location = None + if "tns:anschrift" in base: + location = Location( + **{ + "city": base["tns:anschrift"]["tns:ort"], + "street": base["tns:anschrift"]["tns:strasse"] + if "tns:strasse" in base["tns:anschrift"] + else None, + "house_number": base["tns:anschrift"]["tns:hausnummer"] + if "tns:hausnummer" in base["tns:anschrift"] + else None, + "zip_code": base["tns:anschrift"]["tns:postleitzahl"] + if "tns:postleitzahl" in base["tns:anschrift"] + else None, + } + ) + else: + location = Location( + **{ + "city": base["tns:sitz"]["tns:ort"], + "street": base["tns:sitz"]["tns:strasse"] + if "tns:strasse" in base["tns:sitz"] + else None, + "house_number": base["tns:sitz"]["tns:hausnummer"] + if "tns:hausnummer" in base["tns:sitz"] + else None, + "zip_code": base["tns:sitz"]["tns:postleitzahl"] + if "tns:postleitzahl" in base["tns:sitz"] + else None, + } + ) + + return CompanyToCompanyRelationship( + **{ # type: ignore + "role": self.map_role_id_to_enum( + data["tns:rolle"]["tns:rollenbezeichnung"]["code"] + ), + "name": remove_traling_and_leading_quotes( + base["tns:bezeichnung"]["tns:bezeichnung.aktuell"] + ), + "location": location, "type": CompanyRelationshipEnum.COMPANY, } ) - return PersonToCompanyRelationship( - **{ # type: ignore - "name": PersonName( - **{ - "firstname": data["tns:beteiligter"]["tns:auswahl_beteiligter"][ - "tns:natuerlichePerson" - ]["tns:vollerName"]["tns:vorname"], - "lastname": data["tns:beteiligter"]["tns:auswahl_beteiligter"][ - "tns:natuerlichePerson" - ]["tns:vollerName"]["tns:nachname"], - } - ), - "date_of_birth": parse_date_of_birth(data), - "location": Location( - **{ - "city": data["tns:beteiligter"]["tns:auswahl_beteiligter"][ - "tns:natuerlichePerson" - ]["tns:anschrift"][-1]["tns:ort"] - if isinstance( - data["tns:beteiligter"]["tns:auswahl_beteiligter"][ - "tns:natuerlichePerson" - ]["tns:anschrift"], - list, - ) - else data["tns:beteiligter"]["tns:auswahl_beteiligter"][ - "tns:natuerlichePerson" - ]["tns:anschrift"]["tns:ort"] - } - ), - "role": map_role_id_to_enum( - data["tns:rolle"]["tns:rollenbezeichnung"]["code"] - ), - "type": CompanyRelationshipEnum.PERSON, - } - ) - if "tns:organisation" in data["tns:beteiligter"]["tns:auswahl_beteiligter"]: - base = data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:organisation"] + return None - location = None - if "tns:anschrift" in base: - location = Location( - **{ - "city": base["tns:anschrift"]["tns:ort"], - "street": base["tns:anschrift"]["tns:strasse"] - if "tns:strasse" in base["tns:anschrift"] - else None, - "house_number": base["tns:anschrift"]["tns:hausnummer"] - if "tns:hausnummer" in base["tns:anschrift"] - else None, - "zip_code": base["tns:anschrift"]["tns:postleitzahl"] - if "tns:potsleitzahl" in base["tns:anschrift"] - else None, - } - ) - else: - location = Location( - **{ - "city": base["tns:sitz"]["tns:ort"], - "street": base["tns:sitz"]["tns:strasse"] - if "tns:strasse" in base["tns:sitz"] - else None, - "house_number": base["tns:sitz"]["tns:hausnummer"] - if "tns:hausnummer" in base["tns:sitz"] - else None, - "zip_code": base["tns:sitz"]["tns:postleitzahl"] - if "tns:potsleitzahl" in base["tns:sitz"] - else None, - } - ) + def loc_from_beteiligung(self, data: dict) -> Location: + """Extract the company location from the first relationship in the export. - return CompanyToCompanyRelationship( - **{ # type: ignore - "role": map_role_id_to_enum( - data["tns:rolle"]["tns:rollenbezeichnung"]["code"] - ), - "name": remove_traling_and_leading_quotes( - base["tns:bezeichnung"]["tns:bezeichnung.aktuell"] - ), - "location": location, - "type": CompanyRelationshipEnum.COMPANY, - } - ) - return None + Args: + data (dict): Data export - -def loc_from_beteiligung(data: dict) -> Location: - """Extract the company location from the first relationship in the export. - - Args: - data (dict): Data export - - Returns: - Location: location - """ - base_path = [ - "tns:grunddaten", - "tns:verfahrensdaten", - "tns:beteiligung", - 0, - "tns:beteiligter", - "tns:auswahl_beteiligter", - "tns:organisation", - ] - base = traversal(data, base_path) - base = base["tns:anschrift"] if "tns:anschrift" in base else base["tns:sitz"] - - if isinstance(base, list): - base = base[0] - house_number = None - street = None - if "tns:strasse" in base: - regex = r".(\d+)$" - hits = re.findall(regex, base["tns:strasse"]) - if len(hits) == 1: - house_number = hits[0] - street = base["tns:strasse"][: (-1 * len(house_number))] - if "tns:hausnummer" in base: - house_number = house_number + base["tns:hausnummer"] - else: - if "tns:hausnummer" in base: - house_number = base["tns:hausnummer"] - street = base["tns:strasse"] - return Location( - **{ - "city": base["tns:ort"], - "zip_code": base["tns:postleitzahl"], - "street": normalize_street(street), # type: ignore - "house_number": house_number, - } - ) - - -def name_from_beteiligung(data: dict) -> str: - """Extract the Company name from an Unternehmensregister export by using the first relationship found. - - Args: - data (dict): Data export - - Returns: - str: Company name - """ - path = [ - "tns:grunddaten", - "tns:verfahrensdaten", - "tns:beteiligung", - 0, - "tns:beteiligter", - "tns:auswahl_beteiligter", - "tns:organisation", - "tns:bezeichnung", - "tns:bezeichnung.aktuell", - ] - name = traversal(data, path) - return remove_traling_and_leading_quotes(name) - - -def map_rechtsform(company_name: str, data: dict) -> CompanyTypeEnum | None: - """Extracts the company type from a given Unternehmensregister export. - - Args: - company_name (str): Name of the company as a fallback solution - data (dict): Data export - - Returns: - CompanyTypeEnum | None: Company type if found - """ - try: - path = [ - "tns:fachdatenRegister", - "tns:basisdatenRegister", - "tns:rechtstraeger", - "tns:angabenZurRechtsform", - "tns:rechtsform", - "code", + Returns: + Location: location + """ + base_path = [ + "tns:grunddaten", + "tns:verfahrensdaten", + "tns:beteiligung", + 0, + "tns:beteiligter", + "tns:auswahl_beteiligter", + "tns:organisation", ] - return CompanyTypeEnum(traversal(data, path)) - except Exception: - if ( - company_name.endswith("GmbH") - or company_name.endswith("UG") - or company_name.endswith("UG (haftungsbeschränkt)") - ): - return CompanyTypeEnum("Gesellschaft mit beschränkter Haftung") - if company_name.endswith("SE"): - return CompanyTypeEnum("Europäische Aktiengesellschaft (SE)") - if company_name.endswith("KG"): - return CompanyTypeEnum("Kommanditgesellschaft") - return None + base = traversal(data, base_path) + base = base["tns:anschrift"] if "tns:anschrift" in base else base["tns:sitz"] - -def map_capital( # noqa: PLR0912 - data: dict, company_type: CompanyTypeEnum -) -> Capital | None: - """Extracts the company capital from the given Unternehmensregister export. - - Args: - data (dict): Data export - company_type (CompanyTypeEnum): Type of company (e.g., 'Gesellschaft mit beschränkter Haftung') - - Returns: - Capital | None: Company Capital if found - """ - # Early return - if "tns:auswahl_zusatzangaben" not in data["tns:fachdatenRegister"]: - return None - capital: dict = {"tns:zahl": 0.0, "tns:waehrung": {"code": None}} - if ( - company_type == CompanyTypeEnum.KG - and "tns:personengesellschaft" - in data["tns:fachdatenRegister"]["tns:auswahl_zusatzangaben"] - ): - capital_type = "Hafteinlage" - base = data["tns:fachdatenRegister"]["tns:auswahl_zusatzangaben"][ - "tns:personengesellschaft" - ]["tns:zusatzKG"]["tns:datenKommanditist"] if isinstance(base, list): - for entry in base: - # TODO link to persons using Ref_Rollennummer then extract ["Hafteinlage"] as below - capital["tns:zahl"] = capital["tns:zahl"] + float( - entry["tns:hafteinlage"]["tns:zahl"] - ) - capital["tns:waehrung"]["code"] = entry["tns:hafteinlage"][ - "tns:waehrung" - ]["code"] - elif isinstance(base, dict): - capital = base["tns:hafteinlage"] - elif company_type in [ - CompanyTypeEnum.GMBH, - CompanyTypeEnum.SE, - CompanyTypeEnum.AG, - CompanyTypeEnum.KGaA, - CompanyTypeEnum.AUSLAENDISCHE_RECHTSFORM, - CompanyTypeEnum.OHG, - ]: + base = base[0] + house_number = None + street = None + if "tns:strasse" in base: + regex = r".(\d+)$" + hits = re.findall(regex, base["tns:strasse"]) + if len(hits) == 1: + house_number = hits[0] + street = base["tns:strasse"][: (-1 * len(house_number))] + if "tns:hausnummer" in base: + house_number = house_number + base["tns:hausnummer"] + else: + if "tns:hausnummer" in base: + house_number = base["tns:hausnummer"] + street = base["tns:strasse"] + return Location( + **{ + "city": base["tns:ort"], + "zip_code": base["tns:postleitzahl"], + "street": normalize_street(street), # type: ignore + "house_number": house_number, + } + ) + + def name_from_beteiligung(self, data: dict) -> str: + """Extract the Company name from an Unternehmensregister export by using the first relationship found. + + Args: + data (dict): Data export + + Returns: + str: Company name + """ + path = [ + "tns:grunddaten", + "tns:verfahrensdaten", + "tns:beteiligung", + 0, + "tns:beteiligter", + "tns:auswahl_beteiligter", + "tns:organisation", + "tns:bezeichnung", + "tns:bezeichnung.aktuell", + ] + name = traversal(data, path) + return remove_traling_and_leading_quotes(name) + + def map_rechtsform(self, company_name: str, data: dict) -> CompanyTypeEnum | None: + """Extracts the company type from a given Unternehmensregister export. + + Args: + company_name (str): Name of the company as a fallback solution + data (dict): Data export + + Returns: + CompanyTypeEnum | None: Company type if found + """ + try: + path = [ + "tns:fachdatenRegister", + "tns:basisdatenRegister", + "tns:rechtstraeger", + "tns:angabenZurRechtsform", + "tns:rechtsform", + "code", + ] + return CompanyTypeEnum(traversal(data, path)) + except Exception: + if ( + company_name.endswith("GmbH") + or company_name.endswith("UG") + or company_name.endswith("UG (haftungsbeschränkt)") + ): + return CompanyTypeEnum("Gesellschaft mit beschränkter Haftung") + if company_name.endswith("SE"): + return CompanyTypeEnum("Europäische Aktiengesellschaft (SE)") + if company_name.endswith("KG"): + return CompanyTypeEnum("Kommanditgesellschaft") + return None + + def map_capital( # noqa: PLR0912 + self, data: dict, company_type: CompanyTypeEnum + ) -> Capital | None: + """Extracts the company capital from the given Unternehmensregister export. + + Args: + data (dict): Data export + company_type (CompanyTypeEnum): Type of company (e.g., 'Gesellschaft mit beschränkter Haftung') + + Returns: + Capital | None: Company Capital if found + """ + # Early return + if "tns:auswahl_zusatzangaben" not in data["tns:fachdatenRegister"]: + return None + capital: dict = {"tns:zahl": 0.0, "tns:waehrung": {"code": None}} if ( - "tns:kapitalgesellschaft" - not in data["tns:fachdatenRegister"]["tns:auswahl_zusatzangaben"] + company_type == CompanyTypeEnum.KG + and "tns:personengesellschaft" + in data["tns:fachdatenRegister"]["tns:auswahl_zusatzangaben"] ): + capital_type = "Hafteinlage" base = data["tns:fachdatenRegister"]["tns:auswahl_zusatzangaben"][ "tns:personengesellschaft" - ] - else: - base = data["tns:fachdatenRegister"]["tns:auswahl_zusatzangaben"][ + ]["tns:zusatzKG"]["tns:datenKommanditist"] + if isinstance(base, list): + for entry in base: + # TODO link to persons using Ref_Rollennummer then extract ["Hafteinlage"] as below + capital["tns:zahl"] = capital["tns:zahl"] + float( + entry["tns:hafteinlage"]["tns:zahl"] + ) + capital["tns:waehrung"]["code"] = entry["tns:hafteinlage"][ + "tns:waehrung" + ]["code"] + elif isinstance(base, dict): + capital = base["tns:hafteinlage"] + elif company_type in [ + CompanyTypeEnum.GMBH, + CompanyTypeEnum.SE, + CompanyTypeEnum.AG, + CompanyTypeEnum.KGaA, + CompanyTypeEnum.AUSLAENDISCHE_RECHTSFORM, + CompanyTypeEnum.OHG, + ]: + if ( "tns:kapitalgesellschaft" - ] - if "tns:zusatzGmbH" in base: - capital_type = "Stammkapital" - capital = base["tns:zusatzGmbH"]["tns:stammkapital"] - elif "tns:zusatzAktiengesellschaft" in base: - capital_type = "Grundkapital" - capital = base["tns:zusatzAktiengesellschaft"]["tns:grundkapital"][ - "tns:hoehe" - ] - elif company_type in [ - CompanyTypeEnum.EINZELKAUFMANN, - CompanyTypeEnum.EG, - CompanyTypeEnum.PARTNERSCHAFT, - CompanyTypeEnum.PARTNERGESELLSCHAFT, - CompanyTypeEnum.PARTNERSCHAFTSGESELLSCHAFT, - None, - ]: - return None - # Catch entries having the dict but with null values - if isinstance(capital, list): - capital = capital[0] - if not all(capital.values()): - return None - return Capital( - **{ # type: ignore - "value": float(capital["tns:zahl"]), - "currency": CurrencyEnum(capital["tns:waehrung"]["code"]), - "type": CapitalTypeEnum(capital_type), - } - ) - - -def map_business_purpose(data: dict) -> str | None: - """Extracts the "Geschäftszweck" from a given Unternehmensregister export. - - Args: - data (dict): Data export - - Returns: - str | None: Business purpose if found - """ - try: - path = ["tns:fachdatenRegister", "tns:basisdatenRegister", "tns:gegenstand"] - return traversal(data, path) - except KeyError: - return None - - -def map_founding_date(data: dict) -> str | None: - """Extracts the founding date from a given Unternehmensregister export. - - Args: - data (dict): Data export - - Returns: - str | None: Founding date if found - """ - text = str(data) - entry_date = re.findall( - r".Tag der ersten Eintragung:(\\n| )?(\d{1,2}\.\d{1,2}\.\d{2,4})", text - ) - if len(entry_date) == 1: - return transform_date_to_iso(entry_date[0][1]) - - entry_date = re.findall( - r".Gesellschaftsvertrag vom (\d{1,2}\.\d{1,2}\.\d{2,4})", text - ) - if len(entry_date) == 1: - return transform_date_to_iso(entry_date[0]) - if "tns:satzungsdatum" in data["tns:fachdatenRegister"]["tns:basisdatenRegister"]: - path = [ - "tns:fachdatenRegister", - "tns:basisdatenRegister", - "tns:satzungsdatum", - ] - base = traversal(data, path) - if "tns:aktuellesSatzungsdatum" in base: - return base["tns:aktuellesSatzungsdatum"] - # No reliable answer - return None - - -def map_hr_number(data: dict) -> str: - """Extract the HR number from a given Unternehmensregister export. - - Args: - data (dict): Data export - - Raises: - KeyError: If key not found - - Returns: - str: HR number - """ - base = data["tns:grunddaten"]["tns:verfahrensdaten"]["tns:instanzdaten"][ - "tns:aktenzeichen" - ]["tns:auswahl_aktenzeichen"] - if "tns:aktenzeichen.strukturiert" in base: - hr_prefix = base["tns:aktenzeichen.strukturiert"]["tns:register"]["code"] - hr_number = base["tns:aktenzeichen.strukturiert"]["tns:laufendeNummer"] - return f"{hr_prefix} {hr_number}" - if "tns:aktenzeichen.freitext" in base: - return base["tns:aktenzeichen.freitext"] - raise KeyError("Could not find HR number") - - -def map_district_court(data: dict) -> DistrictCourt: - """Extract the district court from a given Unternehmensregister export. - - Args: - data (dict): Data export - - Returns: - DistrictCourt: District court - """ - base_path = [ - "tns:grunddaten", - "tns:verfahrensdaten", - "tns:beteiligung", - 1, - "tns:beteiligter", - "tns:auswahl_beteiligter", - "tns:organisation", - ] - path = [*base_path, "tns:bezeichnung", "tns:bezeichnung.aktuell"] - name = traversal(data, path) - path = [*base_path, "tns:anschrift", "tns:ort"] - city = traversal(data, path) - return DistrictCourt(name=name, city=city) - - -def map_company_id(data: dict) -> CompanyID: - """Retrieve Company ID from export. - - Args: - data (dict): Data export - - Returns: - CompanyID: ID of the company - """ - try: - return CompanyID(map_hr_number(data), map_district_court(data)) # type: ignore - except KeyError: - hr_number = data["tns:grunddaten"]["tns:verfahrensdaten"]["tns:beteiligung"][0][ - "tns:beteiligter" - ]["tns:auswahl_beteiligter"]["tns:organisation"]["tns:registereintragung"][ - "tns:registernummer" - ] - district_court = map_district_court(data) - return CompanyID(hr_number=hr_number, district_court=district_court) - - -def map_last_update(data: dict) -> str: - """Extract last update date from export. - - Args: - data (dict): Unternehmensregister export - - Returns: - str: Last update date - """ - path = ["tns:fachdatenRegister", "tns:auszug", "tns:letzteEintragung"] - return traversal(data, path) - - -# TODO class model with inheritance - only difference: Determine root in __init__ -def map_unternehmensregister_json(data: dict) -> Company: - """Processes the Unternehmensregister structured export to a Company by using several helper methods. - - Args: - data (dict): Data export - - Returns: - Company: Transformed data - """ - root_key = list(data.keys())[0] - data = data[root_key] - result: dict = {"relationships": []} - - result["id"] = map_company_id(data) - result["name"] = name_from_beteiligung(data) - - result["location"] = loc_from_beteiligung(data) - result["last_update"] = map_last_update(data) - - result["company_type"] = map_rechtsform(result["name"], data) - result["capital"] = map_capital(data, result["company_type"]) - result["business_purpose"] = map_business_purpose(data) - result["founding_date"] = map_founding_date(data) - - for i in range( - 2, len(data["tns:grunddaten"]["tns:verfahrensdaten"]["tns:beteiligung"]) - ): - people = parse_stakeholder( - data["tns:grunddaten"]["tns:verfahrensdaten"]["tns:beteiligung"][i] + not in data["tns:fachdatenRegister"]["tns:auswahl_zusatzangaben"] + ): + base = data["tns:fachdatenRegister"]["tns:auswahl_zusatzangaben"][ + "tns:personengesellschaft" + ] + else: + base = data["tns:fachdatenRegister"]["tns:auswahl_zusatzangaben"][ + "tns:kapitalgesellschaft" + ] + if "tns:zusatzGmbH" in base: + capital_type = "Stammkapital" + capital = base["tns:zusatzGmbH"]["tns:stammkapital"] + elif "tns:zusatzAktiengesellschaft" in base: + capital_type = "Grundkapital" + capital = base["tns:zusatzAktiengesellschaft"]["tns:grundkapital"][ + "tns:hoehe" + ] + elif company_type in [ + CompanyTypeEnum.EINZELKAUFMANN, + CompanyTypeEnum.EG, + CompanyTypeEnum.PARTNERSCHAFT, + CompanyTypeEnum.PARTNERGESELLSCHAFT, + CompanyTypeEnum.PARTNERSCHAFTSGESELLSCHAFT, + None, + ]: + return None + # Catch entries having the dict but with null values + if isinstance(capital, list): + capital = capital[0] + if not all(capital.values()): + return None + return Capital( + **{ # type: ignore + "value": float(capital["tns:zahl"]), + "currency": CurrencyEnum(capital["tns:waehrung"]["code"]), + "type": CapitalTypeEnum(capital_type), + } ) - result["relationships"].append(people) - result = map_co_relation(result) - return Company(**result) + + def map_business_purpose(self, data: dict) -> str | None: + """Extracts the "Geschäftszweck" from a given Unternehmensregister export. + + Args: + data (dict): Data export + + Returns: + str | None: Business purpose if found + """ + try: + path = ["tns:fachdatenRegister", "tns:basisdatenRegister", "tns:gegenstand"] + return traversal(data, path) + except KeyError: + return None + + def map_founding_date(self, data: dict) -> str | None: + """Extracts the founding date from a given Unternehmensregister export. + + Args: + data (dict): Data export + + Returns: + str | None: Founding date if found + """ + text = str(data) + entry_date = re.findall( + r".Tag der ersten Eintragung:(\\n| )?(\d{1,2}\.\d{1,2}\.\d{2,4})", text + ) + if len(entry_date) == 1: + return transform_date_to_iso(entry_date[0][1]) + + entry_date = re.findall( + r".Gesellschaftsvertrag vom (\d{1,2}\.\d{1,2}\.\d{2,4})", text + ) + if len(entry_date) == 1: + return transform_date_to_iso(entry_date[0]) + if ( + "tns:satzungsdatum" + in data["tns:fachdatenRegister"]["tns:basisdatenRegister"] + ): + path = [ + "tns:fachdatenRegister", + "tns:basisdatenRegister", + "tns:satzungsdatum", + ] + base = traversal(data, path) + if "tns:aktuellesSatzungsdatum" in base: + return base["tns:aktuellesSatzungsdatum"] + # No reliable answer + return None + + def map_hr_number(self, data: dict) -> str: + """Extract the HR number from a given Unternehmensregister export. + + Args: + data (dict): Data export + + Raises: + KeyError: If key not found + + Returns: + str: HR number + """ + base = data["tns:grunddaten"]["tns:verfahrensdaten"]["tns:instanzdaten"][ + "tns:aktenzeichen" + ]["tns:auswahl_aktenzeichen"] + if "tns:aktenzeichen.strukturiert" in base: + hr_prefix = base["tns:aktenzeichen.strukturiert"]["tns:register"]["code"] + hr_number = base["tns:aktenzeichen.strukturiert"]["tns:laufendeNummer"] + return f"{hr_prefix} {hr_number}" + if "tns:aktenzeichen.freitext" in base: + return base["tns:aktenzeichen.freitext"] + raise KeyError("Could not find HR number") + + def map_district_court(self, data: dict) -> DistrictCourt: + """Extract the district court from a given Unternehmensregister export. + + Args: + data (dict): Data export + + Returns: + DistrictCourt: District court + """ + base_path = [ + "tns:grunddaten", + "tns:verfahrensdaten", + "tns:beteiligung", + 1, + "tns:beteiligter", + "tns:auswahl_beteiligter", + "tns:organisation", + ] + path = [*base_path, "tns:bezeichnung", "tns:bezeichnung.aktuell"] + name = traversal(data, path) + + path = [*base_path, "tns:anschrift", "tns:ort"] + city = traversal(data, path) + return DistrictCourt(name=name, city=city) + + def map_company_id(self, data: dict) -> CompanyID: + """Retrieve Company ID from export. + + Args: + data (dict): Data export + + Returns: + CompanyID: ID of the company + """ + try: + return CompanyID(hr_number=self.map_hr_number(data), district_court=self.map_district_court(data)) # type: ignore + except KeyError: + hr_number = data["tns:grunddaten"]["tns:verfahrensdaten"][ + "tns:beteiligung" + ][0]["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:organisation"][ + "tns:registereintragung" + ][ + "tns:registernummer" + ] + district_court = self.map_district_court(data) + return CompanyID(hr_number=hr_number, district_court=district_court) + + def map_last_update(self, data: dict) -> str: + """Extract last update date from export. + + Args: + data (dict): Unternehmensregister export + + Returns: + str: Last update date + """ + path = ["tns:fachdatenRegister", "tns:auszug", "tns:letzteEintragung"] + return traversal(data, path) + + # TODO class model with inheritance - only difference: Determine root in __init__ + def map_unternehmensregister_json(self, data: dict) -> Company: + """Processes the Unternehmensregister structured export to a Company by using several helper methods. + + Args: + data (dict): Data export + + Returns: + Company: Transformed data + """ + root_key = list(data.keys())[0] + data = data[root_key] + result: dict = {"relationships": []} + + result["id"] = self.map_company_id(data) + result["name"] = self.name_from_beteiligung(data) + + result["location"] = self.loc_from_beteiligung(data) + result["last_update"] = self.map_last_update(data) + + result["company_type"] = self.map_rechtsform(result["name"], data) + result["capital"] = self.map_capital(data, result["company_type"]) + result["business_purpose"] = self.map_business_purpose(data) + result["founding_date"] = self.map_founding_date(data) + + for i in range( + 2, len(data["tns:grunddaten"]["tns:verfahrensdaten"]["tns:beteiligung"]) + ): + people = self.parse_stakeholder( + data["tns:grunddaten"]["tns:verfahrensdaten"]["tns:beteiligung"][i] + ) + result["relationships"].append(people) + result = map_co_relation(result) + return Company(**result) diff --git a/tests/utils/data_extraction/unternehmensregister/transform/v1_test.py b/tests/utils/data_extraction/unternehmensregister/transform/v1_test.py index 34b8ead..4c89d35 100644 --- a/tests/utils/data_extraction/unternehmensregister/transform/v1_test.py +++ b/tests/utils/data_extraction/unternehmensregister/transform/v1_test.py @@ -16,10 +16,12 @@ from aki_prj23_transparenzregister.models.company import ( PersonToCompanyRelationship, RelationshipRoleEnum, ) -from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1 import ( - v1 as transform, +from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1 import ( + V1_Transformer, ) +transform = V1_Transformer() + def test_parse_stakeholder_org_hidden_in_person() -> None: data = { @@ -656,31 +658,31 @@ def test_map_last_update() -> None: "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.map_co_relation" ) @patch( - "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.map_company_id" + "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.V1_Transformer.map_company_id" ) @patch( - "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.name_from_beteiligung" + "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.V1_Transformer.name_from_beteiligung" ) @patch( - "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.loc_from_beteiligung" + "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.V1_Transformer.loc_from_beteiligung" ) @patch( - "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.map_last_update" + "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.V1_Transformer.map_last_update" ) @patch( - "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.map_rechtsform" + "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.V1_Transformer.map_rechtsform" ) @patch( - "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.map_capital" + "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.V1_Transformer.map_capital" ) @patch( - "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.map_business_purpose" + "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.V1_Transformer.map_business_purpose" ) @patch( - "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.map_founding_date" + "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.V1_Transformer.map_founding_date" ) @patch( - "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.parse_stakeholder" + "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.V1_Transformer.parse_stakeholder" ) def test_map_unternehmensregister_json( # noqa: PLR0913 mock_map_parse_stakeholder: Mock, diff --git a/tests/utils/data_extraction/unternehmensregister/transform/v3_test.py b/tests/utils/data_extraction/unternehmensregister/transform/v3_test.py new file mode 100644 index 0000000..d23e048 --- /dev/null +++ b/tests/utils/data_extraction/unternehmensregister/transform/v3_test.py @@ -0,0 +1,731 @@ +"""Testing utils/data_extraction/unternehmensregister/transform.py.""" +from unittest.mock import Mock, patch + +from aki_prj23_transparenzregister.models.company import ( + Capital, + CapitalTypeEnum, + Company, + CompanyID, + CompanyRelationshipEnum, + CompanyToCompanyRelationship, + CompanyTypeEnum, + CurrencyEnum, + DistrictCourt, + Location, + PersonName, + PersonToCompanyRelationship, + RelationshipRoleEnum, +) +from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v3.v3 import ( + V3_Transformer, +) + +transform = V3_Transformer() + + +def test_parse_stakeholder_org_hidden_in_person() -> None: + data = { + "tns:beteiligter": { + "tns:auswahl_beteiligter": { + "tns:natuerlichePerson": { + "tns:vollerName": {"tns:nachname": '"Some Company KG'}, + "tns:anschrift": {"tns:ort": "Area 51"}, + } + } + }, + "tns:rolle": {"tns:rollenbezeichnung": {"code": "275"}}, + } + expected_result = CompanyToCompanyRelationship( + role=RelationshipRoleEnum.KOMMANDITIST, # type: ignore + name="Some Company KG", + type=CompanyRelationshipEnum.COMPANY, + location=Location(**{"city": "Area 51"}), + ) + assert transform.parse_stakeholder(data) == expected_result + + +def test_parse_stakeholder_person() -> None: + data = { + "tns:beteiligter": { + "tns:auswahl_beteiligter": { + "tns:natuerlichePerson": { + "tns:vollerName": { + "tns:vorname": "Stephen", + "tns:nachname": "King", + }, + "tns:anschrift": {"tns:ort": "Maine"}, + "tns:geburt": {"tns:geburtsdatum": "1947-09-21"}, + } + } + }, + "tns:rolle": {"tns:rollenbezeichnung": {"code": "269"}}, + } + expected_result = PersonToCompanyRelationship( + role=RelationshipRoleEnum.GESCHAEFTSLEITER, # type: ignore + date_of_birth="1947-09-21", + name=PersonName(**{"firstname": "Stephen", "lastname": "King"}), + type=CompanyRelationshipEnum.PERSON, + location=Location(**{"city": "Maine"}), + ) + assert transform.parse_stakeholder(data) == expected_result + + +def test_parse_stakeholder_person_missing_date_of_birth() -> None: + data = { + "tns:beteiligter": { + "tns:auswahl_beteiligter": { + "tns:natuerlichePerson": { + "tns:vollerName": { + "tns:vorname": "Stephen", + "tns:nachname": "King", + }, + "tns:anschrift": {"tns:ort": "Maine"}, + } + } + }, + "tns:rolle": {"tns:rollenbezeichnung": {"code": "269"}}, + } + expected_result = PersonToCompanyRelationship( + role=RelationshipRoleEnum.GESCHAEFTSLEITER, # type: ignore + date_of_birth=None, + name=PersonName(**{"firstname": "Stephen", "lastname": "King"}), + type=CompanyRelationshipEnum.PERSON, + location=Location(**{"city": "Maine"}), + ) + assert transform.parse_stakeholder(data) == expected_result + + +def test_parse_stakeholder_org() -> None: + data = { + "tns:beteiligter": { + "tns:auswahl_beteiligter": { + "tns:organisation": { + "tns:bezeichnung": { + "tns:bezeichnung.aktuell": "Transparenzregister kG" + }, + "tns:anschrift": { + "tns:ort": "Iserlohn", + "tns:strasse": "Hauptstrasse", + "tns:hausnummer": "42", + "tns:postleitzahl": "58636", + }, + } + } + }, + "tns:rolle": {"tns:rollenbezeichnung": {"code": "268"}}, + } + expected_result = CompanyToCompanyRelationship( + name="Transparenzregister kG", + role=RelationshipRoleEnum.DIREKTOR, # type: ignore + type=CompanyRelationshipEnum.COMPANY, + location=Location( + **{ + "city": "Iserlohn", + "zip_code": "58636", + "house_number": "42", + "street": "Hauptstrasse", + } + ), + ) + assert transform.parse_stakeholder(data) == expected_result + + +def test_parse_stakeholder_org_loc_from_sitz() -> None: + data = { + "tns:beteiligter": { + "tns:auswahl_beteiligter": { + "tns:organisation": { + "tns:bezeichnung": { + "tns:bezeichnung.aktuell": "Transparenzregister kG" + }, + "tns:sitz": { + "tns:ort": "Iserlohn", + "tns:strasse": "Hauptstrasse", + "tns:hausnummer": "42", + "tns:postleitzahl": "58636", + }, + } + } + }, + "tns:rolle": {"tns:rollenbezeichnung": {"code": "268"}}, + } + expected_result = CompanyToCompanyRelationship( + name="Transparenzregister kG", + role=RelationshipRoleEnum.DIREKTOR, # type: ignore + type=CompanyRelationshipEnum.COMPANY, + location=Location( + **{ + "city": "Iserlohn", + "zip_code": "58636", + "house_number": "42", + "street": "Hauptstrasse", + } + ), + ) + assert transform.parse_stakeholder(data) == expected_result + + +def test_parse_stakeholder_no_result() -> None: + data: dict = {"tns:beteiligter": {"tns:auswahl_beteiligter": {}}} # type: ignore + assert transform.parse_stakeholder(data) is None + + +def test_loc_from_beteiligung() -> None: + data = { + "tns:grunddaten": { + "tns:verfahrensdaten": { + "tns:beteiligung": [ + { + "tns:beteiligter": { + "tns:auswahl_beteiligter": { + "tns:organisation": { + "tns:anschrift": { + "tns:strasse": "Gewerbestraße", + "tns:hausnummer": "8", + "tns:postleitzahl": "72535", + "tns:ort": "Heroldstatt", + }, + }, + } + } + }, + ] + } + } + } + + expected_result = Location( + city="Heroldstatt", house_number="8", street="Gewerbestraße", zip_code="72535" + ) + assert transform.loc_from_beteiligung(data) == expected_result + + +def test_loc_from_beteiligung_number_contained_in_street() -> None: + data = { + "tns:grunddaten": { + "tns:verfahrensdaten": { + "tns:beteiligung": [ + { + "tns:beteiligter": { + "tns:auswahl_beteiligter": { + "tns:organisation": { + "tns:anschrift": { + "tns:strasse": "Gewerbestraße8", + "tns:postleitzahl": "72535", + "tns:ort": "Heroldstatt", + }, + }, + } + } + }, + ] + } + } + } + + expected_result = Location( + city="Heroldstatt", house_number="8", street="Gewerbestraße", zip_code="72535" + ) + assert transform.loc_from_beteiligung(data) == expected_result + + +def test_loc_from_beteiligung_no_result() -> None: + data = { + "tns:grunddaten": { + "tns:verfahrensdaten": { + "tns:beteiligung": [ + { + "tns:beteiligter": { + "tns:auswahl_beteiligter": { + "tns:organisation": { + "tns:anschrift": { + "tns:postleitzahl": "72535", + "tns:ort": "Heroldstatt", + }, + }, + } + } + }, + ] + } + } + } + + expected_result = Location( + city="Heroldstatt", house_number=None, street=None, zip_code="72535" + ) + assert transform.loc_from_beteiligung(data) == expected_result + + +def test_loc_from_beteiligung_combine() -> None: + data = { + "tns:grunddaten": { + "tns:verfahrensdaten": { + "tns:beteiligung": [ + { + "tns:beteiligter": { + "tns:auswahl_beteiligter": { + "tns:organisation": { + "tns:anschrift": { + "tns:postleitzahl": "72535", + "tns:strasse": "Pliangenserstr. 40", + "tns:hausnummer": "a", + "tns:ort": "Heroldstatt", + }, + }, + } + } + }, + ] + } + } + } + + expected_result = Location( + city="Heroldstatt", + house_number="40a", + street="Pliangenserstraße", + zip_code="72535", + ) + assert transform.loc_from_beteiligung(data) == expected_result + + +def test_name_from_beteiligung() -> None: + data = { + "tns:grunddaten": { + "tns:verfahrensdaten": { + "tns:beteiligung": [ + { + "tns:beteiligter": { + "tns:auswahl_beteiligter": { + "tns:organisation": { + "tns:bezeichnung": { + "tns:bezeichnung.aktuell": "1 A Autenrieth Kunststofftechnik GmbH & Co. KG" + }, + }, + } + }, + } + ] + } + } + } + + expected_result = "1 A Autenrieth Kunststofftechnik GmbH & Co. KG" + assert transform.name_from_beteiligung(data) == expected_result + + +def test_name_from_beteiligung_remove_quotes() -> None: + data = { + "tns:grunddaten": { + "tns:verfahrensdaten": { + "tns:beteiligung": [ + { + "tns:beteiligter": { + "tns:auswahl_beteiligter": { + "tns:organisation": { + "tns:bezeichnung": { + "tns:bezeichnung.aktuell": '"Siemes Verwaltungs-GmbH"' + }, + }, + } + }, + } + ] + } + } + } + + expected_result = "Siemes Verwaltungs-GmbH" + assert transform.name_from_beteiligung(data) == expected_result + + +def test_map_rechtsform() -> None: + data = { + "tns:fachdatenRegister": { + "tns:basisdatenRegister": { + "tns:rechtstraeger": { + "tns:angabenZurRechtsform": { + "tns:rechtsform": { + "code": "Gesellschaft mit beschränkter Haftung" + }, + } + }, + } + } + } + expected_result = CompanyTypeEnum.GMBH + assert transform.map_rechtsform("", data) == expected_result + + +def test_map_rechtsform_from_name() -> None: + data = [ + ("GEA Farm Technologies GmbH", "Gesellschaft mit beschränkter Haftung"), + ("Atos SE", "Europäische Aktiengesellschaft (SE)"), + ("Bilkenroth KG", "Kommanditgesellschaft"), + ("jfoiahfo8sah 98548902 öhz ö", None), + ] + + for company_name, expected_result in data: + assert transform.map_rechtsform(company_name, {}) == expected_result + + +def test_map_capital_kg_single() -> None: + capital = Capital( + currency=CurrencyEnum.EURO, value=69000, type=CapitalTypeEnum.HAFTEINLAGE # type: ignore + ) + data = { + "tns:fachdatenRegister": { + "tns:auswahl_zusatzangaben": { + "tns:personengesellschaft": { + "tns:zusatzKG": { + "tns:datenKommanditist": { + "tns:hafteinlage": { + "tns:zahl": str(capital.value), + "tns:waehrung": {"code": capital.currency}, + }, + } + } + } + } + } + } + + result = transform.map_capital(data, CompanyTypeEnum.KG) # type: ignore + assert result == capital + + +def test_map_capital_kg_sum() -> None: + capital = Capital( + currency=CurrencyEnum.EURO, value=20000, type=CapitalTypeEnum.HAFTEINLAGE # type: ignore + ) + data = { + "tns:fachdatenRegister": { + "tns:auswahl_zusatzangaben": { + "tns:personengesellschaft": { + "tns:zusatzKG": { + "tns:datenKommanditist": [ + { + "tns:hafteinlage": { + "tns:zahl": str(10000), + "tns:waehrung": {"code": capital.currency}, + } + }, + { + "tns:hafteinlage": { + "tns:zahl": str(10000), + "tns:waehrung": {"code": capital.currency}, + }, + }, + ] + } + } + } + } + } + + result = transform.map_capital(data, CompanyTypeEnum.KG) # type: ignore + assert result == capital + + +def test_map_capital_no_fachdaten() -> None: + data: dict = {"tns:fachdatenRegister": {}} + + result = transform.map_capital(data, CompanyTypeEnum.KG) # type: ignore + assert result is None + + +def test_map_capital_gmbh() -> None: + capital = Capital( + currency=CurrencyEnum.DEUTSCHE_MARK, value=42, type=CapitalTypeEnum.STAMMKAPITAL # type: ignore + ) + data = { + "tns:fachdatenRegister": { + "tns:auswahl_zusatzangaben": { + "tns:kapitalgesellschaft": { + "tns:zusatzGmbH": { + "tns:stammkapital": { + "tns:zahl": str(capital.value), + "tns:waehrung": {"code": capital.currency}, + }, + } + } + } + } + } + + result = transform.map_capital(data, CompanyTypeEnum.GMBH) # type: ignore + assert result == capital + + +def test_map_capital_ag() -> None: + capital = Capital( + currency=CurrencyEnum.DEUTSCHE_MARK, value=42, type=CapitalTypeEnum.GRUNDKAPITAL # type: ignore + ) + data = { + "tns:fachdatenRegister": { + "tns:auswahl_zusatzangaben": { + "tns:kapitalgesellschaft": { + "tns:zusatzAktiengesellschaft": { + "tns:grundkapital": { + "tns:hoehe": { + "tns:zahl": str(capital.value), + "tns:waehrung": {"code": capital.currency}, + } + }, + } + } + } + } + } + + result = transform.map_capital(data, CompanyTypeEnum.SE) # type: ignore + assert result == capital + + +def test_map_capital_personengesellschaft() -> None: + capital = Capital( + currency=CurrencyEnum.DEUTSCHE_MARK, value=42, type=CapitalTypeEnum.STAMMKAPITAL # type: ignore + ) + data = { + "tns:fachdatenRegister": { + "tns:auswahl_zusatzangaben": { + "tns:personengesellschaft": { + "tns:zusatzGmbH": { + "tns:stammkapital": { + "tns:zahl": str(capital.value), + "tns:waehrung": {"code": capital.currency}, + }, + } + } + } + } + } + + result = transform.map_capital(data, CompanyTypeEnum.OHG) # type: ignore + assert result == capital + + +def test_map_capital_einzelkaufmann() -> None: + capital = Capital( + currency=CurrencyEnum.DEUTSCHE_MARK, value=42, type=CapitalTypeEnum.STAMMKAPITAL # type: ignore + ) + data = { + "tns:fachdatenRegister": { + "tns:auswahl_zusatzangaben": { + "Personengesellschaft": { + "tns:zusatzGmbH": { + "tns:stammkapital": { + "tns:zahl": str(capital.value), + "tns:waehrung": {"code": capital.currency}, + }, + } + } + } + } + } + + result = transform.map_capital(data, CompanyTypeEnum.EINZELKAUFMANN) # type: ignore + assert result is None + + +def test_map_capital_partial_null_values() -> None: + capital = Capital( + currency=CurrencyEnum.DEUTSCHE_MARK, value=42, type=CapitalTypeEnum.STAMMKAPITAL # type: ignore + ) + data = { + "tns:fachdatenRegister": { + "tns:auswahl_zusatzangaben": { + "tns:personengesellschaft": { + "tns:zusatzGmbH": { + "tns:stammkapital": { + "tns:zahl": None, + "tns:waehrung": {"code": capital.currency}, + }, + } + } + } + } + } + + result = transform.map_capital(data, CompanyTypeEnum.OHG) # type: ignore + assert result is None + + +def test_map_business_purpose() -> None: + business_purpose = "Handel mit Betäubungsmitteln aller Art" + data = { + "tns:fachdatenRegister": { + "tns:basisdatenRegister": {"tns:gegenstand": business_purpose} + } + } + + result = transform.map_business_purpose(data) + assert result == business_purpose + + +def test_map_business_purpose_no_result() -> None: + data: dict = {} + + result = transform.map_business_purpose(data) + assert result is None + + +def test_map_founding_date_from_tag_der_ersten_eintragung() -> None: + data = { + "some entry": "Tag der ersten Eintragung: 01.05.2004", + "some other entry": "hfjdoöiashföahöf iodsazo8 5z4o fdsha8oü gfdsö", + } + expected_result = "2004-05-01" + result = transform.map_founding_date(data) + assert result == expected_result + + +def test_map_founding_date_from_gesellschaftsvertrag() -> None: + data = { + "some entry": "hfjdoöiashföahöf iodsazo8 5z4o fdsha8oü gfdsö", + "some other entry": "Das Wesen der Rekursion ist der Selbstaufruf Gesellschaftsvertrag vom 22.12.1996 Hallo Welt", + } + expected_result = "1996-12-22" + result = transform.map_founding_date(data) + assert result == expected_result + + +def test_map_founding_date_from_gruendungsdatum() -> None: + data = { + "tns:fachdatenRegister": { + "tns:basisdatenRegister": { + "tns:satzungsdatum": {"tns:aktuellesSatzungsdatum": "1998-01-01"} + } + } + } + expected_result = "1998-01-01" + result = transform.map_founding_date(data) + assert result == expected_result + + +def test_map_founding_date_no_result() -> None: + data: dict = {"tns:fachdatenRegister": {"tns:basisdatenRegister": {}}} + result = transform.map_founding_date(data) + assert result is None + + +def test_map_company_id() -> None: + district_court = DistrictCourt("Amtsgericht Ulm", "Ulm") + company_id = CompanyID(district_court, "HRA 4711") + data = { + "tns:grunddaten": { + "tns:verfahrensdaten": { + "tns:instanzdaten": { + "tns:aktenzeichen": { + "tns:auswahl_aktenzeichen": { + "tns:aktenzeichen.freitext": company_id.hr_number + } + }, + }, + "tns:beteiligung": [ + {}, + { + "tns:beteiligter": { + "tns:auswahl_beteiligter": { + "tns:organisation": { + "tns:bezeichnung": { + "tns:bezeichnung.aktuell": district_court.name + }, + "tns:anschrift": { + "tns:ort": district_court.city, + }, + } + } + }, + }, + ], + }, + }, + } + result = transform.map_company_id(data) + assert result == company_id + + +def test_map_last_update() -> None: + date = "2024-01-01" + data = {"tns:fachdatenRegister": {"tns:auszug": {"tns:letzteEintragung": date}}} + result = transform.map_last_update(data) + assert result == date + + +@patch( + "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v3.v3.map_co_relation" +) +@patch( + "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v3.v3.V3_Transformer.map_company_id" +) +@patch( + "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v3.v3.V3_Transformer.name_from_beteiligung" +) +@patch( + "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v3.v3.V3_Transformer.loc_from_beteiligung" +) +@patch( + "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v3.v3.V3_Transformer.map_last_update" +) +@patch( + "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v3.v3.V3_Transformer.map_rechtsform" +) +@patch( + "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v3.v3.V3_Transformer.map_capital" +) +@patch( + "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v3.v3.V3_Transformer.map_business_purpose" +) +@patch( + "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v3.v3.V3_Transformer.map_founding_date" +) +@patch( + "aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v3.v3.V3_Transformer.parse_stakeholder" +) +def test_map_unternehmensregister_json( # noqa: PLR0913 + mock_map_parse_stakeholder: Mock, + mock_map_founding_date: Mock, + mock_map_business_purpose: Mock, + mock_map_capital: Mock, + mock_map_rechtsform: Mock, + mock_map_last_update: Mock, + mock_loc_from_beteiligung: Mock, + mock_map_name_from_beteiligung: Mock, + mock_map_company_id: Mock, + mock_map_co_relation: Mock, +) -> None: + expected_result = Company( + **{ # type: ignore + "id": Mock(), + "name": Mock(), + "location": Mock(), + "last_update": Mock(), + "company_type": Mock(), + "capital": Mock(), + "business_purpose": Mock(), + "founding_date": Mock(), + "relationships": [Mock()], + } + ) + + mock_map_company_id.return_value = expected_result.id + mock_map_name_from_beteiligung.return_value = expected_result.name + mock_loc_from_beteiligung.return_value = expected_result.location + mock_map_last_update.return_value = expected_result.last_update + mock_map_rechtsform.return_value = expected_result.company_type + mock_map_capital.return_value = expected_result.capital + mock_map_business_purpose.return_value = expected_result.business_purpose + mock_map_founding_date.return_value = expected_result.founding_date + mock_map_parse_stakeholder.return_value = expected_result.relationships[0] + mock_map_co_relation.side_effect = lambda x: x + + data: dict = { + "rootLayerWithSomeStuipStringNooneCaresAbout": { + "tns:grunddaten": {"tns:verfahrensdaten": {"tns:beteiligung": [{}, {}, {}]}} + } + } + + result = transform.map_unternehmensregister_json(data) + assert result == expected_result From f9d3f0eb76950a07dc5bdb6ff8f3e73e39d8596c Mon Sep 17 00:00:00 2001 From: TrisNol Date: Sun, 5 Nov 2023 13:47:06 +0100 Subject: [PATCH 13/14] test: Cover apps/find_missing_companies.py --- .../apps/find_missing_companies.py | 12 ++++----- .../unternehmensregister/extract.py | 2 +- tests/apps/find_missing_companies_test.py | 25 +++++++++++++++++++ 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/src/aki_prj23_transparenzregister/apps/find_missing_companies.py b/src/aki_prj23_transparenzregister/apps/find_missing_companies.py index 4cccaa8..cf0e1bb 100644 --- a/src/aki_prj23_transparenzregister/apps/find_missing_companies.py +++ b/src/aki_prj23_transparenzregister/apps/find_missing_companies.py @@ -33,18 +33,18 @@ from aki_prj23_transparenzregister.utils.mongo.connector import MongoConnector from aki_prj23_transparenzregister.utils.sql import connector, entities -def work(company: entities.Company, config_provider: ConfigProvider) -> None: +def work(company_name: str, config_provider: ConfigProvider) -> None: """Main method. Args: - company (entities.Company): Company to be searched for + company_name (str): Name of the company to search for config_provider (ConfigProvider): ConfigProvider """ with tempfile.TemporaryDirectory() as tmp_dir: xml_dir = os.path.join(*[tmp_dir, "xml"]) os.makedirs(xml_dir, exist_ok=True) try: - extract.scrape(company.name, xml_dir, True, True) # type: ignore + extract.scrape(company_name, xml_dir, True, True) # type: ignore except Exception as e: logger.error(e) return @@ -90,12 +90,12 @@ def work(company: entities.Company, config_provider: ConfigProvider) -> None: with connector.get_session(config_provider) as session: company = ( session.query(entities.MissingCompany) # type: ignore - .where(entities.MissingCompany.name == company.name) + .where(entities.MissingCompany.name == company_name) .first() ) company.searched_for = True # type: ignore session.commit() - logger.info(f"Processed {company.name}") + logger.info(f"Processed {company_name}") except Exception as e: logger.error(e) return @@ -133,7 +133,7 @@ if __name__ == "__main__": batch_size = 5 pool = multiprocessing.Pool(processes=batch_size) # Scrape data from unternehmensregister - params = [(company, config_provider) for company in missing_companies] + params = [(company.name, config_provider) for company in missing_companies] # Map the process_handler function to the parameter list using the Pool pool.starmap(work, params) diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/extract.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/extract.py index fb946d3..73f3d44 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/extract.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/extract.py @@ -117,7 +117,7 @@ def scrape( try: wait.until( - lambda: wait_for_download_condition(download_path, num_files) # type: ignore + lambda x: wait_for_download_condition(download_path, num_files) # type: ignore ) file_name = "".join(e for e in company_name if e.isalnum()) + ".xml" rename_latest_file( diff --git a/tests/apps/find_missing_companies_test.py b/tests/apps/find_missing_companies_test.py index 8146e51..f985763 100644 --- a/tests/apps/find_missing_companies_test.py +++ b/tests/apps/find_missing_companies_test.py @@ -1,6 +1,31 @@ """Testing find_missing_companies.py.""" +from unittest.mock import Mock, patch + from aki_prj23_transparenzregister.apps import find_missing_companies def test_import_find_missing_companies() -> None: assert find_missing_companies + + +@patch("aki_prj23_transparenzregister.apps.find_missing_companies.MongoConnector") +@patch("aki_prj23_transparenzregister.apps.find_missing_companies.CompanyMongoService") +@patch( + "aki_prj23_transparenzregister.apps.find_missing_companies.load.load_directory_to_mongo" +) +@patch("aki_prj23_transparenzregister.apps.find_missing_companies.connector") +def test_work( + connector_mock: Mock, + load_directory_to_mongo_mock: Mock, + company_mongo_service_mock: Mock, + mongo_connector_mock: Mock, +) -> None: + config_provider_mock = Mock() + config_provider_mock.session.return_value = Mock() + + load_directory_to_mongo_mock.return_value = 42 + + find_missing_companies.work( + "Atos IT-Dienstleistung und Beratung GmbH", config_provider_mock + ) + assert True From 982cbd7ad454bee371502cd83b56312849988ea5 Mon Sep 17 00:00:00 2001 From: TrisNol Date: Sun, 5 Nov 2023 19:59:13 +0100 Subject: [PATCH 14/14] refactor: Resolve leftover todos --- .../apps/find_missing_companies.py | 2 +- .../unternehmensregister/transform/main.py | 8 +++----- .../unternehmensregister/transform/v1/v1.py | 1 - .../unternehmensregister/transform/v3/v3.py | 1 - 4 files changed, 4 insertions(+), 8 deletions(-) diff --git a/src/aki_prj23_transparenzregister/apps/find_missing_companies.py b/src/aki_prj23_transparenzregister/apps/find_missing_companies.py index cf0e1bb..29ec97d 100644 --- a/src/aki_prj23_transparenzregister/apps/find_missing_companies.py +++ b/src/aki_prj23_transparenzregister/apps/find_missing_companies.py @@ -126,7 +126,7 @@ if __name__ == "__main__": missing_companies = ( session.query(entities.MissingCompany) - .where(entities.MissingCompany.searched_for is False) + .where(entities.MissingCompany.searched_for == False) # noqa .all() ) diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/main.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/main.py index 6459311..dc57093 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/main.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/main.py @@ -57,7 +57,6 @@ def determine_version(data: dict) -> BaseTransformer: module: Version module """ if "XJustiz_Daten" in data: - # TODO consider class inheritance for version modules return v1.V1_Transformer() if "tns:nachrichtenkopf" in data[list(data.keys())[0]]: return v3.V3_Transformer() @@ -79,14 +78,13 @@ def map_unternehmensregister_json(data: dict) -> Company: if __name__ == "__main__": base_path = "./Jupyter/API-tests/Unternehmensregister/data/Unternehmensregister" - # TODO Adapt to new structure with different versions for file in tqdm(glob.glob1(f"{base_path}/export", "*.json")): path = os.path.join(f"{base_path}/export", file) with open(path, encoding="utf-8") as file_object: try: - company: Company = map_unternehmensregister_json( - json.loads(file_object.read()) - ) + data = json.loads(file_object.read()) + transformer: BaseTransformer = determine_version(data) + company: Company = transformer.map_unternehmensregister_json(data) name = "".join(e for e in company.name if e.isalnum())[:50] diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v1/v1.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v1/v1.py index 77993d2..6cb60e7 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v1/v1.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v1/v1.py @@ -435,7 +435,6 @@ class V1_Transformer(BaseTransformer): # noqa: N801 """ result: dict = {"relationships": []} - # TODO Refactor mapping - this is a nightmare... result["id"] = self.map_company_id(data) result["name"] = self.name_from_beteiligung(data) diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/v3.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/v3.py index 70f97cb..b787d4d 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/v3.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform/v3/v3.py @@ -526,7 +526,6 @@ class V3_Transformer(BaseTransformer): # noqa: N801 path = ["tns:fachdatenRegister", "tns:auszug", "tns:letzteEintragung"] return traversal(data, path) - # TODO class model with inheritance - only difference: Determine root in __init__ def map_unternehmensregister_json(self, data: dict) -> Company: """Processes the Unternehmensregister structured export to a Company by using several helper methods.