checkpoint: Refactoring data-extraction from unternehmensregister to handle v1 and v3

This commit is contained in:
TrisNol
2023-11-03 11:35:45 +01:00
parent b7f977138d
commit 2458ad98ff
11 changed files with 4671 additions and 180 deletions

View File

@ -24,8 +24,8 @@ from aki_prj23_transparenzregister.utils.sql import entities
from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister import ( from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister import (
extract, extract,
load, load,
transform,
) )
from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform import main as transform
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
@ -43,8 +43,8 @@ if __name__ == "__main__":
parsed = parser.parse_args(sys.argv[1:]) parsed = parser.parse_args(sys.argv[1:])
configer_logger(namespace=parsed) configer_logger(namespace=parsed)
config = parsed.config config = parsed.config
session = connector.get_session(get_config_provider(config)) # session = connector.get_session(get_config_provider(config))
missing_companies = session.query(entities.MissingCompany).all() # missing_companies = session.query(entities.MissingCompany).all()
counter = 0 counter = 0
# # Scrape data from unternehmensregister # # Scrape data from unternehmensregister
@ -63,22 +63,24 @@ if __name__ == "__main__":
for file in tqdm(glob.glob1(json_dir, "*.json")): for file in tqdm(glob.glob1(json_dir, "*.json")):
path = os.path.join(json_dir, file) path = os.path.join(json_dir, file)
with open(path, encoding="utf-8") as file_object: with open(path, encoding="utf-8") as file_object:
try: # try:
company = transform.map_unternehmensregister_json( print(path)
json.loads(file_object.read()) company = transform.map_unternehmensregister_json(
json.loads(file_object.read())
)
name = "".join(e for e in company.name if e.isalnum())[:50]
with open(
f"{output_path}/{name}.json",
"w+",
encoding="utf-8",
) as export_file:
json.dump(
dataclasses.asdict(company), export_file, ensure_ascii=False
) )
# except Exception as e:
name = "".join(e for e in company.name if e.isalnum())[:50] # logger.error(e.with_traceback())
# logger.error(e)
with open( # logger.error(f"Error in processing {path}")
f"{output_path}/{name}.json", # sys.exit(1)
"w+",
encoding="utf-8",
) as export_file:
json.dump(
dataclasses.asdict(company), export_file, ensure_ascii=False
)
except Exception as e:
logger.error(e)
logger.error(f"Error in processing {path}")
sys.exit(1)

View File

@ -0,0 +1,81 @@
"""Transform raw Unternehmensregister export (*.xml) to processed .json files for loading."""
import dataclasses
import glob
import json
import os
import re
import sys
import xmltodict
from tqdm import tqdm
from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1 import v1
from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v3 import v3
from aki_prj23_transparenzregister.models.company import Company
def transform_xml_to_json(source_dir: str, target_dir: str) -> None:
    """Convert all xml files in a directory to json files.

    Every ``*.xml`` file found in or below ``source_dir`` is parsed with
    ``xmltodict`` and written to ``target_dir`` under its own base name with a
    ``.json`` extension. The target directory is created when it is missing.

    Args:
        source_dir (str): Directory hosting the xml files
        target_dir (str): Target directory to move json files to
    """
    # Without this the first open(..., "w") below fails on a fresh target.
    os.makedirs(target_dir, exist_ok=True)
    # "**" only recurses when it is a path component of its own, so it has to
    # be joined to source_dir with a separator instead of being appended.
    pattern = os.path.join(source_dir, "**", "*.xml")
    for source_path in map(os.path.normpath, glob.glob(pattern, recursive=True)):
        # Swap only the extension; ".xml" elsewhere in the name is kept.
        stem, _ = os.path.splitext(os.path.basename(source_path))
        target_path = os.path.join(target_dir, f"{stem}.json")
        with open(source_path, encoding="utf-8") as source_file:
            # deepcode ignore HandleUnicode: Weird XML format no other solution
            data = xmltodict.parse(source_file.read().encode())
        with open(target_path, "w", encoding="utf-8") as json_file:
            json_file.write(json.dumps(data))
def determine_version(data: dict):
    """Select the mapper matching the layout of an Unternehmensregister export.

    Args:
        data (dict): Parsed export

    Returns:
        ``v1`` when the export has a top-level ``XJustiz_Daten`` key, ``v3``
        when its first top-level entry contains ``tns:nachrichtenkopf``.

    Raises:
        ValueError: If the export is empty or matches neither layout.
    """
    if "XJustiz_Daten" in data:
        return v1
    # Only the first top-level entry is inspected. An empty export or a root
    # that is not a mapping must end in the ValueError below rather than in an
    # IndexError/TypeError.
    root = next(iter(data.values()), None)
    if isinstance(root, dict) and "tns:nachrichtenkopf" in root:
        return v3
    raise ValueError("Could not determine Unternehmensregister version.")
def map_unternehmensregister_json(data: dict) -> Company:
    """Map a structured Unternehmensregister export to a Company.

    The export layout is detected first; the mapping itself is delegated to
    the mapper returned for that layout.

    Args:
        data (dict): Data export

    Returns:
        Company: Transformed data
    """
    return determine_version(data).map_unternehmensregister_json(data)
if __name__ == "__main__":
    # Ad-hoc batch run: map every exported JSON file below base_path/export
    # and write the resulting Company as JSON to base_path/transformed.
    from loguru import logger

    base_path = "./Jupyter/API-tests/Unternehmensregister/data/Unternehmensregister"
    # NOTE(review): glob.glob1 is an undocumented helper of the glob module.
    for file in tqdm(glob.glob1(f"{base_path}/export", "*.json")):
        path = os.path.join(f"{base_path}/export", file)
        with open(path, encoding="utf-8") as file_object:
            try:
                company: Company = map_unternehmensregister_json(
                    json.loads(file_object.read())
                )
                # Output file name: the alphanumeric characters of the company
                # name, cut off after 50 characters.
                name = "".join(e for e in company.name if e.isalnum())[:50]
                with open(
                    f"{base_path}/transformed/{name}.json",
                    "w+",
                    encoding="utf-8",
                ) as export_file:
                    json.dump(
                        dataclasses.asdict(company), export_file, ensure_ascii=False
                    )
            except Exception as e:
                # Log the failure and stop the whole run at the first bad file.
                logger.error(e)
                logger.error(f"Error in processing {path}")
                sys.exit(1)

View File

@ -0,0 +1,569 @@
"""Transform raw Unternehmensregister export (*.xml) to processed .json files for loading."""
import dataclasses
import glob
import json
import os
import re
import sys
import xmltodict
from tqdm import tqdm
from aki_prj23_transparenzregister.models.company import (
Capital,
CapitalTypeEnum,
Company,
CompanyID,
CompanyRelationship,
CompanyRelationshipEnum,
CompanyToCompanyRelationship,
CompanyTypeEnum,
CurrencyEnum,
DistrictCourt,
Location,
PersonName,
PersonToCompanyRelationship,
RelationshipRoleEnum,
)
from aki_prj23_transparenzregister.utils.string_tools import (
remove_traling_and_leading_quotes,
transform_date_to_iso,
)
def parse_date_of_birth(data: dict) -> str | None:
"""Retreives the date of birth from a stakeholder entry if possible.
Args:
data (dict): Stakeholder data
Returns:
str | None: date of birth or None if not found
"""
if "Geburt" in (base := data["Beteiligter"]["Natuerliche_Person"]):
base = base["Geburt"]["Geburtsdatum"]
if isinstance(base, str):
return base
return None
def parse_stakeholder(data: dict) -> CompanyRelationship | None:
    """Build the relationship described by a single "Beteiligung" entry.

    Three shapes are handled: a natural person, a natural-person entry without
    a first name (mapped as a company), and an organisation.

    Args:
        data (dict): Data export

    Returns:
        CompanyRelationship | None: Relationship if it could be processed
    """

    def city_of(address):
        # A list of addresses contributes the city of its last element.
        if isinstance(address, list):
            return address[-1]["Ort"]
        return address["Ort"]

    party = data["Beteiligter"]

    if "Natuerliche_Person" in party:
        person = party["Natuerliche_Person"]
        full_name = person["Voller_Name"]
        # It's a Company serving as a "Kommanditist" or similar
        if full_name["Vorname"] is None:
            return CompanyToCompanyRelationship(  # type: ignore
                name=remove_traling_and_leading_quotes(full_name["Nachname"]),
                location=Location(city=city_of(person["Anschrift"])),
                role=RelationshipRoleEnum(
                    data["Rolle"]["Rollenbezeichnung"]["content"]
                ),
                type=CompanyRelationshipEnum.COMPANY,
            )
        return PersonToCompanyRelationship(  # type: ignore
            name=PersonName(
                firstname=full_name["Vorname"],
                lastname=full_name["Nachname"],
            ),
            date_of_birth=parse_date_of_birth(data),
            location=Location(city=city_of(person["Anschrift"])),
            role=RelationshipRoleEnum(data["Rolle"]["Rollenbezeichnung"]["content"]),
            type=CompanyRelationshipEnum.PERSON,
        )

    if "Organisation" in party:
        organisation = party["Organisation"]
        role = RelationshipRoleEnum(data["Rolle"]["Rollenbezeichnung"]["content"])
        name = remove_traling_and_leading_quotes(
            organisation["Bezeichnung"]["Bezeichnung_Aktuell"]
        )
        address = organisation["Anschrift"]
        # Only the city is mandatory; the other address fields default to None.
        location = Location(
            city=address["Ort"],
            street=address["Strasse"] if "Strasse" in address else None,
            house_number=address["Hausnummer"] if "Hausnummer" in address else None,
            zip_code=address["Postleitzahl"] if "Postleitzahl" in address else None,
        )
        return CompanyToCompanyRelationship(  # type: ignore
            role=role,
            name=name,
            location=location,
            type=CompanyRelationshipEnum.COMPANY,
        )

    return None
def normalize_street(street: str) -> str:
    """Expand abbreviated or ss-spelled street suffixes to `Straße` / `straße`.

    Args:
        street (str): Name of street

    Returns:
        str: Normalized street name with surrounding whitespace removed;
        None is passed through unchanged
    """
    if street is None:
        return None
    # Capitalised spellings first, then the lower-case ones.
    for pattern, replacement in (
        (r"(Str\.|Strasse)", "Straße"),
        (r"(str\.|strasse)", "straße"),
    ):
        street = re.sub(pattern, replacement, street)
    return street.strip()
def loc_from_beteiligung(data: dict) -> Location:
    """Extract the company location from the first relationship in the export.

    Digits at the end of the street field are split off as house number; a
    separate ``Hausnummer`` field is appended to them, or used on its own when
    the street carries no trailing digits.

    Args:
        data (dict): Data export

    Returns:
        Location: location
    """
    base = data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"]["Beteiligung"][0][
        "Beteiligter"
    ]["Organisation"]["Anschrift"]
    house_number = None
    street = None
    if "Strasse" in base:
        # Digits at the very end of the field, preceded by any one character.
        regex = r".(\d+)$"
        hits = re.findall(regex, base["Strasse"])
        if len(hits) == 1:
            house_number = hits[0]
            # Cut exactly the captured digits off the end of the street.
            street = base["Strasse"][: (-1 * len(house_number))]
            if "Hausnummer" in base:
                # Plain string concatenation, no separator in between.
                house_number = house_number + base["Hausnummer"]
        else:
            if "Hausnummer" in base:
                house_number = base["Hausnummer"]
            street = base["Strasse"]
    return Location(
        **{
            "city": base["Ort"],
            "zip_code": base["Postleitzahl"],
            "street": normalize_street(street),  # type: ignore
            "house_number": house_number,
        }
    )
def name_from_beteiligung(data: dict) -> str:
    """Read the company name from the first relationship of an export.

    Args:
        data (dict): Data export

    Returns:
        str: Company name, passed through remove_traling_and_leading_quotes
    """
    participants = data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"]["Beteiligung"]
    organisation = participants[0]["Beteiligter"]["Organisation"]
    return remove_traling_and_leading_quotes(
        organisation["Bezeichnung"]["Bezeichnung_Aktuell"]
    )
def map_rechtsform(company_name: str, data: dict) -> CompanyTypeEnum | None:
    """Determine the company type of an Unternehmensregister export.

    The structured ``Rechtsform`` entry is used when present; when it is
    missing the type is guessed from the suffix of the company name.

    Args:
        company_name (str): Name of the company as a fallback solution
        data (dict): Data export

    Returns:
        CompanyTypeEnum | None: Company type if found
    """
    try:
        legal_entity = data["XJustiz_Daten"]["Fachdaten_Register"][
            "Basisdaten_Register"
        ]["Rechtstraeger"]
        return CompanyTypeEnum(legal_entity["Rechtsform"]["content"])
    except KeyError:
        # No structured entry: fall back to well-known name suffixes.
        if company_name.endswith(("GmbH", "UG", "UG (haftungsbeschränkt)")):
            return CompanyTypeEnum("Gesellschaft mit beschränkter Haftung")
        if company_name.endswith("SE"):
            return CompanyTypeEnum("Europäische Aktiengesellschaft (SE)")
        if company_name.endswith("KG"):
            return CompanyTypeEnum("Kommanditgesellschaft")
        return None
def map_capital(data: dict, company_type: CompanyTypeEnum) -> Capital | None:
    """Read the company capital from an Unternehmensregister export.

    Args:
        data (dict): Data export
        company_type (CompanyTypeEnum): Type of company (e.g., 'Gesellschaft mit beschränkter Haftung')

    Returns:
        Capital | None: Company Capital if found
    """
    register = data["XJustiz_Daten"]["Fachdaten_Register"]
    # Early return
    if "Zusatzangaben" not in register:
        return None
    extras = register["Zusatzangaben"]

    capital: dict = {"Zahl": 0.0, "Waehrung": ""}
    if company_type == CompanyTypeEnum.KG:
        capital_type = "Hafteinlage"
        limited_partners = extras["Personengesellschaft"]["Zusatz_KG"][
            "Daten_Kommanditist"
        ]
        if isinstance(limited_partners, list):
            # Sum up all contributions; the currency of the last entry wins.
            for entry in limited_partners:
                # TODO link to persons using Ref_Rollennummer then extract ["Hafteinlage"] as below
                contribution = entry["Hafteinlage"]
                capital["Zahl"] = capital["Zahl"] + float(contribution["Zahl"])
                capital["Waehrung"] = contribution["Waehrung"]
        elif isinstance(limited_partners, dict):
            capital = limited_partners["Hafteinlage"]
    elif company_type in (
        CompanyTypeEnum.GMBH,
        CompanyTypeEnum.SE,
        CompanyTypeEnum.AG,
        CompanyTypeEnum.KGaA,
        CompanyTypeEnum.AUSLAENDISCHE_RECHTSFORM,
        CompanyTypeEnum.OHG,
    ):
        section = (
            "Kapitalgesellschaft"
            if "Kapitalgesellschaft" in extras
            else "Personengesellschaft"
        )
        base = extras[section]
        if "Zusatz_GmbH" in base:
            capital_type = "Stammkapital"
            capital = base["Zusatz_GmbH"]["Stammkapital"]
        elif "Zusatz_Aktiengesellschaft" in base:
            capital_type = "Grundkapital"
            capital = base["Zusatz_Aktiengesellschaft"]["Grundkapital"]["Hoehe"]
    elif company_type in (
        CompanyTypeEnum.EINZELKAUFMANN,
        CompanyTypeEnum.EG,
        CompanyTypeEnum.PARTNERSCHAFT,
        CompanyTypeEnum.PARTNERGESELLSCHAFT,
        CompanyTypeEnum.PARTNERSCHAFTSGESELLSCHAFT,
        None,
    ):
        return None

    # Catch entries having the dict but with null values
    if not all(capital.values()):
        return None
    return Capital(  # type: ignore
        value=float(capital["Zahl"]),
        currency=CurrencyEnum(capital["Waehrung"]),
        type=CapitalTypeEnum(capital_type),
    )
def map_business_purpose(data: dict) -> str | None:
"""Extracts the "Geschäftszweck" from a given Unternehmensregister export.
Args:
data (dict): Data export
Returns:
str | None: Business purpose if found
"""
try:
return data["XJustiz_Daten"]["Fachdaten_Register"]["Basisdaten_Register"][
"Gegenstand_oder_Geschaeftszweck"
]
except KeyError:
return None
def extract_date_from_string(value: str) -> str | None:
    """Pull a single date out of a text and return it in ISO format.

    Two notations are looked for: ``d.m.yyyy`` (converted with
    transform_date_to_iso) and ``yyyy-m-d`` (returned as found). A notation
    only counts when it occurs exactly once, and a result is only returned
    when exactly one notation counts.

    Args:
        value (str): Input text

    Returns:
        str | None: Date in ISO format, None if not found
    """
    candidates = []

    dotted = re.findall(r"\d{1,2}\.\d{1,2}\.\d{4}", value)
    if len(dotted) == 1:
        candidates.append(transform_date_to_iso(dotted[0]))

    dashed = re.findall(r"\d{4}-\d{1,2}-\d{1,2}", value)
    if len(dashed) == 1:
        candidates.append(dashed[0])

    return candidates[0] if len(candidates) == 1 else None
def map_founding_date(data: dict) -> str | None:
"""Extracts the founding date from a given Unternehmensregister export.
Args:
data (dict): Data export
Returns:
str | None: Founding date if found
"""
text = str(data)
entry_date = re.findall(
r".Tag der ersten Eintragung:(\\n| )?(\d{1,2}\.\d{1,2}\.\d{2,4})", text
)
if len(entry_date) == 1:
return transform_date_to_iso(entry_date[0][1])
entry_date = re.findall(
r".Gesellschaftsvertrag vom (\d{1,2}\.\d{1,2}\.\d{2,4})", text
)
if len(entry_date) == 1:
return transform_date_to_iso(entry_date[0])
if (
"Gruendungsmetadaten"
in data["XJustiz_Daten"]["Fachdaten_Register"]["Basisdaten_Register"]
):
return extract_date_from_string(
data["XJustiz_Daten"]["Fachdaten_Register"]["Basisdaten_Register"][
"Gruendungsmetadaten"
]["Gruendungsdatum"]
)
# No reliable answer
return None
def map_company_id(data: dict) -> CompanyID:
    """Build the company ID (register number and district court) of an export.

    The court is read from the second "Beteiligung" entry, which may be given
    either as an organisation or as a natural person.

    Args:
        data (dict): Data export

    Returns:
        CompanyID: ID of the company
    """
    proceedings = data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"]
    hr_number = proceedings["Instanzdaten"]["Aktenzeichen"]

    court = proceedings["Beteiligung"][1]["Beteiligter"]
    if "Organisation" in court:
        organisation = court["Organisation"]
        court_name = organisation["Bezeichnung"]["Bezeichnung_Aktuell"]
        court_city = organisation["Sitz"]["Ort"]
    else:
        person = court["Natuerliche_Person"]
        court_name = person["Voller_Name"]["Nachname"]
        court_city = person["Anschrift"]["Ort"]

    return CompanyID(
        hr_number=hr_number,
        district_court=DistrictCourt(name=court_name, city=court_city),
    )
def map_last_update(data: dict) -> str:
    """Read the date of the last register entry from an export.

    Args:
        data (dict): Unternehmensregister export

    Returns:
        str: Last update date
    """
    register_extract = data["XJustiz_Daten"]["Fachdaten_Register"]["Auszug"]
    return register_extract["letzte_Eintragung"]
def map_co_relation(data: dict) -> dict:
    """Search for and map the c/o relation from location.street if possible.

    The street is split at commas. A part starting with ``c/o`` is removed
    from the street and, if it names something, appended to
    ``data["relationships"]`` as a CARE_OF relationship. The passed dict and
    its location object are modified in place.

    Args:
        data (dict): Company dict

    Returns:
        dict: Modified Company dict
    """
    street = data["location"].street
    if street is None:
        return data
    parts = street.split(",")
    co_company = None
    co_company_index = None
    for index, part in enumerate(parts):
        trimmed_part = part.strip()
        result = re.findall(r"^c\/o(.*)$", trimmed_part)
        if len(result) == 1:
            # No break: if several parts match, the last one is kept.
            co_company = result[0].strip()
            co_company_index = index
    if co_company_index is not None:
        del parts[co_company_index]
    # The remaining parts are joined without a separator, so the commas of
    # the original street are dropped as well.
    street = "".join(parts).strip()
    data["location"].street = street
    # A bare "c/o" with nothing behind it does not create a relationship.
    if co_company is not None and co_company != "":
        relation = CompanyToCompanyRelationship(
            RelationshipRoleEnum.CARE_OF,  # type: ignore
            Location(
                data["location"].city,
                street,
                data["location"].house_number,
                data["location"].zip_code,
            ),
            CompanyRelationshipEnum.COMPANY,  # type: ignore
            co_company,
        )
        data["relationships"].append(relation)
    return data
def map_unternehmensregister_json(data: dict) -> Company:
    """Assemble a Company from a structured Unternehmensregister export.

    Each attribute is produced by its own helper; stakeholders are taken from
    the third "Beteiligung" entry onwards, and a c/o relation found in the
    street is added last.

    Args:
        data (dict): Data export

    Returns:
        Company: Transformed data
    """
    # TODO Refactor mapping - this is a nightmare...
    company_id = map_company_id(data)
    company_name = name_from_beteiligung(data)
    location = loc_from_beteiligung(data)
    last_update = map_last_update(data)
    company_type = map_rechtsform(company_name, data)
    capital = map_capital(data, company_type)
    business_purpose = map_business_purpose(data)
    founding_date = map_founding_date(data)

    participants = data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"]["Beteiligung"]
    # Entries 0 and 1 are consumed by the name/location and company-id helpers.
    # NOTE(review): parse_stakeholder may return None, which is stored as is.
    relationships = [parse_stakeholder(entry) for entry in participants[2:]]

    result: dict = {
        "relationships": relationships,
        "id": company_id,
        "name": company_name,
        "location": location,
        "last_update": last_update,
        "company_type": company_type,
        "capital": capital,
        "business_purpose": business_purpose,
        "founding_date": founding_date,
    }
    return Company(**map_co_relation(result))
if __name__ == "__main__":
    # Ad-hoc batch run: map every exported JSON file below base_path/export
    # and write the resulting Company as JSON to base_path/transformed.
    from loguru import logger

    base_path = "./Jupyter/API-tests/Unternehmensregister/data/Unternehmensregister"
    # NOTE(review): glob.glob1 is an undocumented helper of the glob module.
    for file in tqdm(glob.glob1(f"{base_path}/export", "*.json")):
        path = os.path.join(f"{base_path}/export", file)
        with open(path, encoding="utf-8") as file_object:
            try:
                company: Company = map_unternehmensregister_json(
                    json.loads(file_object.read())
                )
                # Output file name: the alphanumeric characters of the company
                # name, cut off after 50 characters.
                name = "".join(e for e in company.name if e.isalnum())[:50]
                with open(
                    f"{base_path}/transformed/{name}.json",
                    "w+",
                    encoding="utf-8",
                ) as export_file:
                    json.dump(
                        dataclasses.asdict(company), export_file, ensure_ascii=False
                    )
            except Exception as e:
                # Log the failure and stop the whole run at the first bad file.
                logger.error(e)
                logger.error(f"Error in processing {path}")
                sys.exit(1)

View File

@ -0,0 +1,34 @@
import os
import xmltodict
from pathlib import Path
from aki_prj23_transparenzregister.models.company import RelationshipRoleEnum
class RoleMapper:
    """Translate role codes into RelationshipRoleEnum members.

    The code table is read from the bundled
    ``xjustiz_0040_cl_rollenbezeichnung_3_3.xsd`` file: each enumeration
    entry's ``@value`` is mapped to the ``wert`` found in its annotation.
    """

    # Shared instance handed out by mapper(); created on first use.
    singleton = None

    def __init__(self):
        """Load the code table from the ``assets`` directory next to this module."""
        # TODO Automated file retrieval
        schema_path = os.path.join(
            os.path.dirname(Path(__file__)),
            "assets",
            "xjustiz_0040_cl_rollenbezeichnung_3_3.xsd",
        )
        with open(schema_path, encoding="utf-8") as schema_file:
            schema = xmltodict.parse(schema_file.read())

        restriction = schema["xs:schema"]["xs:simpleType"]["xs:restriction"]
        self.dictionary = {
            entry["@value"]: entry["xs:annotation"]["xs:appinfo"]["wert"]
            for entry in restriction["xs:enumeration"]
        }

    @staticmethod
    def mapper():
        """Return the shared RoleMapper, creating it on the first call."""
        instance = RoleMapper.singleton
        if instance is None:
            instance = RoleMapper.singleton = RoleMapper()
        return instance

    def get(self, key: str) -> RelationshipRoleEnum:
        """Look up the role for a code.

        Args:
            key (str): Role code as found in the export

        Returns:
            RelationshipRoleEnum: Role belonging to the code

        Raises:
            KeyError: If the code is not part of the loaded table.
        """
        role_name = self.dictionary[key]
        return RelationshipRoleEnum(role_name)
if __name__ == '__main__':
    # Manual smoke test: load the code table and print the role for code "201".
    # A fresh instance is built here; the shared one from mapper() is not used.
    mapper = RoleMapper()
    print(mapper.get("201"))

View File

@ -30,28 +30,9 @@ from aki_prj23_transparenzregister.utils.string_tools import (
transform_date_to_iso, transform_date_to_iso,
) )
from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v3.role_mapper import (
def transform_xml_to_json(source_dir: str, target_dir: str) -> None: RoleMapper,
"""Convert all xml files in a directory to json files. )
Args:
source_dir (str): Directory hosting the xml files
target_dir (str): Target directory to move json files to
"""
if not os.path.exists(target_dir):
os.makedirs(target_dir)
for source_path in [
os.path.normpath(i) for i in glob.glob(source_dir + "**/*.xml", recursive=True)
]:
target_path = os.path.join(
target_dir, source_path.split(os.sep)[-1].replace(".xml", ".json")
)
with open(source_path, encoding="utf-8") as source_file:
# deepcode ignore HandleUnicode: Weird XML format no other solution
data = xmltodict.parse(source_file.read().encode())
with open(target_path, "w", encoding="utf-8") as json_file:
json_file.write(json.dumps(data))
def parse_date_of_birth(data: dict) -> str | None: def parse_date_of_birth(data: dict) -> str | None:
@ -63,22 +44,20 @@ def parse_date_of_birth(data: dict) -> str | None:
Returns: Returns:
str | None: date of birth or None if not found str | None: date of birth or None if not found
""" """
if "tns:geburt" in (base := data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"]): if "tns:geburt" in (
base := data["tns:beteiligter"]["tns:auswahl_beteiligter"][
"tns:natuerlichePerson"
]
):
base = base["tns:geburt"]["tns:geburtsdatum"] base = base["tns:geburt"]["tns:geburtsdatum"]
if isinstance(base, str): if isinstance(base, str):
return base return base
return None return None
def map_role_id_to_enum(role_id: str) -> RelationshipRoleEnum: def map_role_id_to_enum(role_id: str) -> RelationshipRoleEnum:
match role_id: mapper = RoleMapper.mapper()
case "086": return mapper.get(role_id)
return RelationshipRoleEnum.GESCHAEFTSFUEHRER
case "285":
return RelationshipRoleEnum.PROKURIST
case "194":
return RelationshipRoleEnum.VORSTAND
case _:
raise KeyError(f'Uknown role_id: {role_id}')
def parse_stakeholder(data: dict) -> CompanyRelationship | None: def parse_stakeholder(data: dict) -> CompanyRelationship | None:
@ -92,100 +71,120 @@ def parse_stakeholder(data: dict) -> CompanyRelationship | None:
""" """
if "tns:natuerlichePerson" in data["tns:beteiligter"]["tns:auswahl_beteiligter"]: if "tns:natuerlichePerson" in data["tns:beteiligter"]["tns:auswahl_beteiligter"]:
# It's a Company serving as a "Kommanditist" or similar # It's a Company serving as a "Kommanditist" or similar
# if data["Beteiligter"]["Natuerliche_Person"]["Voller_Name"]["Vorname"] is None: if (
# return CompanyToCompanyRelationship( "tns:vorname"
# **{ # type: ignore not in data["tns:beteiligter"]["tns:auswahl_beteiligter"][
# "name": remove_traling_and_leading_quotes( "tns:natuerlichePerson"
# data["Beteiligter"]["Natuerliche_Person"]["Voller_Name"][ ]["tns:vollerName"]
# "Nachname" ):
# ] return CompanyToCompanyRelationship(
# ), **{ # type: ignore
# "location": Location( "name": remove_traling_and_leading_quotes(
# **{ data["tns:beteiligter"]["tns:auswahl_beteiligter"][
# "city": data["Beteiligter"]["Natuerliche_Person"][ "tns:natuerlichePerson"
# "Anschrift" ]["tns:vollerName"]["tns:nachname"]
# ][-1]["Ort"] ),
# if isinstance( "location": Location(
# data["Beteiligter"]["Natuerliche_Person"]["Anschrift"], **{
# list, "city": data["tns:beteiligter"]["tns:auswahl_beteiligter"][
# ) "tns:natuerlichePerson"
# else data["Beteiligter"]["Natuerliche_Person"]["Anschrift"][ ]["tns:anschrift"][-1]["tns:ort"]
# "Ort" if isinstance(
# ] data["tns:beteiligter"]["tns:auswahl_beteiligter"][
# } "tns:natuerlichePerson"
# ), ]["tns:anschrift"],
# "role": RelationshipRoleEnum( list,
# data["Rolle"]["Rollenbezeichnung"]["content"] )
# ), else data["tns:beteiligter"]["tns:auswahl_beteiligter"][
# "type": CompanyRelationshipEnum.COMPANY, "tns:natuerlichePerson"
# } ]["tns:anschrift"]["tns:ort"]
# ) }
),
"role": map_role_id_to_enum(
data["tns:rolle"]["tns:rollenbezeichnung"]["code"]
),
"type": CompanyRelationshipEnum.COMPANY,
}
)
return PersonToCompanyRelationship( return PersonToCompanyRelationship(
**{ # type: ignore **{ # type: ignore
"name": PersonName( "name": PersonName(
**{ **{
"firstname": data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"][ "firstname": data["tns:beteiligter"]["tns:auswahl_beteiligter"][
"tns:vollerName" "tns:natuerlichePerson"
]["tns:vorname"], ]["tns:vollerName"]["tns:vorname"],
"lastname": data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"][ "lastname": data["tns:beteiligter"]["tns:auswahl_beteiligter"][
"tns:vollerName" "tns:natuerlichePerson"
]["tns:nachname"], ]["tns:vollerName"]["tns:nachname"],
} }
), ),
"date_of_birth": parse_date_of_birth(data), "date_of_birth": parse_date_of_birth(data),
"location": Location( "location": Location(
**{ **{
"city": data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"]["tns:anschrift"][ "city": data["tns:beteiligter"]["tns:auswahl_beteiligter"][
-1 "tns:natuerlichePerson"
]["tns:ort"] ]["tns:anschrift"][-1]["tns:ort"]
if isinstance( if isinstance(
data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"]["tns:anschrift"], list data["tns:beteiligter"]["tns:auswahl_beteiligter"][
"tns:natuerlichePerson"
]["tns:anschrift"],
list,
) )
else data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"]["tns:anschrift"][ else data["tns:beteiligter"]["tns:auswahl_beteiligter"][
"tns:ort" "tns:natuerlichePerson"
] ]["tns:anschrift"]["tns:ort"]
} }
), ),
# TODO get role via ID
"role": map_role_id_to_enum( "role": map_role_id_to_enum(
data["tns:rolle"]["tns:rollenbezeichnung"]["code"] data["tns:rolle"]["tns:rollenbezeichnung"]["code"]
), ),
"type": CompanyRelationshipEnum.PERSON, "type": CompanyRelationshipEnum.PERSON,
} }
) )
if "Organisation" in data["Beteiligter"]: if "tns:organisation" in data["tns:beteiligter"]["tns:auswahl_beteiligter"]:
base = data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:organisation"]
# Prefer the postal address; organisations exported without one fall back to
# their registered seat ("tns:sitz"). Both are read with the same keys.
address = base["tns:anschrift"] if "tns:anschrift" in base else base["tns:sitz"]
location = Location(
    **{
        "city": address["tns:ort"],
        "street": address["tns:strasse"] if "tns:strasse" in address else None,
        "house_number": address["tns:hausnummer"]
        if "tns:hausnummer" in address
        else None,
        # The membership test used to check the misspelled key
        # "tns:potsleitzahl", so the zip code was always dropped.
        "zip_code": address["tns:postleitzahl"]
        if "tns:postleitzahl" in address
        else None,
    }
)
return CompanyToCompanyRelationship( return CompanyToCompanyRelationship(
**{ # type: ignore **{ # type: ignore
"role": RelationshipRoleEnum( "role": map_role_id_to_enum(
data["Rolle"]["Rollenbezeichnung"]["content"] data["tns:rolle"]["tns:rollenbezeichnung"]["code"]
), ),
"name": remove_traling_and_leading_quotes( "name": remove_traling_and_leading_quotes(
data["Beteiligter"]["Organisation"]["Bezeichnung"][ base["tns:bezeichnung"]["tns:bezeichnung.aktuell"]
"Bezeichnung_Aktuell"
]
),
"location": Location(
**{
"city": data["Beteiligter"]["Organisation"]["Anschrift"]["Ort"],
"street": data["Beteiligter"]["Organisation"]["Anschrift"][
"Strasse"
]
if "Strasse" in data["Beteiligter"]["Organisation"]["Anschrift"]
else None,
"house_number": data["Beteiligter"]["Organisation"][
"Anschrift"
]["Hausnummer"]
if "Hausnummer"
in data["Beteiligter"]["Organisation"]["Anschrift"]
else None,
"zip_code": data["Beteiligter"]["Organisation"]["Anschrift"][
"Postleitzahl"
]
if "Postleitzahl"
in data["Beteiligter"]["Organisation"]["Anschrift"]
else None,
}
), ),
"location": location,
"type": CompanyRelationshipEnum.COMPANY, "type": CompanyRelationshipEnum.COMPANY,
} }
) )
@ -227,10 +226,16 @@ def loc_from_beteiligung(data: dict) -> Location:
"tns:beteiligter", "tns:beteiligter",
"tns:auswahl_beteiligter", "tns:auswahl_beteiligter",
"tns:organisation", "tns:organisation",
"tns:anschrift" # "tns:anschrift",
] ]
base = traversal(data, base_path) base = traversal(data, base_path)
if "tns:anschrift" in base:
base = base["tns:anschrift"]
else:
base = base["tns:sitz"]
if isinstance(base, list):
base = base[0]
house_number = None house_number = None
street = None street = None
if "tns:strasse" in base: if "tns:strasse" in base:
@ -273,7 +278,7 @@ def name_from_beteiligung(data: dict) -> str:
"tns:auswahl_beteiligter", "tns:auswahl_beteiligter",
"tns:organisation", "tns:organisation",
"tns:bezeichnung", "tns:bezeichnung",
"tns:bezeichnung.aktuell" "tns:bezeichnung.aktuell",
] ]
name = traversal(data, path) name = traversal(data, path)
return remove_traling_and_leading_quotes(name) return remove_traling_and_leading_quotes(name)
@ -296,11 +301,9 @@ def map_rechtsform(company_name: str, data: dict) -> CompanyTypeEnum | None:
"tns:rechtstraeger", "tns:rechtstraeger",
"tns:angabenZurRechtsform", "tns:angabenZurRechtsform",
"tns:rechtsform", "tns:rechtsform",
"code" "code",
] ]
return CompanyTypeEnum( return CompanyTypeEnum(traversal(data, path))
traversal(data, path)
)
except Exception: except Exception:
if ( if (
company_name.endswith("GmbH") company_name.endswith("GmbH")
@ -328,8 +331,8 @@ def map_capital(data: dict, company_type: CompanyTypeEnum) -> Capital | None:
# Early return # Early return
if "tns:auswahl_zusatzangaben" not in data["tns:fachdatenRegister"]: if "tns:auswahl_zusatzangaben" not in data["tns:fachdatenRegister"]:
return None return None
capital: dict = {"Zahl": 0.0, "Waehrung": ""} capital: dict = {"tns:zahl": 0.0, "tns:waehrung": {"code": None}}
if company_type == CompanyTypeEnum.KG: if company_type == CompanyTypeEnum.KG and "tns:personengesellschaft" in data["tns:fachdatenRegister"]["tns:auswahl_zusatzangaben"]:
capital_type = "Hafteinlage" capital_type = "Hafteinlage"
base = data["tns:fachdatenRegister"]["tns:auswahl_zusatzangaben"][ base = data["tns:fachdatenRegister"]["tns:auswahl_zusatzangaben"][
"tns:personengesellschaft" "tns:personengesellschaft"
@ -337,10 +340,14 @@ def map_capital(data: dict, company_type: CompanyTypeEnum) -> Capital | None:
if isinstance(base, list): if isinstance(base, list):
for entry in base: for entry in base:
# TODO link to persons using Ref_Rollennummer then extract ["Hafteinlage"] as below # TODO link to persons using Ref_Rollennummer then extract ["Hafteinlage"] as below
capital["Zahl"] = capital["Zahl"] + float(entry["Hafteinlage"]["Zahl"]) capital["tns:zahl"] = capital["tns:zahl"] + float(
capital["Waehrung"] = entry["Hafteinlage"]["Waehrung"] entry["tns:hafteinlage"]["tns:zahl"]
)
capital["tns:waehrung"]["code"] = entry["tns:hafteinlage"][
"tns:waehrung"
]["code"]
elif isinstance(base, dict): elif isinstance(base, dict):
capital = base["Hafteinlage"] capital = base["tns:hafteinlage"]
elif company_type in [ elif company_type in [
CompanyTypeEnum.GMBH, CompanyTypeEnum.GMBH,
CompanyTypeEnum.SE, CompanyTypeEnum.SE,
@ -365,7 +372,9 @@ def map_capital(data: dict, company_type: CompanyTypeEnum) -> Capital | None:
capital = base["tns:zusatzGmbH"]["tns:stammkapital"] capital = base["tns:zusatzGmbH"]["tns:stammkapital"]
elif "tns:zusatzAktiengesellschaft" in base: elif "tns:zusatzAktiengesellschaft" in base:
capital_type = "Grundkapital" capital_type = "Grundkapital"
capital = base["tns:zusatzAktiengesellschaft"]["tns:grundkapital"]["tns:hoehe"] capital = base["tns:zusatzAktiengesellschaft"]["tns:grundkapital"][
"tns:hoehe"
]
elif company_type in [ elif company_type in [
CompanyTypeEnum.EINZELKAUFMANN, CompanyTypeEnum.EINZELKAUFMANN,
CompanyTypeEnum.EG, CompanyTypeEnum.EG,
@ -397,11 +406,7 @@ def map_business_purpose(data: dict) -> str | None:
str | None: Business purpose if found str | None: Business purpose if found
""" """
try: try:
path = [ path = ["tns:fachdatenRegister", "tns:basisdatenRegister", "tns:gegenstand"]
"tns:fachdatenRegister",
"tns:basisdatenRegister",
"tns:gegenstand"
]
return traversal(data, path) return traversal(data, path)
except KeyError: except KeyError:
return None return None
@ -455,20 +460,18 @@ def map_founding_date(data: dict) -> str | None:
) )
if len(entry_date) == 1: if len(entry_date) == 1:
return transform_date_to_iso(entry_date[0]) return transform_date_to_iso(entry_date[0])
if ( if "tns:satzungsdatum" in data["tns:fachdatenRegister"]["tns:basisdatenRegister"]:
"tns:satzungsdatum"
in data["tns:fachdatenRegister"]["tns:basisdatenRegister"]
):
path = [ path = [
"tns:fachdatenRegister", "tns:fachdatenRegister",
"tns:basisdatenRegister", "tns:basisdatenRegister",
"tns:satzungsdatum", "tns:satzungsdatum",
"tns:aktuellesSatzungsdatum" "tns:aktuellesSatzungsdatum",
] ]
return traversal(data, path) return traversal(data, path)
# No reliable answer # No reliable answer
return None return None
def traversal(data: dict, path: list[str | int]) -> any: def traversal(data: dict, path: list[str | int]) -> any:
current = data current = data
for key in path: for key in path:
@ -484,15 +487,14 @@ def map_hr_number(data: dict) -> str:
"tns:aktenzeichen" "tns:aktenzeichen"
]["tns:auswahl_aktenzeichen"] ]["tns:auswahl_aktenzeichen"]
if "tns:aktenzeichen.strukturiert" in base: if "tns:aktenzeichen.strukturiert" in base:
hr_prefix = base["tns:aktenzeichen.strukturiert"]["tns:register"][ hr_prefix = base["tns:aktenzeichen.strukturiert"]["tns:register"]["code"]
"code"
]
hr_number = base["tns:aktenzeichen.strukturiert"]["tns:laufendeNummer"] hr_number = base["tns:aktenzeichen.strukturiert"]["tns:laufendeNummer"]
return f"{hr_prefix} {hr_number}" return f"{hr_prefix} {hr_number}"
elif "tns:aktenzeichen.freitext" in base: elif "tns:aktenzeichen.freitext" in base:
return base["tns:aktenzeichen.freitext"] return base["tns:aktenzeichen.freitext"]
return hr_full return hr_full
def map_district_court(data: dict) -> DistrictCourt: def map_district_court(data: dict) -> DistrictCourt:
base_path = [ base_path = [
"tns:grunddaten", "tns:grunddaten",
@ -501,17 +503,11 @@ def map_district_court(data: dict) -> DistrictCourt:
1, 1,
"tns:beteiligter", "tns:beteiligter",
"tns:auswahl_beteiligter", "tns:auswahl_beteiligter",
"tns:organisation" "tns:organisation",
]
path = [*base_path,
"tns:bezeichnung",
"tns:bezeichnung.aktuell"
] ]
path = [*base_path, "tns:bezeichnung", "tns:bezeichnung.aktuell"]
name = traversal(data, path) name = traversal(data, path)
path = [*base_path, path = [*base_path, "tns:anschrift", "tns:ort"]
"tns:anschrift",
"tns:ort"
]
city = traversal(data, path) city = traversal(data, path)
return DistrictCourt(name=name, city=city) return DistrictCourt(name=name, city=city)
@ -525,12 +521,14 @@ def map_company_id(data: dict) -> CompanyID:
Returns: Returns:
CompanyID: ID of the company CompanyID: ID of the company
""" """
return CompanyID( try:
**{ return CompanyID(
"hr_number": map_hr_number(data), **{"hr_number": map_hr_number(data), "district_court": map_district_court(data)}
"district_court": map_district_court(data) )
} except KeyError:
) hr_number = data["tns:grunddaten"]["tns:verfahrensdaten"]["tns:beteiligung"][0]["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:organisation"]["tns:registereintragung"]["tns:registernummer"]
district_court = map_district_court(data)
return CompanyID(hr_number=hr_number, district_court=district_court)
def map_last_update(data: dict) -> str: def map_last_update(data: dict) -> str:
@ -542,11 +540,7 @@ def map_last_update(data: dict) -> str:
Returns: Returns:
str: Last update date str: Last update date
""" """
path = [ path = ["tns:fachdatenRegister", "tns:auszug", "tns:letzteEintragung"]
"tns:fachdatenRegister",
"tns:auszug",
"tns:letzteEintragung"
]
return traversal(data, path) return traversal(data, path)

File diff suppressed because it is too large Load Diff