mirror of
https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-06-22 04:43:54 +02:00
checkpoint: Refactoring data-extraction from unternehmensregister to handle v1 and v3
This commit is contained in:
@ -24,8 +24,8 @@ from aki_prj23_transparenzregister.utils.sql import entities
|
|||||||
from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister import (
|
from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister import (
|
||||||
extract,
|
extract,
|
||||||
load,
|
load,
|
||||||
transform,
|
|
||||||
)
|
)
|
||||||
|
from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform import main as transform
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
@ -43,8 +43,8 @@ if __name__ == "__main__":
|
|||||||
parsed = parser.parse_args(sys.argv[1:])
|
parsed = parser.parse_args(sys.argv[1:])
|
||||||
configer_logger(namespace=parsed)
|
configer_logger(namespace=parsed)
|
||||||
config = parsed.config
|
config = parsed.config
|
||||||
session = connector.get_session(get_config_provider(config))
|
# session = connector.get_session(get_config_provider(config))
|
||||||
missing_companies = session.query(entities.MissingCompany).all()
|
# missing_companies = session.query(entities.MissingCompany).all()
|
||||||
|
|
||||||
counter = 0
|
counter = 0
|
||||||
# # Scrape data from unternehmensregister
|
# # Scrape data from unternehmensregister
|
||||||
@ -63,22 +63,24 @@ if __name__ == "__main__":
|
|||||||
for file in tqdm(glob.glob1(json_dir, "*.json")):
|
for file in tqdm(glob.glob1(json_dir, "*.json")):
|
||||||
path = os.path.join(json_dir, file)
|
path = os.path.join(json_dir, file)
|
||||||
with open(path, encoding="utf-8") as file_object:
|
with open(path, encoding="utf-8") as file_object:
|
||||||
try:
|
# try:
|
||||||
company = transform.map_unternehmensregister_json(
|
print(path)
|
||||||
json.loads(file_object.read())
|
company = transform.map_unternehmensregister_json(
|
||||||
|
json.loads(file_object.read())
|
||||||
|
)
|
||||||
|
|
||||||
|
name = "".join(e for e in company.name if e.isalnum())[:50]
|
||||||
|
|
||||||
|
with open(
|
||||||
|
f"{output_path}/{name}.json",
|
||||||
|
"w+",
|
||||||
|
encoding="utf-8",
|
||||||
|
) as export_file:
|
||||||
|
json.dump(
|
||||||
|
dataclasses.asdict(company), export_file, ensure_ascii=False
|
||||||
)
|
)
|
||||||
|
# except Exception as e:
|
||||||
name = "".join(e for e in company.name if e.isalnum())[:50]
|
# logger.error(e.with_traceback())
|
||||||
|
# logger.error(e)
|
||||||
with open(
|
# logger.error(f"Error in processing {path}")
|
||||||
f"{output_path}/{name}.json",
|
# sys.exit(1)
|
||||||
"w+",
|
|
||||||
encoding="utf-8",
|
|
||||||
) as export_file:
|
|
||||||
json.dump(
|
|
||||||
dataclasses.asdict(company), export_file, ensure_ascii=False
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(e)
|
|
||||||
logger.error(f"Error in processing {path}")
|
|
||||||
sys.exit(1)
|
|
@ -0,0 +1,81 @@
|
|||||||
|
"""Transform raw Unternehmensregister export (*.xml) to processed .json files for loading."""
|
||||||
|
import dataclasses
|
||||||
|
import glob
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import xmltodict
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1 import v1
|
||||||
|
from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v3 import v3
|
||||||
|
from aki_prj23_transparenzregister.models.company import Company
|
||||||
|
|
||||||
|
def transform_xml_to_json(source_dir: str, target_dir: str) -> None:
    """Convert all xml files in a directory to json files.

    The source directory is searched recursively. Every ``*.xml`` file found is
    parsed with ``xmltodict`` and written to ``target_dir`` under the same base
    name with a ``.json`` extension. The target directory is created if it does
    not exist yet.

    Args:
        source_dir (str): Directory hosting the xml files
        target_dir (str): Target directory to write the json files to
    """
    os.makedirs(target_dir, exist_ok=True)
    # Build the pattern from path components so that `**` is a component of its
    # own whether or not `source_dir` ends with a path separator.
    pattern = os.path.join(source_dir, "**", "*.xml")
    for source_path in (
        os.path.normpath(i) for i in glob.glob(pattern, recursive=True)
    ):
        # Swap only the extension; ".xml" elsewhere in the file name is kept.
        stem, _ = os.path.splitext(os.path.basename(source_path))
        target_path = os.path.join(target_dir, f"{stem}.json")

        with open(source_path, encoding="utf-8") as source_file:
            # deepcode ignore HandleUnicode: Weird XML format no other solution
            data = xmltodict.parse(source_file.read().encode())
        with open(target_path, "w", encoding="utf-8") as json_file:
            json_file.write(json.dumps(data))
|
||||||
|
|
||||||
|
def determine_version(data: dict):
    """Select the transformation module matching the layout of an export.

    Args:
        data (dict): Parsed Unternehmensregister export

    Returns:
        The `v1` module if the export has a top-level "XJustiz_Daten" key, the `v3`
        module if the first top-level entry contains a "tns:nachrichtenkopf" key.

    Raises:
        ValueError: If neither layout is recognised; this includes an empty export
            and an export whose first entry is not a dict.
    """
    if "XJustiz_Daten" in data:
        return v1
    # Only the first top-level entry is inspected; `None` stands in for "no entry".
    root = next(iter(data.values()), None)
    if isinstance(root, dict) and "tns:nachrichtenkopf" in root:
        return v3
    raise ValueError("Could not determine Unternehmensregister version.")
|
||||||
|
|
||||||
|
def map_unternehmensregister_json(data: dict) -> Company:
    """Transform an Unternehmensregister structured export into a Company.

    The export layout is detected first and the mapping is then delegated to the
    `map_unternehmensregister_json` function of the matching version module.

    Args:
        data (dict): Data export

    Returns:
        Company: Transformed data
    """
    return determine_version(data).map_unternehmensregister_json(data)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    from loguru import logger

    # Hard-coded working directory holding the raw exports and the transformed output.
    base_path = "./Jupyter/API-tests/Unternehmensregister/data/Unternehmensregister"
    export_dir = f"{base_path}/export"
    # `glob.glob(..., root_dir=...)` is the documented replacement for the
    # undocumented, deprecated `glob.glob1`; it also returns bare file names.
    for file in tqdm(glob.glob("*.json", root_dir=export_dir)):
        path = os.path.join(export_dir, file)
        with open(path, encoding="utf-8") as file_object:
            try:
                company: Company = map_unternehmensregister_json(
                    json.loads(file_object.read())
                )

                # Output file name: alphanumeric characters of the company name,
                # truncated to 50 characters.
                name = "".join(e for e in company.name if e.isalnum())[:50]

                with open(
                    f"{base_path}/transformed/{name}.json",
                    "w+",
                    encoding="utf-8",
                ) as export_file:
                    json.dump(
                        dataclasses.asdict(company), export_file, ensure_ascii=False
                    )
            except Exception as e:
                # Abort the whole run at the first export that cannot be processed.
                logger.error(e)
                logger.error(f"Error in processing {path}")
                sys.exit(1)
|
@ -0,0 +1,569 @@
|
|||||||
|
"""Transform raw Unternehmensregister export (*.xml) to processed .json files for loading."""
|
||||||
|
import dataclasses
|
||||||
|
import glob
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import xmltodict
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
from aki_prj23_transparenzregister.models.company import (
|
||||||
|
Capital,
|
||||||
|
CapitalTypeEnum,
|
||||||
|
Company,
|
||||||
|
CompanyID,
|
||||||
|
CompanyRelationship,
|
||||||
|
CompanyRelationshipEnum,
|
||||||
|
CompanyToCompanyRelationship,
|
||||||
|
CompanyTypeEnum,
|
||||||
|
CurrencyEnum,
|
||||||
|
DistrictCourt,
|
||||||
|
Location,
|
||||||
|
PersonName,
|
||||||
|
PersonToCompanyRelationship,
|
||||||
|
RelationshipRoleEnum,
|
||||||
|
)
|
||||||
|
from aki_prj23_transparenzregister.utils.string_tools import (
|
||||||
|
remove_traling_and_leading_quotes,
|
||||||
|
transform_date_to_iso,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_date_of_birth(data: dict) -> str | None:
|
||||||
|
"""Retreives the date of birth from a stakeholder entry if possible.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
data (dict): Stakeholder data
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str | None: date of birth or None if not found
|
||||||
|
"""
|
||||||
|
if "Geburt" in (base := data["Beteiligter"]["Natuerliche_Person"]):
|
||||||
|
base = base["Geburt"]["Geburtsdatum"]
|
||||||
|
if isinstance(base, str):
|
||||||
|
return base
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def parse_stakeholder(data: dict) -> CompanyRelationship | None:
    """Extract the company stakeholder/relation from a single "Beteiligung".

    A "Natuerliche_Person" whose first name is None is mapped to a company
    relationship, any other "Natuerliche_Person" to a person relationship and an
    "Organisation" to a company relationship.

    Args:
        data (dict): Single "Beteiligung" entry of the data export

    Returns:
        CompanyRelationship | None: Relationship if it could be processed
    """
    participant = data["Beteiligter"]

    if "Natuerliche_Person" in participant:
        person = participant["Natuerliche_Person"]
        full_name = person["Voller_Name"]

        def city_of_person():
            # "Anschrift" may hold a list of addresses; its last entry is used then.
            address = person["Anschrift"]
            return address[-1]["Ort"] if isinstance(address, list) else address["Ort"]

        # It's a Company serving as a "Kommanditist" or similar
        if full_name["Vorname"] is None:
            return CompanyToCompanyRelationship(  # type: ignore
                name=remove_traling_and_leading_quotes(full_name["Nachname"]),
                location=Location(city=city_of_person()),
                role=RelationshipRoleEnum(
                    data["Rolle"]["Rollenbezeichnung"]["content"]
                ),
                type=CompanyRelationshipEnum.COMPANY,
            )

        return PersonToCompanyRelationship(  # type: ignore
            name=PersonName(
                firstname=full_name["Vorname"],
                lastname=full_name["Nachname"],
            ),
            date_of_birth=parse_date_of_birth(data),
            location=Location(city=city_of_person()),
            role=RelationshipRoleEnum(data["Rolle"]["Rollenbezeichnung"]["content"]),
            type=CompanyRelationshipEnum.PERSON,
        )

    if "Organisation" in participant:
        organisation = participant["Organisation"]
        role = RelationshipRoleEnum(data["Rolle"]["Rollenbezeichnung"]["content"])
        organisation_name = remove_traling_and_leading_quotes(
            organisation["Bezeichnung"]["Bezeichnung_Aktuell"]
        )
        address = organisation["Anschrift"]
        return CompanyToCompanyRelationship(  # type: ignore
            role=role,
            name=organisation_name,
            location=Location(
                city=address["Ort"],
                # Street, house number and zip code are optional in the export
                street=address.get("Strasse"),
                house_number=address.get("Hausnummer"),
                zip_code=address.get("Postleitzahl"),
            ),
            type=CompanyRelationshipEnum.COMPANY,
        )

    # Neither a natural person nor an organisation
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_street(street: str) -> str:
|
||||||
|
"""Normalize street names by extending them to `Straße` or `straße`.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
street (str): Name of street
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: Normalized street name
|
||||||
|
"""
|
||||||
|
if street is None:
|
||||||
|
return None
|
||||||
|
regex = r"(Str\.|Strasse)"
|
||||||
|
street = re.sub(regex, "Straße", street)
|
||||||
|
regex = r"(str\.|strasse)"
|
||||||
|
street = re.sub(regex, "straße", street)
|
||||||
|
return street.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def loc_from_beteiligung(data: dict) -> Location:
    """Extract the company location from the first relationship in the export.

    Digits at the very end of the "Strasse" field are split off as the house number;
    a separate "Hausnummer" field is appended to them, or used on its own when the
    street does not end in digits.

    Args:
        data (dict): Data export

    Returns:
        Location: location
    """
    participations = data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"][
        "Beteiligung"
    ]
    address = participations[0]["Beteiligter"]["Organisation"]["Anschrift"]

    street = None
    house_number = None
    if "Strasse" in address:
        raw_street = address["Strasse"]
        trailing_digits = re.findall(r".(\d+)$", raw_street)
        if len(trailing_digits) == 1:
            house_number = trailing_digits[0]
            # Cut the digits off the end of the street
            street = raw_street[: -len(house_number)]
            if "Hausnummer" in address:
                house_number += address["Hausnummer"]
        else:
            street = raw_street
            if "Hausnummer" in address:
                house_number = address["Hausnummer"]

    return Location(
        city=address["Ort"],
        zip_code=address["Postleitzahl"],
        street=normalize_street(street),  # type: ignore
        house_number=house_number,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def name_from_beteiligung(data: dict) -> str:
    """Extract the Company name from an Unternehmensregister export.

    The name is taken from the organisation of the first relationship found and
    passed through `remove_traling_and_leading_quotes`.

    Args:
        data (dict): Data export

    Returns:
        str: Company name
    """
    participations = data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"][
        "Beteiligung"
    ]
    organisation = participations[0]["Beteiligter"]["Organisation"]
    return remove_traling_and_leading_quotes(
        organisation["Bezeichnung"]["Bezeichnung_Aktuell"]
    )
|
||||||
|
|
||||||
|
|
||||||
|
def map_rechtsform(company_name: str, data: dict) -> CompanyTypeEnum | None:
    """Extracts the company type from a given Unternehmensregister export.

    If the export carries no "Rechtsform" entry, the type is guessed from the
    ending of the company name.

    Args:
        company_name (str): Name of the company as a fallback solution
        data (dict): Data export

    Returns:
        CompanyTypeEnum | None: Company type if found
    """
    try:
        return CompanyTypeEnum(
            data["XJustiz_Daten"]["Fachdaten_Register"]["Basisdaten_Register"][
                "Rechtstraeger"
            ]["Rechtsform"]["content"]
        )
    except KeyError:
        # Fallback: name endings checked in order, first hit wins
        endings_to_type = (
            (
                ("GmbH", "UG", "UG (haftungsbeschränkt)"),
                "Gesellschaft mit beschränkter Haftung",
            ),
            (("SE",), "Europäische Aktiengesellschaft (SE)"),
            (("KG",), "Kommanditgesellschaft"),
        )
        for endings, legal_form in endings_to_type:
            if company_name.endswith(endings):
                return CompanyTypeEnum(legal_form)
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def map_capital(data: dict, company_type: CompanyTypeEnum) -> Capital | None:
    """Extracts the company capital from the given Unternehmensregister export.

    Args:
        data (dict): Data export
        company_type (CompanyTypeEnum): Type of company (e.g., 'Gesellschaft mit beschränkter Haftung')

    Returns:
        Capital | None: Company Capital if found
    """
    register = data["XJustiz_Daten"]["Fachdaten_Register"]
    # Early return
    if "Zusatzangaben" not in register:
        return None
    additions = register["Zusatzangaben"]

    capital: dict = {"Zahl": 0.0, "Waehrung": ""}
    if company_type == CompanyTypeEnum.KG:
        capital_type = "Hafteinlage"
        limited_partners = additions["Personengesellschaft"]["Zusatz_KG"][
            "Daten_Kommanditist"
        ]
        if isinstance(limited_partners, list):
            # Sum the contributions of all entries; the currency of the last one wins
            for partner in limited_partners:
                # TODO link to persons using Ref_Rollennummer then extract ["Hafteinlage"] as below
                capital["Zahl"] = capital["Zahl"] + float(
                    partner["Hafteinlage"]["Zahl"]
                )
                capital["Waehrung"] = partner["Hafteinlage"]["Waehrung"]
        elif isinstance(limited_partners, dict):
            capital = limited_partners["Hafteinlage"]
    elif company_type in (
        CompanyTypeEnum.GMBH,
        CompanyTypeEnum.SE,
        CompanyTypeEnum.AG,
        CompanyTypeEnum.KGaA,
        CompanyTypeEnum.AUSLAENDISCHE_RECHTSFORM,
        CompanyTypeEnum.OHG,
    ):
        # Prefer the "Kapitalgesellschaft" section, fall back to "Personengesellschaft"
        if "Kapitalgesellschaft" in additions:
            section = additions["Kapitalgesellschaft"]
        else:
            section = additions["Personengesellschaft"]
        if "Zusatz_GmbH" in section:
            capital_type = "Stammkapital"
            capital = section["Zusatz_GmbH"]["Stammkapital"]
        elif "Zusatz_Aktiengesellschaft" in section:
            capital_type = "Grundkapital"
            capital = section["Zusatz_Aktiengesellschaft"]["Grundkapital"]["Hoehe"]
    elif company_type in (
        CompanyTypeEnum.EINZELKAUFMANN,
        CompanyTypeEnum.EG,
        CompanyTypeEnum.PARTNERSCHAFT,
        CompanyTypeEnum.PARTNERGESELLSCHAFT,
        CompanyTypeEnum.PARTNERSCHAFTSGESELLSCHAFT,
        None,
    ):
        return None

    # Catch entries having the dict but with null values
    if not all(capital.values()):
        return None
    return Capital(  # type: ignore
        value=float(capital["Zahl"]),
        currency=CurrencyEnum(capital["Waehrung"]),
        type=CapitalTypeEnum(capital_type),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def map_business_purpose(data: dict) -> str | None:
|
||||||
|
"""Extracts the "Geschäftszweck" from a given Unternehmensregister export.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
data (dict): Data export
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str | None: Business purpose if found
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
return data["XJustiz_Daten"]["Fachdaten_Register"]["Basisdaten_Register"][
|
||||||
|
"Gegenstand_oder_Geschaeftszweck"
|
||||||
|
]
|
||||||
|
except KeyError:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_date_from_string(value: str) -> str | None:
    """Extract a date in ISO format from the given string if possible.

    Two notations are searched: `D.M.YYYY`, which is converted with
    `transform_date_to_iso`, and `YYYY-M-D`, which is returned as found. A notation
    only counts when it occurs exactly once in the text.

    Args:
        value (str): Input text

    Returns:
        str | None: Date in ISO format, None if not found
    """
    candidates = []

    dotted_dates = re.findall(r"\d{1,2}\.\d{1,2}\.\d{4}", value)
    if len(dotted_dates) == 1:
        candidates.append(transform_date_to_iso(dotted_dates[0]))

    dashed_dates = re.findall(r"\d{4}-\d{1,2}-\d{1,2}", value)
    if len(dashed_dates) == 1:
        candidates.append(dashed_dates[0])

    # Zero hits or one hit per notation are both treated as "no reliable date"
    return candidates[0] if len(candidates) == 1 else None
|
||||||
|
|
||||||
|
|
||||||
|
def map_founding_date(data: dict) -> str | None:
|
||||||
|
"""Extracts the founding date from a given Unternehmensregister export.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
data (dict): Data export
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str | None: Founding date if found
|
||||||
|
"""
|
||||||
|
text = str(data)
|
||||||
|
entry_date = re.findall(
|
||||||
|
r".Tag der ersten Eintragung:(\\n| )?(\d{1,2}\.\d{1,2}\.\d{2,4})", text
|
||||||
|
)
|
||||||
|
if len(entry_date) == 1:
|
||||||
|
return transform_date_to_iso(entry_date[0][1])
|
||||||
|
|
||||||
|
entry_date = re.findall(
|
||||||
|
r".Gesellschaftsvertrag vom (\d{1,2}\.\d{1,2}\.\d{2,4})", text
|
||||||
|
)
|
||||||
|
if len(entry_date) == 1:
|
||||||
|
return transform_date_to_iso(entry_date[0])
|
||||||
|
if (
|
||||||
|
"Gruendungsmetadaten"
|
||||||
|
in data["XJustiz_Daten"]["Fachdaten_Register"]["Basisdaten_Register"]
|
||||||
|
):
|
||||||
|
return extract_date_from_string(
|
||||||
|
data["XJustiz_Daten"]["Fachdaten_Register"]["Basisdaten_Register"][
|
||||||
|
"Gruendungsmetadaten"
|
||||||
|
]["Gruendungsdatum"]
|
||||||
|
)
|
||||||
|
# No reliable answer
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def map_company_id(data: dict) -> CompanyID:
    """Retrieve Company ID from export.

    The register number is read from "Instanzdaten"; the district court is read
    from the second "Beteiligung" entry, which may be an "Organisation" or a
    "Natuerliche_Person".

    Args:
        data (dict): Data export

    Returns:
        CompanyID: ID of the company
    """
    proceedings = data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"]
    hr_number = proceedings["Instanzdaten"]["Aktenzeichen"]

    court = proceedings["Beteiligung"][1]["Beteiligter"]
    if "Organisation" in court:
        court_name = court["Organisation"]["Bezeichnung"]["Bezeichnung_Aktuell"]
        court_city = court["Organisation"]["Sitz"]["Ort"]
    else:
        court_name = court["Natuerliche_Person"]["Voller_Name"]["Nachname"]
        court_city = court["Natuerliche_Person"]["Anschrift"]["Ort"]

    return CompanyID(
        hr_number=hr_number,
        district_court=DistrictCourt(name=court_name, city=court_city),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def map_last_update(data: dict) -> str:
    """Extract last update date from export.

    Args:
        data (dict): Unternehmensregister export

    Returns:
        str: Last update date
    """
    excerpt = data["XJustiz_Daten"]["Fachdaten_Register"]["Auszug"]
    return excerpt["letzte_Eintragung"]
|
||||||
|
|
||||||
|
|
||||||
|
def map_co_relation(data: dict) -> dict:
    """Search for and map the c/o relation from location.street if possible.

    The street is split at commas. A part starting with "c/o" is removed from the
    street, and its remainder, if not empty, is added to the relationships as a
    care-of company located at the company's own address.

    Args:
        data (dict): Company dict

    Returns:
        dict: Modified Company dict
    """
    location = data["location"]
    street = location.street
    if street is None:
        return data

    parts = street.split(",")
    co_company = None
    co_position = None
    for position, part in enumerate(parts):
        hit = re.search(r"^c\/o(.*)$", part.strip())
        if hit is not None:
            # If several parts match, the last one is the one that counts
            co_company = hit.group(1).strip()
            co_position = position

    if co_position is not None:
        parts.pop(co_position)
        street = "".join(parts).strip()
        location.street = street

    if co_company:
        data["relationships"].append(
            CompanyToCompanyRelationship(
                RelationshipRoleEnum.CARE_OF,  # type: ignore
                Location(
                    location.city,
                    street,
                    location.house_number,
                    location.zip_code,
                ),
                CompanyRelationshipEnum.COMPANY,  # type: ignore
                co_company,
            )
        )
    return data
|
||||||
|
|
||||||
|
|
||||||
|
def map_unternehmensregister_json(data: dict) -> Company:
    """Processes the Unternehmensregister structured export to a Company by using several helper methods.

    Args:
        data (dict): Data export

    Returns:
        Company: Transformed data
    """
    # TODO Refactor mapping - this is a nightmare...
    result: dict = {
        "relationships": [],
        "id": map_company_id(data),
        "name": name_from_beteiligung(data),
        "location": loc_from_beteiligung(data),
        "last_update": map_last_update(data),
    }
    result["company_type"] = map_rechtsform(result["name"], data)
    result["capital"] = map_capital(data, result["company_type"])
    result["business_purpose"] = map_business_purpose(data)
    result["founding_date"] = map_founding_date(data)

    # The first two "Beteiligung" entries are read by the helpers above (company
    # and district court); the remaining ones are mapped to relationships.
    participations = data["XJustiz_Daten"]["Grunddaten"]["Verfahrensdaten"][
        "Beteiligung"
    ]
    for participation in participations[2:]:
        result["relationships"].append(parse_stakeholder(participation))

    return Company(**map_co_relation(result))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    from loguru import logger

    # Hard-coded working directory holding the raw exports and the transformed output.
    base_path = "./Jupyter/API-tests/Unternehmensregister/data/Unternehmensregister"
    export_dir = f"{base_path}/export"
    # `glob.glob(..., root_dir=...)` is the documented replacement for the
    # undocumented, deprecated `glob.glob1`; it also returns bare file names.
    for file in tqdm(glob.glob("*.json", root_dir=export_dir)):
        path = os.path.join(export_dir, file)
        with open(path, encoding="utf-8") as file_object:
            try:
                company: Company = map_unternehmensregister_json(
                    json.loads(file_object.read())
                )

                # Output file name: alphanumeric characters of the company name,
                # truncated to 50 characters.
                name = "".join(e for e in company.name if e.isalnum())[:50]

                with open(
                    f"{base_path}/transformed/{name}.json",
                    "w+",
                    encoding="utf-8",
                ) as export_file:
                    json.dump(
                        dataclasses.asdict(company), export_file, ensure_ascii=False
                    )
            except Exception as e:
                # Abort the whole run at the first export that cannot be processed.
                logger.error(e)
                logger.error(f"Error in processing {path}")
                sys.exit(1)
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,34 @@
|
|||||||
|
import os
|
||||||
|
import xmltodict
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from aki_prj23_transparenzregister.models.company import RelationshipRoleEnum
|
||||||
|
|
||||||
|
|
||||||
|
class RoleMapper:
    """Translate role codes into `RelationshipRoleEnum` members.

    The code table is read from the `xjustiz_0040_cl_rollenbezeichnung_3_3.xsd`
    schema file in the `assets` directory next to this module whenever an instance
    is created; `mapper()` hands out one shared instance.
    """

    # Shared instance returned by `mapper()`; created on first use.
    singleton = None

    def __init__(self):
        """Load the code table from the schema file into `self.dictionary`."""
        # TODO Automated file retrieval
        schema_path = os.path.join(
            os.path.dirname(Path(__file__)),
            "assets",
            "xjustiz_0040_cl_rollenbezeichnung_3_3.xsd",
        )
        with open(schema_path, encoding="utf-8") as schema_file:
            schema = xmltodict.parse(schema_file.read())

        enumerations = schema["xs:schema"]["xs:simpleType"]["xs:restriction"][
            "xs:enumeration"
        ]
        # Maps each enumeration's `@value` to its annotated `wert`
        self.dictionary = {
            entry["@value"]: entry["xs:annotation"]["xs:appinfo"]["wert"]
            for entry in enumerations
        }

    @staticmethod
    def mapper():
        """Return the shared `RoleMapper`, creating it on the first call."""
        shared = RoleMapper.singleton
        if shared is not None:
            return shared
        RoleMapper.singleton = RoleMapper()
        return RoleMapper.singleton

    def get(self, key: str) -> RelationshipRoleEnum:
        """Look up the role for a code.

        Args:
            key (str): Role code as found in the schema's enumeration values

        Returns:
            RelationshipRoleEnum: Role built from the value stored for the code
        """
        role_name = self.dictionary[key]
        return RelationshipRoleEnum(role_name)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Manual smoke check: load the code table and print the role stored for "201".
    print(RoleMapper().get("201"))
|
@ -30,28 +30,9 @@ from aki_prj23_transparenzregister.utils.string_tools import (
|
|||||||
transform_date_to_iso,
|
transform_date_to_iso,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v3.role_mapper import (
|
||||||
def transform_xml_to_json(source_dir: str, target_dir: str) -> None:
|
RoleMapper,
|
||||||
"""Convert all xml files in a directory to json files.
|
)
|
||||||
|
|
||||||
Args:
|
|
||||||
source_dir (str): Directory hosting the xml files
|
|
||||||
target_dir (str): Target directory to move json files to
|
|
||||||
"""
|
|
||||||
if not os.path.exists(target_dir):
|
|
||||||
os.makedirs(target_dir)
|
|
||||||
for source_path in [
|
|
||||||
os.path.normpath(i) for i in glob.glob(source_dir + "**/*.xml", recursive=True)
|
|
||||||
]:
|
|
||||||
target_path = os.path.join(
|
|
||||||
target_dir, source_path.split(os.sep)[-1].replace(".xml", ".json")
|
|
||||||
)
|
|
||||||
|
|
||||||
with open(source_path, encoding="utf-8") as source_file:
|
|
||||||
# deepcode ignore HandleUnicode: Weird XML format no other solution
|
|
||||||
data = xmltodict.parse(source_file.read().encode())
|
|
||||||
with open(target_path, "w", encoding="utf-8") as json_file:
|
|
||||||
json_file.write(json.dumps(data))
|
|
||||||
|
|
||||||
|
|
||||||
def parse_date_of_birth(data: dict) -> str | None:
|
def parse_date_of_birth(data: dict) -> str | None:
|
||||||
@ -63,22 +44,20 @@ def parse_date_of_birth(data: dict) -> str | None:
|
|||||||
Returns:
|
Returns:
|
||||||
str | None: date of birth or None if not found
|
str | None: date of birth or None if not found
|
||||||
"""
|
"""
|
||||||
if "tns:geburt" in (base := data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"]):
|
if "tns:geburt" in (
|
||||||
|
base := data["tns:beteiligter"]["tns:auswahl_beteiligter"][
|
||||||
|
"tns:natuerlichePerson"
|
||||||
|
]
|
||||||
|
):
|
||||||
base = base["tns:geburt"]["tns:geburtsdatum"]
|
base = base["tns:geburt"]["tns:geburtsdatum"]
|
||||||
if isinstance(base, str):
|
if isinstance(base, str):
|
||||||
return base
|
return base
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def map_role_id_to_enum(role_id: str) -> RelationshipRoleEnum:
|
def map_role_id_to_enum(role_id: str) -> RelationshipRoleEnum:
|
||||||
match role_id:
|
mapper = RoleMapper.mapper()
|
||||||
case "086":
|
return mapper.get(role_id)
|
||||||
return RelationshipRoleEnum.GESCHAEFTSFUEHRER
|
|
||||||
case "285":
|
|
||||||
return RelationshipRoleEnum.PROKURIST
|
|
||||||
case "194":
|
|
||||||
return RelationshipRoleEnum.VORSTAND
|
|
||||||
case _:
|
|
||||||
raise KeyError(f'Uknown role_id: {role_id}')
|
|
||||||
|
|
||||||
|
|
||||||
def parse_stakeholder(data: dict) -> CompanyRelationship | None:
|
def parse_stakeholder(data: dict) -> CompanyRelationship | None:
|
||||||
@ -92,100 +71,120 @@ def parse_stakeholder(data: dict) -> CompanyRelationship | None:
|
|||||||
"""
|
"""
|
||||||
if "tns:natuerlichePerson" in data["tns:beteiligter"]["tns:auswahl_beteiligter"]:
|
if "tns:natuerlichePerson" in data["tns:beteiligter"]["tns:auswahl_beteiligter"]:
|
||||||
# It's a Company serving as a "Kommanditist" or similar
|
# It's a Company serving as a "Kommanditist" or similar
|
||||||
# if data["Beteiligter"]["Natuerliche_Person"]["Voller_Name"]["Vorname"] is None:
|
if (
|
||||||
# return CompanyToCompanyRelationship(
|
"tns:vorname"
|
||||||
# **{ # type: ignore
|
not in data["tns:beteiligter"]["tns:auswahl_beteiligter"][
|
||||||
# "name": remove_traling_and_leading_quotes(
|
"tns:natuerlichePerson"
|
||||||
# data["Beteiligter"]["Natuerliche_Person"]["Voller_Name"][
|
]["tns:vollerName"]
|
||||||
# "Nachname"
|
):
|
||||||
# ]
|
return CompanyToCompanyRelationship(
|
||||||
# ),
|
**{ # type: ignore
|
||||||
# "location": Location(
|
"name": remove_traling_and_leading_quotes(
|
||||||
# **{
|
data["tns:beteiligter"]["tns:auswahl_beteiligter"][
|
||||||
# "city": data["Beteiligter"]["Natuerliche_Person"][
|
"tns:natuerlichePerson"
|
||||||
# "Anschrift"
|
]["tns:vollerName"]["tns:nachname"]
|
||||||
# ][-1]["Ort"]
|
),
|
||||||
# if isinstance(
|
"location": Location(
|
||||||
# data["Beteiligter"]["Natuerliche_Person"]["Anschrift"],
|
**{
|
||||||
# list,
|
"city": data["tns:beteiligter"]["tns:auswahl_beteiligter"][
|
||||||
# )
|
"tns:natuerlichePerson"
|
||||||
# else data["Beteiligter"]["Natuerliche_Person"]["Anschrift"][
|
]["tns:anschrift"][-1]["tns:ort"]
|
||||||
# "Ort"
|
if isinstance(
|
||||||
# ]
|
data["tns:beteiligter"]["tns:auswahl_beteiligter"][
|
||||||
# }
|
"tns:natuerlichePerson"
|
||||||
# ),
|
]["tns:anschrift"],
|
||||||
# "role": RelationshipRoleEnum(
|
list,
|
||||||
# data["Rolle"]["Rollenbezeichnung"]["content"]
|
)
|
||||||
# ),
|
else data["tns:beteiligter"]["tns:auswahl_beteiligter"][
|
||||||
# "type": CompanyRelationshipEnum.COMPANY,
|
"tns:natuerlichePerson"
|
||||||
# }
|
]["tns:anschrift"]["tns:ort"]
|
||||||
# )
|
}
|
||||||
|
),
|
||||||
|
"role": map_role_id_to_enum(
|
||||||
|
data["tns:rolle"]["tns:rollenbezeichnung"]["code"]
|
||||||
|
),
|
||||||
|
"type": CompanyRelationshipEnum.COMPANY,
|
||||||
|
}
|
||||||
|
)
|
||||||
return PersonToCompanyRelationship(
|
return PersonToCompanyRelationship(
|
||||||
**{ # type: ignore
|
**{ # type: ignore
|
||||||
"name": PersonName(
|
"name": PersonName(
|
||||||
**{
|
**{
|
||||||
"firstname": data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"][
|
"firstname": data["tns:beteiligter"]["tns:auswahl_beteiligter"][
|
||||||
"tns:vollerName"
|
"tns:natuerlichePerson"
|
||||||
]["tns:vorname"],
|
]["tns:vollerName"]["tns:vorname"],
|
||||||
"lastname": data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"][
|
"lastname": data["tns:beteiligter"]["tns:auswahl_beteiligter"][
|
||||||
"tns:vollerName"
|
"tns:natuerlichePerson"
|
||||||
]["tns:nachname"],
|
]["tns:vollerName"]["tns:nachname"],
|
||||||
}
|
}
|
||||||
),
|
),
|
||||||
"date_of_birth": parse_date_of_birth(data),
|
"date_of_birth": parse_date_of_birth(data),
|
||||||
"location": Location(
|
"location": Location(
|
||||||
**{
|
**{
|
||||||
"city": data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"]["tns:anschrift"][
|
"city": data["tns:beteiligter"]["tns:auswahl_beteiligter"][
|
||||||
-1
|
"tns:natuerlichePerson"
|
||||||
]["tns:ort"]
|
]["tns:anschrift"][-1]["tns:ort"]
|
||||||
if isinstance(
|
if isinstance(
|
||||||
data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"]["tns:anschrift"], list
|
data["tns:beteiligter"]["tns:auswahl_beteiligter"][
|
||||||
|
"tns:natuerlichePerson"
|
||||||
|
]["tns:anschrift"],
|
||||||
|
list,
|
||||||
)
|
)
|
||||||
else data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:natuerlichePerson"]["tns:anschrift"][
|
else data["tns:beteiligter"]["tns:auswahl_beteiligter"][
|
||||||
"tns:ort"
|
"tns:natuerlichePerson"
|
||||||
]
|
]["tns:anschrift"]["tns:ort"]
|
||||||
}
|
}
|
||||||
),
|
),
|
||||||
# TODO get role via ID
|
|
||||||
"role": map_role_id_to_enum(
|
"role": map_role_id_to_enum(
|
||||||
data["tns:rolle"]["tns:rollenbezeichnung"]["code"]
|
data["tns:rolle"]["tns:rollenbezeichnung"]["code"]
|
||||||
),
|
),
|
||||||
"type": CompanyRelationshipEnum.PERSON,
|
"type": CompanyRelationshipEnum.PERSON,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
if "Organisation" in data["Beteiligter"]:
|
if "tns:organisation" in data["tns:beteiligter"]["tns:auswahl_beteiligter"]:
|
||||||
|
base = data["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:organisation"]
|
||||||
|
|
||||||
|
location = None
|
||||||
|
if "tns:anschrift" in base:
|
||||||
|
location = Location(
|
||||||
|
**{
|
||||||
|
"city": base["tns:anschrift"]["tns:ort"],
|
||||||
|
"street": base["tns:anschrift"]["tns:strasse"]
|
||||||
|
if "tns:strasse" in base["tns:anschrift"]
|
||||||
|
else None,
|
||||||
|
"house_number": base["tns:anschrift"]["tns:hausnummer"]
|
||||||
|
if "tns:hausnummer" in base["tns:anschrift"]
|
||||||
|
else None,
|
||||||
|
"zip_code": base["tns:anschrift"]["tns:postleitzahl"]
|
||||||
|
if "tns:potsleitzahl" in base["tns:anschrift"]
|
||||||
|
else None,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
location = Location(
|
||||||
|
**{
|
||||||
|
"city": base["tns:sitz"]["tns:ort"],
|
||||||
|
"street": base["tns:sitz"]["tns:strasse"]
|
||||||
|
if "tns:strasse" in base["tns:sitz"]
|
||||||
|
else None,
|
||||||
|
"house_number": base["tns:sitz"]["tns:hausnummer"]
|
||||||
|
if "tns:hausnummer" in base["tns:sitz"]
|
||||||
|
else None,
|
||||||
|
"zip_code": base["tns:sitz"]["tns:postleitzahl"]
|
||||||
|
if "tns:potsleitzahl" in base["tns:sitz"]
|
||||||
|
else None,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
return CompanyToCompanyRelationship(
|
return CompanyToCompanyRelationship(
|
||||||
**{ # type: ignore
|
**{ # type: ignore
|
||||||
"role": RelationshipRoleEnum(
|
"role": map_role_id_to_enum(
|
||||||
data["Rolle"]["Rollenbezeichnung"]["content"]
|
data["tns:rolle"]["tns:rollenbezeichnung"]["code"]
|
||||||
),
|
),
|
||||||
"name": remove_traling_and_leading_quotes(
|
"name": remove_traling_and_leading_quotes(
|
||||||
data["Beteiligter"]["Organisation"]["Bezeichnung"][
|
base["tns:bezeichnung"]["tns:bezeichnung.aktuell"]
|
||||||
"Bezeichnung_Aktuell"
|
|
||||||
]
|
|
||||||
),
|
|
||||||
"location": Location(
|
|
||||||
**{
|
|
||||||
"city": data["Beteiligter"]["Organisation"]["Anschrift"]["Ort"],
|
|
||||||
"street": data["Beteiligter"]["Organisation"]["Anschrift"][
|
|
||||||
"Strasse"
|
|
||||||
]
|
|
||||||
if "Strasse" in data["Beteiligter"]["Organisation"]["Anschrift"]
|
|
||||||
else None,
|
|
||||||
"house_number": data["Beteiligter"]["Organisation"][
|
|
||||||
"Anschrift"
|
|
||||||
]["Hausnummer"]
|
|
||||||
if "Hausnummer"
|
|
||||||
in data["Beteiligter"]["Organisation"]["Anschrift"]
|
|
||||||
else None,
|
|
||||||
"zip_code": data["Beteiligter"]["Organisation"]["Anschrift"][
|
|
||||||
"Postleitzahl"
|
|
||||||
]
|
|
||||||
if "Postleitzahl"
|
|
||||||
in data["Beteiligter"]["Organisation"]["Anschrift"]
|
|
||||||
else None,
|
|
||||||
}
|
|
||||||
),
|
),
|
||||||
|
"location": location,
|
||||||
"type": CompanyRelationshipEnum.COMPANY,
|
"type": CompanyRelationshipEnum.COMPANY,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
@ -227,10 +226,16 @@ def loc_from_beteiligung(data: dict) -> Location:
|
|||||||
"tns:beteiligter",
|
"tns:beteiligter",
|
||||||
"tns:auswahl_beteiligter",
|
"tns:auswahl_beteiligter",
|
||||||
"tns:organisation",
|
"tns:organisation",
|
||||||
"tns:anschrift"
|
# "tns:anschrift",
|
||||||
]
|
]
|
||||||
base = traversal(data, base_path)
|
base = traversal(data, base_path)
|
||||||
|
if "tns:anschrift" in base:
|
||||||
|
base = base["tns:anschrift"]
|
||||||
|
else:
|
||||||
|
base = base["tns:sitz"]
|
||||||
|
|
||||||
|
if isinstance(base, list):
|
||||||
|
base = base[0]
|
||||||
house_number = None
|
house_number = None
|
||||||
street = None
|
street = None
|
||||||
if "tns:strasse" in base:
|
if "tns:strasse" in base:
|
||||||
@ -273,7 +278,7 @@ def name_from_beteiligung(data: dict) -> str:
|
|||||||
"tns:auswahl_beteiligter",
|
"tns:auswahl_beteiligter",
|
||||||
"tns:organisation",
|
"tns:organisation",
|
||||||
"tns:bezeichnung",
|
"tns:bezeichnung",
|
||||||
"tns:bezeichnung.aktuell"
|
"tns:bezeichnung.aktuell",
|
||||||
]
|
]
|
||||||
name = traversal(data, path)
|
name = traversal(data, path)
|
||||||
return remove_traling_and_leading_quotes(name)
|
return remove_traling_and_leading_quotes(name)
|
||||||
@ -296,11 +301,9 @@ def map_rechtsform(company_name: str, data: dict) -> CompanyTypeEnum | None:
|
|||||||
"tns:rechtstraeger",
|
"tns:rechtstraeger",
|
||||||
"tns:angabenZurRechtsform",
|
"tns:angabenZurRechtsform",
|
||||||
"tns:rechtsform",
|
"tns:rechtsform",
|
||||||
"code"
|
"code",
|
||||||
]
|
]
|
||||||
return CompanyTypeEnum(
|
return CompanyTypeEnum(traversal(data, path))
|
||||||
traversal(data, path)
|
|
||||||
)
|
|
||||||
except Exception:
|
except Exception:
|
||||||
if (
|
if (
|
||||||
company_name.endswith("GmbH")
|
company_name.endswith("GmbH")
|
||||||
@ -328,8 +331,8 @@ def map_capital(data: dict, company_type: CompanyTypeEnum) -> Capital | None:
|
|||||||
# Early return
|
# Early return
|
||||||
if "tns:auswahl_zusatzangaben" not in data["tns:fachdatenRegister"]:
|
if "tns:auswahl_zusatzangaben" not in data["tns:fachdatenRegister"]:
|
||||||
return None
|
return None
|
||||||
capital: dict = {"Zahl": 0.0, "Waehrung": ""}
|
capital: dict = {"tns:zahl": 0.0, "tns:waehrung": {"code": None}}
|
||||||
if company_type == CompanyTypeEnum.KG:
|
if company_type == CompanyTypeEnum.KG and "tns:personengesellschaft" in data["tns:fachdatenRegister"]["tns:auswahl_zusatzangaben"]:
|
||||||
capital_type = "Hafteinlage"
|
capital_type = "Hafteinlage"
|
||||||
base = data["tns:fachdatenRegister"]["tns:auswahl_zusatzangaben"][
|
base = data["tns:fachdatenRegister"]["tns:auswahl_zusatzangaben"][
|
||||||
"tns:personengesellschaft"
|
"tns:personengesellschaft"
|
||||||
@ -337,10 +340,14 @@ def map_capital(data: dict, company_type: CompanyTypeEnum) -> Capital | None:
|
|||||||
if isinstance(base, list):
|
if isinstance(base, list):
|
||||||
for entry in base:
|
for entry in base:
|
||||||
# TODO link to persons using Ref_Rollennummer then extract ["Hafteinlage"] as below
|
# TODO link to persons using Ref_Rollennummer then extract ["Hafteinlage"] as below
|
||||||
capital["Zahl"] = capital["Zahl"] + float(entry["Hafteinlage"]["Zahl"])
|
capital["tns:zahl"] = capital["tns:zahl"] + float(
|
||||||
capital["Waehrung"] = entry["Hafteinlage"]["Waehrung"]
|
entry["tns:hafteinlage"]["tns:zahl"]
|
||||||
|
)
|
||||||
|
capital["tns:waehrung"]["code"] = entry["tns:hafteinlage"][
|
||||||
|
"tns:waehrung"
|
||||||
|
]["code"]
|
||||||
elif isinstance(base, dict):
|
elif isinstance(base, dict):
|
||||||
capital = base["Hafteinlage"]
|
capital = base["tns:hafteinlage"]
|
||||||
elif company_type in [
|
elif company_type in [
|
||||||
CompanyTypeEnum.GMBH,
|
CompanyTypeEnum.GMBH,
|
||||||
CompanyTypeEnum.SE,
|
CompanyTypeEnum.SE,
|
||||||
@ -365,7 +372,9 @@ def map_capital(data: dict, company_type: CompanyTypeEnum) -> Capital | None:
|
|||||||
capital = base["tns:zusatzGmbH"]["tns:stammkapital"]
|
capital = base["tns:zusatzGmbH"]["tns:stammkapital"]
|
||||||
elif "tns:zusatzAktiengesellschaft" in base:
|
elif "tns:zusatzAktiengesellschaft" in base:
|
||||||
capital_type = "Grundkapital"
|
capital_type = "Grundkapital"
|
||||||
capital = base["tns:zusatzAktiengesellschaft"]["tns:grundkapital"]["tns:hoehe"]
|
capital = base["tns:zusatzAktiengesellschaft"]["tns:grundkapital"][
|
||||||
|
"tns:hoehe"
|
||||||
|
]
|
||||||
elif company_type in [
|
elif company_type in [
|
||||||
CompanyTypeEnum.EINZELKAUFMANN,
|
CompanyTypeEnum.EINZELKAUFMANN,
|
||||||
CompanyTypeEnum.EG,
|
CompanyTypeEnum.EG,
|
||||||
@ -397,11 +406,7 @@ def map_business_purpose(data: dict) -> str | None:
|
|||||||
str | None: Business purpose if found
|
str | None: Business purpose if found
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
path = [
|
path = ["tns:fachdatenRegister", "tns:basisdatenRegister", "tns:gegenstand"]
|
||||||
"tns:fachdatenRegister",
|
|
||||||
"tns:basisdatenRegister",
|
|
||||||
"tns:gegenstand"
|
|
||||||
]
|
|
||||||
return traversal(data, path)
|
return traversal(data, path)
|
||||||
except KeyError:
|
except KeyError:
|
||||||
return None
|
return None
|
||||||
@ -455,20 +460,18 @@ def map_founding_date(data: dict) -> str | None:
|
|||||||
)
|
)
|
||||||
if len(entry_date) == 1:
|
if len(entry_date) == 1:
|
||||||
return transform_date_to_iso(entry_date[0])
|
return transform_date_to_iso(entry_date[0])
|
||||||
if (
|
if "tns:satzungsdatum" in data["tns:fachdatenRegister"]["tns:basisdatenRegister"]:
|
||||||
"tns:satzungsdatum"
|
|
||||||
in data["tns:fachdatenRegister"]["tns:basisdatenRegister"]
|
|
||||||
):
|
|
||||||
path = [
|
path = [
|
||||||
"tns:fachdatenRegister",
|
"tns:fachdatenRegister",
|
||||||
"tns:basisdatenRegister",
|
"tns:basisdatenRegister",
|
||||||
"tns:satzungsdatum",
|
"tns:satzungsdatum",
|
||||||
"tns:aktuellesSatzungsdatum"
|
"tns:aktuellesSatzungsdatum",
|
||||||
]
|
]
|
||||||
return traversal(data, path)
|
return traversal(data, path)
|
||||||
# No reliable answer
|
# No reliable answer
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def traversal(data: dict, path: list[str | int]) -> any:
|
def traversal(data: dict, path: list[str | int]) -> any:
|
||||||
current = data
|
current = data
|
||||||
for key in path:
|
for key in path:
|
||||||
@ -484,15 +487,14 @@ def map_hr_number(data: dict) -> str:
|
|||||||
"tns:aktenzeichen"
|
"tns:aktenzeichen"
|
||||||
]["tns:auswahl_aktenzeichen"]
|
]["tns:auswahl_aktenzeichen"]
|
||||||
if "tns:aktenzeichen.strukturiert" in base:
|
if "tns:aktenzeichen.strukturiert" in base:
|
||||||
hr_prefix = base["tns:aktenzeichen.strukturiert"]["tns:register"][
|
hr_prefix = base["tns:aktenzeichen.strukturiert"]["tns:register"]["code"]
|
||||||
"code"
|
|
||||||
]
|
|
||||||
hr_number = base["tns:aktenzeichen.strukturiert"]["tns:laufendeNummer"]
|
hr_number = base["tns:aktenzeichen.strukturiert"]["tns:laufendeNummer"]
|
||||||
return f"{hr_prefix} {hr_number}"
|
return f"{hr_prefix} {hr_number}"
|
||||||
elif "tns:aktenzeichen.freitext" in base:
|
elif "tns:aktenzeichen.freitext" in base:
|
||||||
return base["tns:aktenzeichen.freitext"]
|
return base["tns:aktenzeichen.freitext"]
|
||||||
return hr_full
|
return hr_full
|
||||||
|
|
||||||
|
|
||||||
def map_district_court(data: dict) -> DistrictCourt:
|
def map_district_court(data: dict) -> DistrictCourt:
|
||||||
base_path = [
|
base_path = [
|
||||||
"tns:grunddaten",
|
"tns:grunddaten",
|
||||||
@ -501,17 +503,11 @@ def map_district_court(data: dict) -> DistrictCourt:
|
|||||||
1,
|
1,
|
||||||
"tns:beteiligter",
|
"tns:beteiligter",
|
||||||
"tns:auswahl_beteiligter",
|
"tns:auswahl_beteiligter",
|
||||||
"tns:organisation"
|
"tns:organisation",
|
||||||
]
|
|
||||||
path = [*base_path,
|
|
||||||
"tns:bezeichnung",
|
|
||||||
"tns:bezeichnung.aktuell"
|
|
||||||
]
|
]
|
||||||
|
path = [*base_path, "tns:bezeichnung", "tns:bezeichnung.aktuell"]
|
||||||
name = traversal(data, path)
|
name = traversal(data, path)
|
||||||
path = [*base_path,
|
path = [*base_path, "tns:anschrift", "tns:ort"]
|
||||||
"tns:anschrift",
|
|
||||||
"tns:ort"
|
|
||||||
]
|
|
||||||
city = traversal(data, path)
|
city = traversal(data, path)
|
||||||
return DistrictCourt(name=name, city=city)
|
return DistrictCourt(name=name, city=city)
|
||||||
|
|
||||||
@ -525,12 +521,14 @@ def map_company_id(data: dict) -> CompanyID:
|
|||||||
Returns:
|
Returns:
|
||||||
CompanyID: ID of the company
|
CompanyID: ID of the company
|
||||||
"""
|
"""
|
||||||
return CompanyID(
|
try:
|
||||||
**{
|
return CompanyID(
|
||||||
"hr_number": map_hr_number(data),
|
**{"hr_number": map_hr_number(data), "district_court": map_district_court(data)}
|
||||||
"district_court": map_district_court(data)
|
)
|
||||||
}
|
except KeyError:
|
||||||
)
|
hr_number = data["tns:grunddaten"]["tns:verfahrensdaten"]["tns:beteiligung"][0]["tns:beteiligter"]["tns:auswahl_beteiligter"]["tns:organisation"]["tns:registereintragung"]["tns:registernummer"]
|
||||||
|
district_court = map_district_court(data)
|
||||||
|
return CompanyID(hr_number=hr_number, district_court=district_court)
|
||||||
|
|
||||||
|
|
||||||
def map_last_update(data: dict) -> str:
|
def map_last_update(data: dict) -> str:
|
||||||
@ -542,11 +540,7 @@ def map_last_update(data: dict) -> str:
|
|||||||
Returns:
|
Returns:
|
||||||
str: Last update date
|
str: Last update date
|
||||||
"""
|
"""
|
||||||
path = [
|
path = ["tns:fachdatenRegister", "tns:auszug", "tns:letzteEintragung"]
|
||||||
"tns:fachdatenRegister",
|
|
||||||
"tns:auszug",
|
|
||||||
"tns:letzteEintragung"
|
|
||||||
]
|
|
||||||
return traversal(data, path)
|
return traversal(data, path)
|
||||||
|
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user