fix(data-extraction): Parse date from Gesellschaftsvertrag entry (#221)

This commit is contained in:
Tristan Nolde 2023-10-15 13:06:04 +02:00 committed by GitHub
commit 7e54ab98c5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 57 additions and 4 deletions

View File

@ -367,6 +367,33 @@ def map_business_purpose(data: dict) -> str | None:
return None
def extract_date_from_string(value: str) -> str | None:
"""Extract a date in ISO format from the given string if possible.
Args:
value (str): Input text
Returns:
str | None: Date in ISO format, None if not found
"""
date_regex = [ # type: ignore
{"regex": r"\d{1,2}\.\d{1,2}\.\d{4}", "mapper": transform_date_to_iso},
{"regex": r"\d{4}-\d{1,2}-\d{1,2}", "mapper": None},
]
results = []
for regex in date_regex:
result = re.findall(regex["regex"], value) # type: ignore
if len(result) == 1:
relevant_data = result[0]
if regex["mapper"] is not None: # type: ignore
results.append(regex["mapper"](relevant_data)) # type: ignore
else:
results.append(relevant_data)
if len(results) != 1:
return None
return results[0]
def map_founding_date(data: dict) -> str | None:
"""Extracts the founding date from a given Unternehmensregister export.
@ -392,9 +419,11 @@ def map_founding_date(data: dict) -> str | None:
"Gruendungsmetadaten"
in data["XJustiz_Daten"]["Fachdaten_Register"]["Basisdaten_Register"]
):
return data["XJustiz_Daten"]["Fachdaten_Register"]["Basisdaten_Register"][
"Gruendungsmetadaten"
]["Gruendungsdatum"]
return extract_date_from_string(
data["XJustiz_Daten"]["Fachdaten_Register"]["Basisdaten_Register"][
"Gruendungsmetadaten"
]["Gruendungsdatum"]
)
# No reliable answer
return None

View File

@ -601,6 +601,28 @@ def test_map_business_purpose_no_result() -> None:
assert result is None
@pytest.mark.parametrize(
    ("value", "expected_result"),
    [
        ("", None),
        ("Tag der ersten Eintragung: 01.05.2004", "2004-05-01"),
        ("Tag der ersten Eintragung: 1.05.2004", "2004-05-01"),
        ("Tag der ersten Eintragung: 1.5.2004", "2004-05-01"),
        ("Tag der ersten Eintragung: 01.5.2004", "2004-05-01"),
        ("Gesellschaftsvertrag vom 06.04.2016 Hallo Welt", "2016-04-06"),
        ("Str. des Tests vom 1999-04-05", "1999-04-05"),
        ("Once upon a midnight dreary while I pondered weak and weary...", None),
        (
            "This company was first founded in 2016-06-10 and then again on 1.5.2004",
            None,
        ),
    ],
)
def test_extract_date_from_string(value: str, expected_result: str | None) -> None:
    """Check date extraction for each input text.

    The cases cover German dates with and without zero padding, an ISO date,
    text without any date, and text with two dates (expected to yield None).
    """
    assert transform.extract_date_from_string(value) == expected_result
def test_map_founding_date_from_tag_der_ersten_eintragung() -> None:
data = {
"some entry": "Tag der ersten Eintragung: 01.05.2004",
@ -626,7 +648,9 @@ def test_map_founding_date_from_gruendungsdatum() -> None:
"XJustiz_Daten": {
"Fachdaten_Register": {
"Basisdaten_Register": {
"Gruendungsmetadaten": {"Gruendungsdatum": "1998-01-01"}
"Gruendungsmetadaten": {
"Gruendungsdatum": "Gesellschaftsvertrag vom 1998-01-01"
}
}
}
}