fix(data-extraction): Handle differing Bundesanzeiger report name formats

This commit is contained in:
TrisNol 2023-09-25 18:37:39 +02:00
parent 5bbdf046d2
commit 2050b49fde
2 changed files with 10 additions and 4 deletions

View File

@@ -62,9 +62,11 @@ class Bundesanzeiger:
pd.DataFrame: Filtered and pruned DataFrame
"""
df_reports["type"] = df_reports.name.apply(lambda name: name.split(" ")[0])
df_reports = df_reports.loc[df_reports.type == "Jahresabschluss"]
df_reports = df_reports.loc[
(df_reports.type == "Jahresabschluss") | (df_reports.type == "Jahres-")
]
df_reports["jahr"] = df_reports.name.apply(
lambda name: name.split(" ")[-1].split(".")[-1]
lambda name: re.findall(r"\d{2}\.\d{2}.\d{4}", name)[0].split(".")[-1]
)
return df_reports.drop(["name", "report", "type"], axis=1)

View File

@@ -70,7 +70,11 @@ def test_extracct_financial_results() -> None:
def test_filter_reports() -> None:
test_data = [
{"name": "Bedienungsanleitung", "report": "", "raw_report": ""},
{"name": "Jahresabschluss 1998", "report": "", "raw_report": ""},
{
"name": "Jahresabschluss vom 01.01.1998 bis zum 31.12.1998",
"report": "",
"raw_report": "",
},
]
test_df = pd.DataFrame(test_data)
ba = Bundesanzeiger()
@@ -91,7 +95,7 @@ def test_get_information(mock_bundesanzeiger: Mock) -> None:
"raw_report": "",
},
"2": {
"name": "Jahresabschluss 1998",
"name": "Jahresabschluss 01.01.1998",
"report": "",
"company": "PRJ 23 Transparenzregister GmbH",
"raw_report": "",