fix(data-extraction): Handle differing Bundesanzeiger report name formats when filtering reports

This commit is contained in:
TrisNol 2023-09-25 18:37:39 +02:00
parent 5bbdf046d2
commit 2050b49fde
2 changed files with 10 additions and 4 deletions

View File

@ -62,9 +62,11 @@ class Bundesanzeiger:
pd.DataFrame: Filtered and pruned DataFrame pd.DataFrame: Filtered and pruned DataFrame
""" """
df_reports["type"] = df_reports.name.apply(lambda name: name.split(" ")[0]) df_reports["type"] = df_reports.name.apply(lambda name: name.split(" ")[0])
df_reports = df_reports.loc[df_reports.type == "Jahresabschluss"] df_reports = df_reports.loc[
(df_reports.type == "Jahresabschluss") | (df_reports.type == "Jahres-")
]
df_reports["jahr"] = df_reports.name.apply( df_reports["jahr"] = df_reports.name.apply(
lambda name: name.split(" ")[-1].split(".")[-1] lambda name: re.findall(r"\d{2}\.\d{2}.\d{4}", name)[0].split(".")[-1]
) )
return df_reports.drop(["name", "report", "type"], axis=1) return df_reports.drop(["name", "report", "type"], axis=1)

View File

@ -70,7 +70,11 @@ def test_extracct_financial_results() -> None:
def test_filter_reports() -> None: def test_filter_reports() -> None:
test_data = [ test_data = [
{"name": "Bedienungsanleitung", "report": "", "raw_report": ""}, {"name": "Bedienungsanleitung", "report": "", "raw_report": ""},
{"name": "Jahresabschluss 1998", "report": "", "raw_report": ""}, {
"name": "Jahresabschluss vom 01.01.1998 bis zum 31.12.1998",
"report": "",
"raw_report": "",
},
] ]
test_df = pd.DataFrame(test_data) test_df = pd.DataFrame(test_data)
ba = Bundesanzeiger() ba = Bundesanzeiger()
@ -91,7 +95,7 @@ def test_get_information(mock_bundesanzeiger: Mock) -> None:
"raw_report": "", "raw_report": "",
}, },
"2": { "2": {
"name": "Jahresabschluss 1998", "name": "Jahresabschluss 01.01.1998",
"report": "", "report": "",
"company": "PRJ 23 Transparenzregister GmbH", "company": "PRJ 23 Transparenzregister GmbH",
"raw_report": "", "raw_report": "",