diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py b/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py index 21bce5b..ec39c3d 100644 --- a/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py +++ b/src/aki_prj23_transparenzregister/utils/data_extraction/bundesanzeiger.py @@ -200,7 +200,9 @@ class Bundesanzeiger: soup = BeautifulSoup(report, features="html.parser") for table in soup.find_all("table", {"class": "std_table"}): try: - results = pd.read_html(StringIO(str(table)), flavor="bs4") + results = pd.read_html( + StringIO(str(table)), flavor="bs4", thousands=".", decimal="," + ) if len(results) > 0: data_frame = results[0] result.append(data_frame) @@ -229,10 +231,10 @@ class Bundesanzeiger: return None def parse_string_to_float(value: str | float) -> float | None: + if value is None: + return None try: - if value is None: - return None - return float(str(value).replace(".", "").replace(",", ".")) + return float(value) except Exception: return None diff --git a/tests/utils/data_extraction/bundesanzeiger_test.py b/tests/utils/data_extraction/bundesanzeiger_test.py index 73bbbd9..5e2ef33 100644 --- a/tests/utils/data_extraction/bundesanzeiger_test.py +++ b/tests/utils/data_extraction/bundesanzeiger_test.py @@ -136,3 +136,75 @@ def test_get_information_no_results(mock_bundesanzeiger: Mock) -> None: ba = Bundesanzeiger() result = ba.get_information("PRJ 23 Transparenzregister GmbH", "Iserlohn") assert len(result) == 0 + + +def test_extract_tables_from_reports() -> None: + report = """ + +
+
+ Möge die Macht mir dir sein + + + + + + + + + +
Column AColumn B
424711
+
+ """ + ba = Bundesanzeiger() + result = ba.__extract_tables_from_report__(report) + assert len(result) == 1 + + +def test_parse_tables_to_kpis() -> None: + report = """ + +
+
+ Möge die Macht mir dir sein + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Position2023 in T€1997 in €
a) Umlaufvermögen12,134711
+EBIT11234711
To be ignoredI've tried so hard and got so far, but in the end it doesn't even matter4711
Gewinn4711
Jahresüberschuss4.130,124711
+
+ """ + ba = Bundesanzeiger() + result = ba.parse_tables_to_kpis(report) + assert result == { + "Umlaufvermögen": 12130.0, + "EBIT": 1123000.0, + "Jahresüberschuss": 4130120.0, + }