diff --git a/src/aki_prj23_transparenzregister/apps/find_missing_companies.py b/src/aki_prj23_transparenzregister/apps/find_missing_companies.py
new file mode 100644
index 0000000..513b256
--- /dev/null
+++ b/src/aki_prj23_transparenzregister/apps/find_missing_companies.py
@@ -0,0 +1,90 @@
+import os
+import sys
+import json
+import glob
+import argparse
+import dataclasses
+import tempfile
+import pandas as pd
+from tqdm import tqdm
+from pathlib import Path
+from loguru import logger
+from aki_prj23_transparenzregister.config.config_providers import (
+    HELP_TEXT_CONFIG,
+    get_config_provider,
+)
+from aki_prj23_transparenzregister.models.company import Company  # assumed location of the Company model
+from aki_prj23_transparenzregister.utils.logger_config import (
+    add_logger_options_to_argparse,
+    configer_logger,
+)
+
+from aki_prj23_transparenzregister.utils.sql import connector
+from aki_prj23_transparenzregister.utils.sql import entities
+
+from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister import (
+    extract,
+    load,
+    transform,
+)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        prog="find_missing_companies",
+        description="Scrapes companies marked as missing from the Unternehmensregister and transforms them.",
+        epilog="Example: find_missing_companies ENV --log-level ERROR --log-path print.log",
+    )
+    parser.add_argument(
+        "config",
+        metavar="config",
+        default="ENV",
+        help=HELP_TEXT_CONFIG,
+    )
+    add_logger_options_to_argparse(parser)
+
+    parsed = parser.parse_args(sys.argv[1:])
+    configer_logger(namespace=parsed)
+    config = parsed.config
+    session = connector.get_session(get_config_provider(config))
+    missing_companies = session.query(entities.MissingCompany).all()
+
+    counter = 0
+    # Scrape data from the Unternehmensregister (currently capped at 5 companies)
+    for company in missing_companies:
+        print(company.name)
+        extract.scrape(company.name, ["tmp", "xml"])
+        counter = counter + 1
+        if counter == 5:
+            break
+    # Transform the downloaded XML exports into JSON
+    output_path = os.path.join(str(Path.cwd()), *["tmp", "transformed"])
+    os.makedirs(output_path, exist_ok=True)  # ensure the export directory exists
+    xml_dir = os.path.join(str(Path.cwd()), *["tmp", "xml"])
+    json_dir = os.path.join(str(Path.cwd()), *["tmp", "json"])
+    transform.transform_xml_to_json(
+        os.path.join(xml_dir),
+        os.path.join(json_dir),
+    )
+    # Map each JSON export onto the Company model and write it to the output directory
+    for file in tqdm(glob.glob1(json_dir, "*.json")):
+        path = os.path.join(json_dir, file)
+        with open(path, encoding="utf-8") as file_object:
+            try:
+                company: Company = transform.map_unternehmensregister_json(
+                    json.loads(file_object.read())
+                )
+
+                name = "".join(e for e in company.name if e.isalnum())[:50]
+
+                with open(
+                    f"{output_path}/{name}.json",
+                    "w+",
+                    encoding="utf-8",
+                ) as export_file:
+                    json.dump(
+                        dataclasses.asdict(company), export_file, ensure_ascii=False
+                    )
+            except Exception as e:
+                logger.error(e)
+                logger.error(f"Error in processing {path}")
+                sys.exit(1)
\ No newline at end of file
diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/extract.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/extract.py
index c37b260..efff716 100644
--- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/extract.py
+++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/extract.py
@@ -21,6 +21,7 @@ def scrape(query: str, download_dir: list[str]) -> None:
         download_dir (list[str]): Directory to place output files in
     """
     download_path = os.path.join(str(Path.cwd()), *download_dir)
+    print(download_path)
     options = webdriver.ChromeOptions()
     preferences = {
         "profile.default_content_settings.popups": 0,
@@ -32,7 +33,7 @@ def scrape(query: str, download_dir: list[str]) -> None:
             "default_directory": download_path,
         },
     }
-    options.add_argument("--headless=new")
+    # options.add_argument("--headless=new")
     options.add_experimental_option("prefs", preferences)
     driver = webdriver.Chrome(options=options)
 
diff --git a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py
index 82a8028..eb2fd97 100644
--- a/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py
+++ b/src/aki_prj23_transparenzregister/utils/data_extraction/unternehmensregister/transform.py
@@ -38,6 +38,8 @@ def transform_xml_to_json(source_dir: str, target_dir: str) -> None:
         source_dir (str): Directory hosting the xml files
        target_dir (str): Target directory to move json files to
     """
+    if not os.path.exists(target_dir):
+        os.makedirs(target_dir)
     for source_path in [
         os.path.normpath(i) for i in glob.glob(source_dir + "**/*.xml", recursive=True)
     ]: