checkpoint: Core data fetch routine

This commit is contained in:
TrisNol 2023-10-29 12:53:27 +01:00
parent 2fddc9149a
commit 7f8511c9d6
3 changed files with 89 additions and 1 deletions

View File

@ -0,0 +1,85 @@
import argparse
import dataclasses
import glob
import json
import os
import sys
import tempfile
from pathlib import Path

import pandas as pd
from loguru import logger
from tqdm import tqdm

from aki_prj23_transparenzregister.config.config_providers import (
    HELP_TEXT_CONFIG,
    get_config_provider,
)
from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister import (
    extract,
    load,
    transform,
)
from aki_prj23_transparenzregister.utils.logger_config import (
    add_logger_options_to_argparse,
    configer_logger,
)
from aki_prj23_transparenzregister.utils.sql import connector
from aki_prj23_transparenzregister.utils.sql import entities
if __name__ == "__main__":
    # Entry point: scrape data for companies flagged as missing in the SQL DB
    # from the Unternehmensregister, transform the downloaded XML to JSON, and
    # export one cleaned JSON file per company.
    parser = argparse.ArgumentParser(
        prog="Unternehmensregister data fetch",
        description="Fetches and transforms data of missing companies from the Unternehmensregister.",
        epilog="Example: fetch ENV --log-level ERROR --log-path print.log",
    )
    parser.add_argument(
        "config",
        metavar="config",
        default="ENV",
        help=HELP_TEXT_CONFIG,  # shared help text; was imported but never used
    )
    add_logger_options_to_argparse(parser)
    parsed = parser.parse_args(sys.argv[1:])
    configer_logger(namespace=parsed)
    config = parsed.config

    # Companies flagged as missing in the SQL database drive the scrape.
    session = connector.get_session(get_config_provider(config))
    missing_companies = session.query(entities.MissingCompany).all()

    # Scrape data from the Unternehmensregister.
    # NOTE(review): capped at 5 companies — presumably a debug limit for this
    # checkpoint; remove or parameterize before production use.
    for counter, company in enumerate(missing_companies, start=1):
        logger.info(company.name)  # use the configured logger, not print()
        extract.scrape(company.name, ["tmp", "xml"])
        if counter == 5:
            break

    # Transform input: downloaded XML -> raw JSON -> cleaned company JSON.
    output_path = os.path.join(str(Path.cwd()), *["tmp", "transformed"])
    xml_dir = os.path.join(str(Path.cwd()), *["tmp", "xml"])
    json_dir = os.path.join(str(Path.cwd()), *["tmp", "json"])
    # Nothing else creates the export directory; open(..., "w+") below would
    # fail without it.
    os.makedirs(output_path, exist_ok=True)
    transform.transform_xml_to_json(xml_dir, json_dir)

    for file in tqdm(glob.glob1(json_dir, "*.json")):
        path = os.path.join(json_dir, file)
        with open(path, encoding="utf-8") as file_object:
            try:
                # Map the raw register JSON onto the project's company
                # dataclass. (The original annotated this as `Company`, a name
                # never imported here — the annotation itself raised NameError.)
                company = transform.map_unternehmensregister_json(
                    json.loads(file_object.read())
                )
                # Sanitize the company name into a safe, length-bounded filename.
                name = "".join(e for e in company.name if e.isalnum())[:50]
                with open(
                    f"{output_path}/{name}.json",
                    "w+",
                    encoding="utf-8",
                ) as export_file:
                    json.dump(
                        dataclasses.asdict(company), export_file, ensure_ascii=False
                    )
            except Exception as e:
                # Abort the whole run on the first bad file so the failure is
                # visible rather than silently skipped.
                logger.error(e)
                logger.error(f"Error in processing {path}")
                sys.exit(1)

View File

@ -21,6 +21,7 @@ def scrape(query: str, download_dir: list[str]) -> None:
download_dir (list[str]): Directory to place output files in
"""
download_path = os.path.join(str(Path.cwd()), *download_dir)
print(download_path)
options = webdriver.ChromeOptions()
preferences = {
"profile.default_content_settings.popups": 0,
@ -32,7 +33,7 @@ def scrape(query: str, download_dir: list[str]) -> None:
"default_directory": download_path,
},
}
options.add_argument("--headless=new")
# options.add_argument("--headless=new")
options.add_experimental_option("prefs", preferences)
driver = webdriver.Chrome(options=options)

View File

@ -38,6 +38,8 @@ def transform_xml_to_json(source_dir: str, target_dir: str) -> None:
source_dir (str): Directory hosting the xml files
target_dir (str): Target directory to move json files to
"""
if not os.path.exists(target_dir):
os.makedirs(target_dir)
for source_path in [
os.path.normpath(i) for i in glob.glob(source_dir + "**/*.xml", recursive=True)
]: