mirror of
https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-04-24 21:12:34 +02:00
checkpoint: Core data fetch routine
This commit is contained in:
parent
2fddc9149a
commit
7f8511c9d6
@ -0,0 +1,85 @@
|
||||
import argparse
import dataclasses
import glob
import json
import os
import sys
import tempfile
from pathlib import Path

import pandas as pd
from loguru import logger
from tqdm import tqdm

from aki_prj23_transparenzregister.config.config_providers import (
    HELP_TEXT_CONFIG,
    get_config_provider,
)
from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister import (
    extract,
    load,
    transform,
)
from aki_prj23_transparenzregister.utils.logger_config import (
    add_logger_options_to_argparse,
    configer_logger,
)
from aki_prj23_transparenzregister.utils.sql import connector, entities
||||
if __name__ == "__main__":
    # CLI: one positional config source plus the shared logging options.
    # (The original prog/description were copy-pasted from the webserver
    # entry point; corrected to describe this data-fetch routine.)
    parser = argparse.ArgumentParser(
        prog="Transparenzregister Unternehmensregister fetch",
        description="Scrapes missing companies from the Unternehmensregister and transforms them.",
        epilog="Example: fetch ENV --log-level ERROR --log-path print.log",
    )
    parser.add_argument(
        "config",
        metavar="config",
        default="ENV",
        # HELP_TEXT_CONFIG was imported but never used — clearly intended here.
        help=HELP_TEXT_CONFIG,
    )
    add_logger_options_to_argparse(parser)

    parsed = parser.parse_args(sys.argv[1:])
    configer_logger(namespace=parsed)
    config = parsed.config

    # Companies flagged as missing in the SQL database are the work queue.
    session = connector.get_session(get_config_provider(config))
    missing_companies = session.query(entities.MissingCompany).all()

    # Scrape data from the Unternehmensregister, capped per run
    # (the original hard-coded a break after 5 companies).
    max_companies_per_run = 5
    for counter, company in enumerate(missing_companies, start=1):
        logger.info(company.name)
        extract.scrape(company.name, ["tmp", "xml"])
        if counter == max_companies_per_run:
            break

    # Transform input: XML scrape results -> per-company JSON documents.
    output_path = os.path.join(str(Path.cwd()), *["tmp", "transformed"])
    xml_dir = os.path.join(str(Path.cwd()), *["tmp", "xml"])
    json_dir = os.path.join(str(Path.cwd()), *["tmp", "json"])
    # Bug fix: the export below opened files under output_path, but the
    # directory was never created (transform_xml_to_json only creates json_dir).
    os.makedirs(output_path, exist_ok=True)
    transform.transform_xml_to_json(xml_dir, json_dir)

    for file in tqdm(glob.glob1(json_dir, "*.json")):
        path = os.path.join(json_dir, file)
        with open(path, encoding="utf-8") as file_object:
            try:
                # NOTE(review): the original annotated this as `Company`
                # without importing the type; module-scope annotations are
                # evaluated at runtime, so that raised NameError. Annotation
                # dropped rather than guessing the import path.
                company = transform.map_unternehmensregister_json(
                    json.load(file_object)
                )

                # Sanitize the company name into a safe, bounded file name.
                name = "".join(e for e in company.name if e.isalnum())[:50]

                with open(
                    f"{output_path}/{name}.json",
                    "w+",
                    encoding="utf-8",
                ) as export_file:
                    # `dataclasses` was used here without being imported —
                    # fixed via the file's import block.
                    json.dump(
                        dataclasses.asdict(company), export_file, ensure_ascii=False
                    )
            except Exception as e:  # abort the whole run on the first bad document
                logger.error(e)
                logger.error(f"Error in processing {path}")
                sys.exit(1)
|
@ -21,6 +21,7 @@ def scrape(query: str, download_dir: list[str]) -> None:
|
||||
download_dir (list[str]): Directory to place output files in
|
||||
"""
|
||||
download_path = os.path.join(str(Path.cwd()), *download_dir)
|
||||
print(download_path)
|
||||
options = webdriver.ChromeOptions()
|
||||
preferences = {
|
||||
"profile.default_content_settings.popups": 0,
|
||||
@ -32,7 +33,7 @@ def scrape(query: str, download_dir: list[str]) -> None:
|
||||
"default_directory": download_path,
|
||||
},
|
||||
}
|
||||
options.add_argument("--headless=new")
|
||||
# options.add_argument("--headless=new")
|
||||
options.add_experimental_option("prefs", preferences)
|
||||
|
||||
driver = webdriver.Chrome(options=options)
|
||||
|
@ -38,6 +38,8 @@ def transform_xml_to_json(source_dir: str, target_dir: str) -> None:
|
||||
source_dir (str): Directory hosting the xml files
|
||||
target_dir (str): Target directory to move json files to
|
||||
"""
|
||||
if not os.path.exists(target_dir):
|
||||
os.makedirs(target_dir)
|
||||
for source_path in [
|
||||
os.path.normpath(i) for i in glob.glob(source_dir + "**/*.xml", recursive=True)
|
||||
]:
|
||||
|
Loading…
x
Reference in New Issue
Block a user