mirror of
https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-04-25 20:42:34 +02:00
checkpoint: Core data fetch routine
This commit is contained in:
parent 2fddc9149a
commit 7f8511c9d6
@@ -0,0 +1,85 @@
import argparse
import dataclasses
import glob
import json
import os
import sys
import tempfile
from pathlib import Path

import pandas as pd
from loguru import logger
from tqdm import tqdm

from aki_prj23_transparenzregister.config.config_providers import (
    HELP_TEXT_CONFIG,
    get_config_provider,
)
from aki_prj23_transparenzregister.models.company import Company  # assumed path for the Company dataclass
from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister import (
    extract,
    load,
    transform,
)
from aki_prj23_transparenzregister.utils.logger_config import (
    add_logger_options_to_argparse,
    configer_logger,
)
from aki_prj23_transparenzregister.utils.sql import connector, entities

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog="Unternehmensregister data fetch",
        description="Fetches missing companies from the Unternehmensregister and transforms them.",
        epilog="Example: data-fetch ENV --log-level ERROR --log-path print.log",
    )
    parser.add_argument(
        "config",
        metavar="config",
        default="ENV",
        help=HELP_TEXT_CONFIG,
    )
    add_logger_options_to_argparse(parser)

    parsed = parser.parse_args(sys.argv[1:])
    configer_logger(namespace=parsed)
    config = parsed.config
    session = connector.get_session(get_config_provider(config))
    missing_companies = session.query(entities.MissingCompany).all()

    # Scrape data from the Unternehmensregister, capped at five
    # companies for this checkpoint run.
    counter = 0
    for company in missing_companies:
        logger.info(company.name)
        extract.scrape(company.name, ["tmp", "xml"])
        counter += 1
        if counter == 5:
            break

    # Transform the scraped XML into JSON.
    output_path = os.path.join(str(Path.cwd()), "tmp", "transformed")
    xml_dir = os.path.join(str(Path.cwd()), "tmp", "xml")
    json_dir = os.path.join(str(Path.cwd()), "tmp", "json")
    os.makedirs(output_path, exist_ok=True)
    transform.transform_xml_to_json(xml_dir, json_dir)

    for file in tqdm(glob.glob1(json_dir, "*.json")):
        path = os.path.join(json_dir, file)
        with open(path, encoding="utf-8") as file_object:
            try:
                company: Company = transform.map_unternehmensregister_json(
                    json.loads(file_object.read())
                )

                # Keep only alphanumeric characters so the company name is
                # safe to use as a file name.
                name = "".join(e for e in company.name if e.isalnum())[:50]

                with open(
                    f"{output_path}/{name}.json",
                    "w+",
                    encoding="utf-8",
                ) as export_file:
                    json.dump(
                        dataclasses.asdict(company), export_file, ensure_ascii=False
                    )
            except Exception as e:
                logger.error(e)
                logger.error(f"Error in processing {path}")
                sys.exit(1)
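The export step hinges on Company being a dataclass: dataclasses.asdict flattens it, including any nested dataclasses, into plain dicts that json.dump can serialize directly. A minimal sketch of that contract; the field names here and the Company import path above are illustrative assumptions, not the repository's actual model:

import dataclasses
import json


@dataclasses.dataclass
class Company:
    # Illustrative fields only; the real model carries many more attributes.
    name: str
    city: str


company = Company(name="Musterfirma GmbH", city="Dortmund")

# asdict recursively converts the dataclass to plain dicts/lists;
# ensure_ascii=False keeps umlauts readable in the exported JSON.
print(json.dumps(dataclasses.asdict(company), ensure_ascii=False))
# -> {"name": "Musterfirma GmbH", "city": "Dortmund"}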
@@ -21,6 +21,7 @@ def scrape(query: str, download_dir: list[str]) -> None:
         download_dir (list[str]): Directory to place output files in
     """
     download_path = os.path.join(str(Path.cwd()), *download_dir)
+    print(download_path)
     options = webdriver.ChromeOptions()
     preferences = {
         "profile.default_content_settings.popups": 0,
@@ -32,7 +33,7 @@ def scrape(query: str, download_dir: list[str]) -> None:
             "default_directory": download_path,
         },
     }
-    options.add_argument("--headless=new")
+    # options.add_argument("--headless=new")
     options.add_experimental_option("prefs", preferences)

     driver = webdriver.Chrome(options=options)
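The commit toggles off --headless=new, which is typically done when Chrome's download handling misbehaves without a visible browser window. A minimal sketch of one common workaround that keeps headless mode and explicitly allows downloads via the Chrome DevTools Protocol; the paths are illustrative, and execute_cdp_cmd availability is an assumption about the Selenium 4 / Chromium driver versions in use:

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument("--headless=new")
options.add_experimental_option(
    "prefs",
    {"download.default_directory": "/tmp/xml"},  # illustrative path
)

driver = webdriver.Chrome(options=options)
# Assumption: a Chromium-based driver where execute_cdp_cmd is available.
# Page.setDownloadBehavior re-enables downloads that some headless Chrome
# builds otherwise silently drop.
driver.execute_cdp_cmd(
    "Page.setDownloadBehavior",
    {"behavior": "allow", "downloadPath": "/tmp/xml"},
)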
@@ -38,6 +38,8 @@ def transform_xml_to_json(source_dir: str, target_dir: str) -> None:
         source_dir (str): Directory hosting the xml files
         target_dir (str): Target directory to move json files to
     """
+    if not os.path.exists(target_dir):
+        os.makedirs(target_dir)
     for source_path in [
         os.path.normpath(i) for i in glob.glob(source_dir + "**/*.xml", recursive=True)
     ]:
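The added guard works, but os.makedirs accepts exist_ok=True, which folds the existence check into the call and avoids the check-then-create race when several processes share the directory. A minimal equivalent, with an illustrative path:

import os

target_dir = "tmp/json"  # illustrative; matches the layout the fetch script uses

# Create the directory and any missing parents; exist_ok suppresses the
# error if another process created it between check and call.
os.makedirs(target_dir, exist_ok=True)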