mirror of https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-05-13 16:08:45 +02:00

checkpoint: Refactoring, first working version of processing

This commit is contained in:
parent 2458ad98ff
commit 042a019628
@@ -5,6 +5,7 @@ import glob
import argparse
import tempfile
import dataclasses
import multiprocessing
import pandas as pd
from tqdm import tqdm
from pathlib import Path
@@ -21,11 +22,76 @@ from aki_prj23_transparenzregister.utils.logger_config import (
from aki_prj23_transparenzregister.utils.sql import connector
from aki_prj23_transparenzregister.utils.sql import entities

from aki_prj23_transparenzregister.utils.mongo.connector import MongoConnector
from aki_prj23_transparenzregister.utils.mongo.company_mongo_service import (
    CompanyMongoService,
)

from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister import (
    extract,
    load,
)
from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform import main as transform
from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform import (
    main as transform,
)


def work(company: entities.Company, configProvider) -> None:
    with tempfile.TemporaryDirectory() as tmp_dir:
        xml_dir = os.path.join(*[tmp_dir, "xml"])
        os.makedirs(xml_dir, exist_ok=True)
        try:
            extract.scrape(company.name, xml_dir, True)
        except Exception as e:
            logger.error(e)
            return
        output_path = os.path.join(*[tmp_dir, "transformed"])
        os.makedirs(output_path, exist_ok=True)
        json_dir = os.path.join(*[tmp_dir, "json"])
        os.makedirs(json_dir, exist_ok=True)
        transform.transform_xml_to_json(
            xml_dir,
            json_dir,
        )

        for file in tqdm(glob.glob1(json_dir, "*.json")):
            try:
                path = os.path.join(json_dir, file)
                with open(path, encoding="utf-8") as file_object:
                    company_mapped = transform.map_unternehmensregister_json(
                        json.loads(file_object.read())
                    )

                    name = "".join(e for e in company_mapped.name if e.isalnum())[:50]

                    with open(
                        os.path.join(output_path, f"{name}.json"),
                        "w+",
                        encoding="utf-8",
                    ) as export_file:
                        json.dump(
                            dataclasses.asdict(company_mapped), export_file, ensure_ascii=False
                        )
            except Exception as e:
                logger.error(e)
                return
        mongoConnector = MongoConnector(configProvider.get_mongo_connection_string())
        companyMongoService = CompanyMongoService(
            mongoConnector
        )
        num_processed = load.load_directory_to_mongo(output_path, companyMongoService)
        mongoConnector.client.close()

        try:
            if num_processed > 0:
                with connector.get_session(configProvider) as session:
                    company = session.query(entities.MissingCompany).where(entities.MissingCompany.name == company.name).first()
                    company.searched_for = True
                    session.commit()
                    print(f"Processed {company.name}")
        except Exception as e:
            logger.error(e)
            return


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
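For reference, the core of the new work() flow is the throwaway scratch tree each worker builds before scraping. A minimal stdlib-only sketch of that pattern (process_in_scratch_dir is a hypothetical stand-in, not part of the repository):

import os
import tempfile


def process_in_scratch_dir(name: str) -> None:
    # Every invocation gets an isolated scratch tree that is removed on
    # exit, so parallel workers never collide on download paths.
    with tempfile.TemporaryDirectory() as tmp_dir:
        xml_dir = os.path.join(tmp_dir, "xml")
        json_dir = os.path.join(tmp_dir, "json")
        out_dir = os.path.join(tmp_dir, "transformed")
        for directory in (xml_dir, json_dir, out_dir):
            os.makedirs(directory, exist_ok=True)
        # ... scrape into xml_dir, transform into json_dir/out_dir,
        # then load out_dir into MongoDB ...

Note that os.path.join(*[tmp_dir, "xml"]) in the hunk is equivalent to the plain os.path.join(tmp_dir, "xml") used here; the list unpacking adds nothing.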
@@ -43,44 +109,29 @@ if __name__ == "__main__":
    parsed = parser.parse_args(sys.argv[1:])
    configer_logger(namespace=parsed)
    config = parsed.config
    # session = connector.get_session(get_config_provider(config))
    # missing_companies = session.query(entities.MissingCompany).all()
    configProvider = get_config_provider(config)
    session = connector.get_session(configProvider)

    counter = 0
    # # Scrape data from unternehmensregister
    # for company in missing_companies:
    #     print(company.name)
    #     extract.scrape(company.name, ["tmp", "xml"])

    # Transform input
    output_path = os.path.join(str(Path.cwd()), *["tmp", "transformed"])
    xml_dir = os.path.join(str(Path.cwd()), *["tmp", "xml"])
    json_dir = os.path.join(str(Path.cwd()), *["tmp", "json"])
    transform.transform_xml_to_json(
        os.path.join(xml_dir),
        os.path.join(json_dir),
    companyMongoService = CompanyMongoService(
        MongoConnector(configProvider.get_mongo_connection_string())
    )
    for file in tqdm(glob.glob1(json_dir, "*.json")):
        path = os.path.join(json_dir, file)
        with open(path, encoding="utf-8") as file_object:
            # try:
            print(path)
            company = transform.map_unternehmensregister_json(
                json.loads(file_object.read())
            )

            name = "".join(e for e in company.name if e.isalnum())[:50]
    missing_companies = session.query(entities.MissingCompany).where(entities.MissingCompany.searched_for == False).all()

            with open(
                f"{output_path}/{name}.json",
                "w+",
                encoding="utf-8",
            ) as export_file:
                json.dump(
                    dataclasses.asdict(company), export_file, ensure_ascii=False
                )
            # except Exception as e:
            #     logger.error(e.with_traceback())
            #     logger.error(e)
            #     logger.error(f"Error in processing {path}")
            #     sys.exit(1)
    batch_size = 5
    pool = multiprocessing.Pool(processes=batch_size)
    # Scrape data from unternehmensregister
    params = [
        (company, configProvider)
        for company in missing_companies
    ]
    # Map the work function across the parameter list using the Pool
    pool.starmap(work, params)

    # Close the Pool to prevent any more tasks from being submitted
    pool.close()

    # Wait for all the processes to complete
    pool.join()
    # for company in tqdm(missing_companies):
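The fan-out above is plain stdlib multiprocessing: one (company, configProvider) tuple per unsearched company, with starmap unpacking each tuple into work(). A self-contained sketch under those assumptions (the work body here is a placeholder, not the project's):

import multiprocessing


def work(name: str, config: dict) -> None:
    # Placeholder body; the real work() scrapes, transforms and loads.
    print(f"processing {name} with {config}")


if __name__ == "__main__":
    companies = ["A GmbH", "B AG", "C KG"]
    params = [(name, {"db": "demo"}) for name in companies]
    # starmap unpacks each tuple into work(name, config) and blocks
    # until every task has finished; processes=5 caps the parallelism.
    with multiprocessing.Pool(processes=5) as pool:
        pool.starmap(work, params)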
@@ -13,15 +13,15 @@ from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm


def scrape(query: str, download_dir: list[str]) -> None:
def scrape(query: str, download_dir: str, full_match: bool = False) -> None:
    """Fetch results from Unternehmensregister for given query.

    Args:
        query (str): Search query (RegEx supported)
        download_dir (str): Directory to place output files in
        full_match (bool): If True, only process the result whose name exactly matches the query
    """
    download_path = os.path.join(str(Path.cwd()), *download_dir)
    print(download_path)
    # download_path = os.path.join(str(Path.cwd()), *download_dir)
    download_path = download_dir
    options = webdriver.ChromeOptions()
    preferences = {
        "profile.default_content_settings.popups": 0,
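With the widened signature, callers pass a concrete directory instead of path segments and can request an exact-name match. A hypothetical invocation mirroring how the new work() uses it (query and target directory illustrative):

from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister import extract

extract.scrape("Musterfirma GmbH", "/tmp/xml", True)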
@@ -33,8 +33,9 @@ def scrape(query: str, download_dir: list[str]) -> None:
            "default_directory": download_path,
        },
    }
    # options.add_argument("--headless=new")
    options.add_argument("--headless=new")
    options.add_experimental_option("prefs", preferences)
    options.add_experimental_option("excludeSwitches", ["enable-logging"])

    driver = webdriver.Chrome(options=options)
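Enabling --headless=new while keeping a fixed download directory is standard Selenium/Chrome usage; the new headless mode still honors download preferences. A minimal sketch outside the project's code (URL and download path illustrative; the flat prefs key is the common idiom equivalent to the nested dict above):

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument("--headless=new")
options.add_experimental_option(
    "prefs", {"download.default_directory": "/tmp/downloads"}
)
driver = webdriver.Chrome(options=options)
driver.get("https://www.unternehmensregister.de")
driver.quit()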
@@ -74,7 +75,7 @@ def scrape(query: str, download_dir: list[str]) -> None:
        ]
        for index, company_link in enumerate(companies_tab):
            company_name = company_names[index]
            if company_name in processed_companies:
            if company_name in processed_companies or (full_match and company_name != query):
                continue
            # Go to intermediary page
            company_link.click()
@@ -121,6 +122,8 @@ def scrape(query: str, download_dir: list[str]) -> None:
            finally:
                for _ in range(6):
                    driver.back()
            if company_name == query and full_match:
                break
        driver.find_element(By.XPATH, '//*[@class="fas fa-angle-right"]').click()
    driver.close()
@@ -14,17 +14,22 @@ from aki_prj23_transparenzregister.utils.mongo.connector import (
    MongoConnector,
)

def load_directory_to_mongo(base_path: str, service: CompanyMongoService) -> int:
    num_processed = 0
    for file in tqdm(glob.glob1(base_path, "*.json")):
        path = os.path.join(base_path, file)
        with open(path, encoding="utf-8") as file_object:
            data = json.loads(file_object.read())
            company: Company = Company(**data)

            service.migrations_of_base_data(company)
            num_processed += 1
    return num_processed

if __name__ == "__main__":
    provider = JsonFileConfigProvider("secrets.json")
    conn_string = provider.get_mongo_connection_string()
    connector = MongoConnector(conn_string)
    service = CompanyMongoService(connector)

    base_path = "./Jupyter/API-tests/Unternehmensregister/data/Unternehmensregister"
    for file in tqdm(glob.glob1(f"{base_path}/transformed", "*.json")):
        path = os.path.join(f"{base_path}/transformed", file)
        with open(path, encoding="utf-8") as file_object:
            data = json.loads(file_object.read())
            company: Company = Company(**data)

            service.migrations_of_base_data(company)
    load_directory_to_mongo("./Jupyter/API-tests/Unternehmensregister/data/Unternehmensregister/transformed", service)
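A side note on glob.glob1 used throughout: it is an undocumented helper in the glob module that returns bare file names from one directory. An equivalent with the documented API, should it ever disappear (path illustrative):

import glob
import os

base_path = "./data/transformed"
# Same result as glob.glob1(base_path, "*.json"): match in one
# directory, then reduce each hit to its bare file name.
files = [os.path.basename(p) for p in glob.glob(os.path.join(base_path, "*.json"))]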
@@ -8,6 +8,7 @@ import sys

import xmltodict
from tqdm import tqdm
from loguru import logger

from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1 import v1
from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v3 import v3
@@ -26,12 +27,14 @@ def transform_xml_to_json(source_dir: str, target_dir: str) -> None:
        target_path = os.path.join(
            target_dir, source_path.split(os.sep)[-1].replace(".xml", ".json")
        )

        with open(source_path, encoding="utf-8") as source_file:
            # deepcode ignore HandleUnicode: Weird XML format no other solution
            data = xmltodict.parse(source_file.read().encode())
            with open(target_path, "w", encoding="utf-8") as json_file:
                json_file.write(json.dumps(data))
        try:
            with open(source_path, encoding="utf-8") as source_file:
                # deepcode ignore HandleUnicode: Weird XML format no other solution
                data = xmltodict.parse(source_file.read().encode())
                with open(target_path, "w", encoding="utf-8") as json_file:
                    json_file.write(json.dumps(data))
        except Exception as e:
            logger.error(e)

def determine_version(data: dict):
    if "XJustiz_Daten" in data:
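With the try/except in place, one malformed export no longer aborts the whole batch. The conversion itself is a small, self-contained xmltodict pattern; a sketch under the same assumptions (xml_file_to_json is a hypothetical stand-in):

import json
import xmltodict
from loguru import logger


def xml_file_to_json(source_path: str, target_path: str) -> None:
    try:
        with open(source_path, encoding="utf-8") as source_file:
            # xmltodict.parse returns a dict mirroring the XML tree.
            data = xmltodict.parse(source_file.read().encode())
        with open(target_path, "w", encoding="utf-8") as json_file:
            json.dump(data, json_file)
    except Exception as e:
        # Log and move on instead of failing the whole directory.
        logger.error(e)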
@@ -539,31 +539,3 @@ def map_unternehmensregister_json(data: dict) -> Company:
        result["relationships"].append(people)
    result = map_co_relation(result)
    return Company(**result)


if __name__ == "__main__":
    from loguru import logger

    base_path = "./Jupyter/API-tests/Unternehmensregister/data/Unternehmensregister"
    for file in tqdm(glob.glob1(f"{base_path}/export", "*.json")):
        path = os.path.join(f"{base_path}/export", file)
        with open(path, encoding="utf-8") as file_object:
            try:
                company: Company = map_unternehmensregister_json(
                    json.loads(file_object.read())
                )

                name = "".join(e for e in company.name if e.isalnum())[:50]

                with open(
                    f"{base_path}/transformed/{name}.json",
                    "w+",
                    encoding="utf-8",
                ) as export_file:
                    json.dump(
                        dataclasses.asdict(company), export_file, ensure_ascii=False
                    )
            except Exception as e:
                logger.error(e)
                logger.error(f"Error in processing {path}")
                sys.exit(1)
@@ -385,6 +385,8 @@ def map_capital(data: dict, company_type: CompanyTypeEnum) -> Capital | None:
    ]:
        return None
    # Catch entries having the dict but with null values
    if isinstance(capital, list):
        capital = capital[0]
    if not all(capital.values()):
        return None
    return Capital(
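The added guard normalizes a field that can arrive either as a single-element list or as a dict whose keys are all null. As a standalone sketch (normalize_capital is hypothetical, payloads simplified):

def normalize_capital(capital):
    # Some register exports wrap the capital dict in a one-element list.
    if isinstance(capital, list):
        capital = capital[0]
    # Entries can carry the right keys but only null values; treat as absent.
    if not all(capital.values()):
        return None
    return capital


assert normalize_capital([{"value": None, "currency": None}]) is None
assert normalize_capital({"value": "25000", "currency": "EUR"}) is not None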
@@ -465,9 +467,10 @@ def map_founding_date(data: dict) -> str | None:
        "tns:fachdatenRegister",
        "tns:basisdatenRegister",
        "tns:satzungsdatum",
        "tns:aktuellesSatzungsdatum",
    ]
    return traversal(data, path)
    base = traversal(data, path)
    if "tns:aktuellesSatzungsdatum" in base:
        return base["tns:aktuellesSatzungsdatum"]
    # No reliable answer
    return None
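The traversal helper itself is not part of this diff; judging by its use here, it walks a key path and returns the deepest node it can reach. A plausible stand-in under that assumption (traverse and the sample document are hypothetical):

def traverse(data: dict, path: list[str]):
    # Walk nested dicts key by key, stopping at the last key that exists.
    current = data
    for key in path:
        if isinstance(current, dict) and key in current:
            current = current[key]
        else:
            break
    return current


doc = {"tns:satzungsdatum": {"tns:aktuellesSatzungsdatum": "2020-01-01"}}
base = traverse(doc, ["tns:satzungsdatum"])
print(base.get("tns:aktuellesSatzungsdatum"))  # 2020-01-01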
@@ -620,31 +623,3 @@ def map_unternehmensregister_json(data: dict) -> Company:
        result["relationships"].append(people)
    result = map_co_relation(result)
    return Company(**result)


if __name__ == "__main__":
    from loguru import logger

    base_path = "./Jupyter/API-tests/Unternehmensregister/data/Unternehmensregister"
    for file in tqdm(glob.glob1(f"{base_path}/export", "*.json")):
        path = os.path.join(f"{base_path}/export", file)
        with open(path, encoding="utf-8") as file_object:
            try:
                company: Company = map_unternehmensregister_json(
                    json.loads(file_object.read())
                )

                name = "".join(e for e in company.name if e.isalnum())[:50]

                with open(
                    f"{base_path}/transformed/{name}.json",
                    "w+",
                    encoding="utf-8",
                ) as export_file:
                    json.dump(
                        dataclasses.asdict(company), export_file, ensure_ascii=False
                    )
            except Exception as e:
                logger.error(e)
                logger.error(f"Error in processing {path}")
                sys.exit(1)
@@ -45,7 +45,6 @@ class CompanyMongoService:
        query = {
            "id.hr_number": id["hr_number"],
            "id.district_court.name": id["district_court"]["name"],
            "id.district_court.city": id["district_court"]["city"],
        }
        with self.lock:
            result = list(self.collection.find(query))
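Guarding collection access with a lock matters once several workers share a service instance. A minimal sketch of the pattern with pymongo (class name, URI and collection names illustrative):

import threading

from pymongo import MongoClient


class CompanyService:
    def __init__(self, uri: str) -> None:
        self.collection = MongoClient(uri)["transparenzregister"]["companies"]
        self.lock = threading.Lock()

    def find_by_hr_number(self, hr_number: str) -> list[dict]:
        # Serialize access so concurrent callers don't interleave
        # find/update sequences on the same document.
        query = {"id.hr_number": hr_number}
        with self.lock:
            return list(self.collection.find(query))

Note that a pymongo MongoClient is not safe to share across forked processes; each pool worker should build its own connector, which is what the new work() does by constructing a MongoConnector inside the process.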
@@ -1099,6 +1099,14 @@
   "execution_count": 19,
   "metadata": {},
   "output_type": "execute_result"
  },
  {
   "ename": "",
   "evalue": "",
   "output_type": "error",
   "traceback": [
    "\u001b[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. View Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
   ]
  }
 ],
 "source": [