mirror of
https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-06-21 18:53:55 +02:00
test: Adapt existing unit tests to refactored imports
This commit is contained in:
@ -1,32 +1,20 @@
|
|||||||
|
"""Retrieve missing companies from unternehmensregister."""
|
||||||
|
import argparse
|
||||||
|
import dataclasses
|
||||||
|
import glob
|
||||||
|
import json
|
||||||
|
import multiprocessing
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import json
|
|
||||||
import glob
|
|
||||||
import argparse
|
|
||||||
import tempfile
|
import tempfile
|
||||||
import dataclasses
|
|
||||||
import multiprocessing
|
|
||||||
import pandas as pd
|
|
||||||
from tqdm import tqdm
|
|
||||||
from pathlib import Path
|
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
from aki_prj23_transparenzregister.config.config_providers import (
|
from aki_prj23_transparenzregister.config.config_providers import (
|
||||||
HELP_TEXT_CONFIG,
|
ConfigProvider,
|
||||||
get_config_provider,
|
get_config_provider,
|
||||||
)
|
)
|
||||||
from aki_prj23_transparenzregister.utils.logger_config import (
|
|
||||||
add_logger_options_to_argparse,
|
|
||||||
configer_logger,
|
|
||||||
)
|
|
||||||
|
|
||||||
from aki_prj23_transparenzregister.utils.sql import connector
|
|
||||||
from aki_prj23_transparenzregister.utils.sql import entities
|
|
||||||
|
|
||||||
from aki_prj23_transparenzregister.utils.mongo.connector import MongoConnector
|
|
||||||
from aki_prj23_transparenzregister.utils.mongo.company_mongo_service import (
|
|
||||||
CompanyMongoService,
|
|
||||||
)
|
|
||||||
|
|
||||||
from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister import (
|
from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister import (
|
||||||
extract,
|
extract,
|
||||||
load,
|
load,
|
||||||
@ -34,13 +22,29 @@ from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister im
|
|||||||
from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform import (
|
from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform import (
|
||||||
main as transform,
|
main as transform,
|
||||||
)
|
)
|
||||||
|
from aki_prj23_transparenzregister.utils.logger_config import (
|
||||||
|
add_logger_options_to_argparse,
|
||||||
|
configer_logger,
|
||||||
|
)
|
||||||
|
from aki_prj23_transparenzregister.utils.mongo.company_mongo_service import (
|
||||||
|
CompanyMongoService,
|
||||||
|
)
|
||||||
|
from aki_prj23_transparenzregister.utils.mongo.connector import MongoConnector
|
||||||
|
from aki_prj23_transparenzregister.utils.sql import connector, entities
|
||||||
|
|
||||||
def work(company: entities.Company, configProvider) -> None:
|
|
||||||
|
def work(company: entities.Company, config_provider: ConfigProvider) -> None:
|
||||||
|
"""Scrape a single missing company and load the result into MongoDB.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
company (entities.Company): Company to be searched for
|
||||||
|
config_provider (ConfigProvider): ConfigProvider
|
||||||
|
"""
|
||||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||||
xml_dir = os.path.join(*[tmp_dir, "xml"])
|
xml_dir = os.path.join(*[tmp_dir, "xml"])
|
||||||
os.makedirs(xml_dir, exist_ok=True)
|
os.makedirs(xml_dir, exist_ok=True)
|
||||||
try:
|
try:
|
||||||
extract.scrape(company.name, xml_dir, True)
|
extract.scrape(company.name, xml_dir, True, True) # type: ignore
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(e)
|
logger.error(e)
|
||||||
return
|
return
|
||||||
@ -57,37 +61,41 @@ def work(company: entities.Company, configProvider) -> None:
|
|||||||
try:
|
try:
|
||||||
path = os.path.join(json_dir, file)
|
path = os.path.join(json_dir, file)
|
||||||
with open(path, encoding="utf-8") as file_object:
|
with open(path, encoding="utf-8") as file_object:
|
||||||
company_mapped = transform.map_unternehmensregister_json(
|
company_mapped = transform.map_unternehmensregister_json(
|
||||||
json.loads(file_object.read())
|
json.loads(file_object.read())
|
||||||
|
)
|
||||||
|
|
||||||
|
name = "".join(e for e in company_mapped.name if e.isalnum())[:50]
|
||||||
|
|
||||||
|
with open(
|
||||||
|
os.path.join(output_path, f"{name}.json"),
|
||||||
|
"w+",
|
||||||
|
encoding="utf-8",
|
||||||
|
) as export_file:
|
||||||
|
json.dump(
|
||||||
|
dataclasses.asdict(company_mapped),
|
||||||
|
export_file,
|
||||||
|
ensure_ascii=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
name = "".join(e for e in company_mapped.name if e.isalnum())[:50]
|
|
||||||
|
|
||||||
with open(
|
|
||||||
os.path.join(output_path, f"{name}.json"),
|
|
||||||
"w+",
|
|
||||||
encoding="utf-8",
|
|
||||||
) as export_file:
|
|
||||||
json.dump(
|
|
||||||
dataclasses.asdict(company_mapped), export_file, ensure_ascii=False
|
|
||||||
)
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(e)
|
logger.error(e)
|
||||||
return
|
return
|
||||||
mongoConnector = MongoConnector(configProvider.get_mongo_connection_string())
|
mongo_connector = MongoConnector(config_provider.get_mongo_connection_string())
|
||||||
companyMongoService = CompanyMongoService(
|
company_mongo_service = CompanyMongoService(mongo_connector)
|
||||||
mongoConnector
|
num_processed = load.load_directory_to_mongo(output_path, company_mongo_service)
|
||||||
)
|
mongo_connector.client.close()
|
||||||
num_processed = load.load_directory_to_mongo(output_path, companyMongoService)
|
|
||||||
mongoConnector.client.close()
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if num_processed > 0:
|
if num_processed > 0:
|
||||||
with connector.get_session(configProvider) as session:
|
with connector.get_session(config_provider) as session:
|
||||||
company = session.query(entities.MissingCompany).where(entities.MissingCompany.name == company.name).first()
|
company = (
|
||||||
company.searched_for = True
|
session.query(entities.MissingCompany) # type: ignore
|
||||||
|
.where(entities.MissingCompany.name == company.name)
|
||||||
|
.first()
|
||||||
|
)
|
||||||
|
company.searched_for = True # type: ignore
|
||||||
session.commit()
|
session.commit()
|
||||||
print(f"Processed {company.name}")
|
logger.info(f"Processed {company.name}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(e)
|
logger.error(e)
|
||||||
return
|
return
|
||||||
@ -109,22 +117,23 @@ if __name__ == "__main__":
|
|||||||
parsed = parser.parse_args(sys.argv[1:])
|
parsed = parser.parse_args(sys.argv[1:])
|
||||||
configer_logger(namespace=parsed)
|
configer_logger(namespace=parsed)
|
||||||
config = parsed.config
|
config = parsed.config
|
||||||
configProvider = get_config_provider(config)
|
config_provider = get_config_provider(config)
|
||||||
session = connector.get_session(configProvider)
|
session = connector.get_session(config_provider)
|
||||||
|
|
||||||
companyMongoService = CompanyMongoService(
|
company_mongo_service = CompanyMongoService(
|
||||||
MongoConnector(configProvider.get_mongo_connection_string())
|
MongoConnector(config_provider.get_mongo_connection_string())
|
||||||
)
|
)
|
||||||
|
|
||||||
missing_companies = session.query(entities.MissingCompany).where(entities.MissingCompany.searched_for == False).all()
|
# Select every company that has not been searched for yet.
# NOTE: `column is False` would be evaluated by Python itself (an identity
# test on the mapped attribute) and always yields the plain bool `False`,
# turning the filter into a constant-false condition that matches no rows.
# The `==` operator is overloaded by SQLAlchemy and builds the intended SQL
# comparison, hence the deliberate E712 suppression.
missing_companies = (
    session.query(entities.MissingCompany)
    .where(entities.MissingCompany.searched_for == False)  # noqa: E712
    .all()
)
|
||||||
|
|
||||||
batch_size = 5
|
batch_size = 5
|
||||||
pool = multiprocessing.Pool(processes=batch_size)
|
pool = multiprocessing.Pool(processes=batch_size)
|
||||||
# Scrape data from unternehmensregister
|
# Scrape data from unternehmensregister
|
||||||
params = [
|
params = [(company, config_provider) for company in missing_companies]
|
||||||
(company, configProvider)
|
|
||||||
for company in missing_companies
|
|
||||||
]
|
|
||||||
# Map the work function to the parameter list using the Pool
|
# Map the work function to the parameter list using the Pool
|
||||||
pool.starmap(work, params)
|
pool.starmap(work, params)
|
||||||
|
|
||||||
@ -134,4 +143,3 @@ if __name__ == "__main__":
|
|||||||
# Wait for all the processes to complete
|
# Wait for all the processes to complete
|
||||||
pool.join()
|
pool.join()
|
||||||
# for company in tqdm(missing_companies):
|
# for company in tqdm(missing_companies):
|
||||||
|
|
||||||
|
@ -3,7 +3,6 @@
|
|||||||
import glob
|
import glob
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from selenium import webdriver
|
from selenium import webdriver
|
||||||
@ -13,12 +12,19 @@ from selenium.webdriver.support.ui import WebDriverWait
|
|||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
def scrape(query: str, download_dir: str, full_match: bool = False) -> None:
|
def scrape(
|
||||||
|
query: str,
|
||||||
|
download_dir: str,
|
||||||
|
full_match: bool = False,
|
||||||
|
early_stopping: bool = False,
|
||||||
|
) -> None:
|
||||||
"""Fetch results from Unternehmensregister for given query.
|
"""Fetch results from Unternehmensregister for given query.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
query (str): Search Query (RegEx supported)
|
query (str): Search Query (RegEx supported)
|
||||||
download_dir (str): Directory to place output files in
|
download_dir (str): Directory to place output files in
|
||||||
|
full_match (bool, optional): Only scrape the result whose name exactly matches the query. Defaults to False.
|
||||||
|
early_stopping (bool, optional): Stop scraping after first page. Defaults to False.
|
||||||
"""
|
"""
|
||||||
# download_path = os.path.join(str(Path.cwd()), *download_dir)
|
# download_path = os.path.join(str(Path.cwd()), *download_dir)
|
||||||
download_path = download_dir
|
download_path = download_dir
|
||||||
@ -75,7 +81,9 @@ def scrape(query: str, download_dir: str, full_match: bool = False) -> None:
|
|||||||
]
|
]
|
||||||
for index, company_link in enumerate(companies_tab):
|
for index, company_link in enumerate(companies_tab):
|
||||||
company_name = company_names[index]
|
company_name = company_names[index]
|
||||||
if company_name in processed_companies or (full_match == True and company_name != query):
|
if company_name in processed_companies or (
|
||||||
|
full_match is True and company_name != query
|
||||||
|
):
|
||||||
continue
|
continue
|
||||||
# Go to intermediary page
|
# Go to intermediary page
|
||||||
company_link.click()
|
company_link.click()
|
||||||
@ -122,8 +130,10 @@ def scrape(query: str, download_dir: str, full_match: bool = False) -> None:
|
|||||||
finally:
|
finally:
|
||||||
for _ in range(6):
|
for _ in range(6):
|
||||||
driver.back()
|
driver.back()
|
||||||
if company_name == query and full_match == True:
|
if company_name == query and full_match is True:
|
||||||
break
|
break # noqa: B012
|
||||||
|
if early_stopping is True:
|
||||||
|
break
|
||||||
driver.find_element(By.XPATH, '//*[@class="fas fa-angle-right"]').click()
|
driver.find_element(By.XPATH, '//*[@class="fas fa-angle-right"]').click()
|
||||||
driver.close()
|
driver.close()
|
||||||
|
|
||||||
|
6
tests/apps/find_missing_companies_test.py
Normal file
6
tests/apps/find_missing_companies_test.py
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
"""Testing find_missing_companies.py."""
|
||||||
|
from aki_prj23_transparenzregister.apps import find_missing_companies
|
||||||
|
|
||||||
|
|
||||||
|
def test_import_find_missing_companies() -> None:
    """Smoke test: the find_missing_companies app module can be imported."""
    # The import at module level is the real check; a module object is never None.
    assert find_missing_companies is not None
|
@ -86,4 +86,4 @@ def test_wait_for_download_condition() -> None:
|
|||||||
|
|
||||||
def test_scrape() -> None:
|
def test_scrape() -> None:
|
||||||
with TemporaryDirectory(dir="./") as temp_dir:
|
with TemporaryDirectory(dir="./") as temp_dir:
|
||||||
extract.scrape("GEA Farm Technologies GmbH", [temp_dir])
|
extract.scrape("GEA Farm Technologies GmbH", temp_dir)
|
||||||
|
@ -0,0 +1,24 @@
|
|||||||
|
"""Testing utils/data_extraction/unternehmensregister/transform/main.py."""
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from tempfile import TemporaryDirectory
|
||||||
|
|
||||||
|
from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform import (
|
||||||
|
main,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_transform_xml_to_json() -> None:
|
||||||
|
with TemporaryDirectory(dir="./") as temp_source_dir:
|
||||||
|
with open(os.path.join(temp_source_dir, "test.xml"), "w") as file:
|
||||||
|
xml_input = """<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<test>
|
||||||
|
<message>Hello World!</message>
|
||||||
|
</test>
|
||||||
|
"""
|
||||||
|
file.write(xml_input)
|
||||||
|
with TemporaryDirectory(dir="./") as temp_target_dir:
|
||||||
|
main.transform_xml_to_json(temp_source_dir, temp_target_dir)
|
||||||
|
with open(os.path.join(temp_target_dir, "test.json")) as file:
|
||||||
|
json_output = json.load(file)
|
||||||
|
assert json_output == {"test": {"message": "Hello World!"}}
|
@ -1,7 +1,4 @@
|
|||||||
"""Testing utils/data_extraction/unternehmensregister/transform.py."""
|
"""Testing utils/data_extraction/unternehmensregister/transform.py."""
|
||||||
import json
|
|
||||||
import os
|
|
||||||
from tempfile import TemporaryDirectory
|
|
||||||
from unittest.mock import Mock, patch
|
from unittest.mock import Mock, patch
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
@ -21,27 +18,11 @@ from aki_prj23_transparenzregister.models.company import (
|
|||||||
PersonToCompanyRelationship,
|
PersonToCompanyRelationship,
|
||||||
RelationshipRoleEnum,
|
RelationshipRoleEnum,
|
||||||
)
|
)
|
||||||
from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister import (
|
from aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1 import (
|
||||||
transform,
|
v1 as transform,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_transform_xml_to_json() -> None:
|
|
||||||
with TemporaryDirectory(dir="./") as temp_source_dir:
|
|
||||||
with open(os.path.join(temp_source_dir, "test.xml"), "w") as file:
|
|
||||||
xml_input = """<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<test>
|
|
||||||
<message>Hello World!</message>
|
|
||||||
</test>
|
|
||||||
"""
|
|
||||||
file.write(xml_input)
|
|
||||||
with TemporaryDirectory(dir="./") as temp_target_dir:
|
|
||||||
transform.transform_xml_to_json(temp_source_dir, temp_target_dir)
|
|
||||||
with open(os.path.join(temp_target_dir, "test.json")) as file:
|
|
||||||
json_output = json.load(file)
|
|
||||||
assert json_output == {"test": {"message": "Hello World!"}}
|
|
||||||
|
|
||||||
|
|
||||||
def test_parse_stakeholder_org_hidden_in_person() -> None:
|
def test_parse_stakeholder_org_hidden_in_person() -> None:
|
||||||
data = {
|
data = {
|
||||||
"Beteiligter": {
|
"Beteiligter": {
|
||||||
@ -787,34 +768,34 @@ def test_map_co_relation(value: dict, expected_result: dict) -> None:
|
|||||||
|
|
||||||
|
|
||||||
@patch(
|
@patch(
|
||||||
"aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.map_co_relation"
|
"aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.map_co_relation"
|
||||||
)
|
)
|
||||||
@patch(
|
@patch(
|
||||||
"aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.map_company_id"
|
"aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.map_company_id"
|
||||||
)
|
)
|
||||||
@patch(
|
@patch(
|
||||||
"aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.name_from_beteiligung"
|
"aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.name_from_beteiligung"
|
||||||
)
|
)
|
||||||
@patch(
|
@patch(
|
||||||
"aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.loc_from_beteiligung"
|
"aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.loc_from_beteiligung"
|
||||||
)
|
)
|
||||||
@patch(
|
@patch(
|
||||||
"aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.map_last_update"
|
"aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.map_last_update"
|
||||||
)
|
)
|
||||||
@patch(
|
@patch(
|
||||||
"aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.map_rechtsform"
|
"aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.map_rechtsform"
|
||||||
)
|
)
|
||||||
@patch(
|
@patch(
|
||||||
"aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.map_capital"
|
"aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.map_capital"
|
||||||
)
|
)
|
||||||
@patch(
|
@patch(
|
||||||
"aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.map_business_purpose"
|
"aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.map_business_purpose"
|
||||||
)
|
)
|
||||||
@patch(
|
@patch(
|
||||||
"aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.map_founding_date"
|
"aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.map_founding_date"
|
||||||
)
|
)
|
||||||
@patch(
|
@patch(
|
||||||
"aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.parse_stakeholder"
|
"aki_prj23_transparenzregister.utils.data_extraction.unternehmensregister.transform.v1.v1.parse_stakeholder"
|
||||||
)
|
)
|
||||||
def test_map_unternehmensregister_json( # noqa: PLR0913
|
def test_map_unternehmensregister_json( # noqa: PLR0913
|
||||||
mock_map_parse_stakeholder: Mock,
|
mock_map_parse_stakeholder: Mock,
|
Reference in New Issue
Block a user