mirror of
https://github.com/fhswf/aki_prj23_transparenzregister.git
synced 2025-06-22 08:03:54 +02:00
Added longitude/latitude and positional accuracy to the company data (#180)
This commit is contained in:
794
poetry.lock
generated
794
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -33,7 +33,6 @@ description = "Data Integration, Extraction, and Visualization using Text Mining
|
|||||||
documentation = "https://cuddly-waffle-r416zgy.pages.github.io/"
|
documentation = "https://cuddly-waffle-r416zgy.pages.github.io/"
|
||||||
homepage = "https://cuddly-waffle-r416zgy.pages.github.io/"
|
homepage = "https://cuddly-waffle-r416zgy.pages.github.io/"
|
||||||
keywords = ["deutschland", "economy", "transparenzregister", "dataintegration", "handelsregister"]
|
keywords = ["deutschland", "economy", "transparenzregister", "dataintegration", "handelsregister"]
|
||||||
|
|
||||||
maintainers = [
|
maintainers = [
|
||||||
"Philipp Horstenkamp <philipp@horstenkamp.de>",
|
"Philipp Horstenkamp <philipp@horstenkamp.de>",
|
||||||
"Tristan Nolde <contact@trisnol.dev>",
|
"Tristan Nolde <contact@trisnol.dev>",
|
||||||
@ -43,21 +42,22 @@ maintainers = [
|
|||||||
"Sascha Zhu <sascha.zhu@eugreen.de>"
|
"Sascha Zhu <sascha.zhu@eugreen.de>"
|
||||||
]
|
]
|
||||||
name = "aki-prj23-transparenzregister"
|
name = "aki-prj23-transparenzregister"
|
||||||
packages = [{ include = "aki_prj23_transparenzregister", from = "src" }]
|
packages = [{include = "aki_prj23_transparenzregister", from = "src"}]
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
repository = "https://github.com/fhswf/aki_prj23_transparenzregister"
|
repository = "https://github.com/fhswf/aki_prj23_transparenzregister"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
|
|
||||||
[tool.poetry.dependencies]
|
[tool.poetry.dependencies]
|
||||||
aenum = "^3.1.15"
|
|
||||||
SQLAlchemy = "^1.4.49"
|
SQLAlchemy = "^1.4.49"
|
||||||
|
aenum = "^3.1.15"
|
||||||
cachetools = "^5.3.1"
|
cachetools = "^5.3.1"
|
||||||
dash = "^2.13.0"
|
dash = "^2.13.0"
|
||||||
dash-auth = "^2.0.0"
|
dash-auth = "^2.0.0"
|
||||||
dash-bootstrap-components = "^1.5.0"
|
dash-bootstrap-components = "^1.5.0"
|
||||||
deutschland = { git = "https://github.com/TrisNol/deutschland.git", branch = "hotfix/python-3.11-support" }
|
deutschland = {git = "https://github.com/TrisNol/deutschland.git", branch = "hotfix/python-3.11-support"}
|
||||||
loguru = "^0.7.0"
|
loguru = "^0.7.0"
|
||||||
matplotlib = "^3.7.2"
|
matplotlib = "^3.7.2"
|
||||||
|
pgeocode = "^0.4.1"
|
||||||
psycopg2-binary = "^2.9.7"
|
psycopg2-binary = "^2.9.7"
|
||||||
pymongo = "^4.5.0"
|
pymongo = "^4.5.0"
|
||||||
python = "^3.11"
|
python = "^3.11"
|
||||||
@ -73,7 +73,7 @@ processing = []
|
|||||||
web-server = ["dash", "dash-auth", "dash-bootstrap-components", "matplotlib", "seaborn"]
|
web-server = ["dash", "dash-auth", "dash-bootstrap-components", "matplotlib", "seaborn"]
|
||||||
|
|
||||||
[tool.poetry.group.develop.dependencies]
|
[tool.poetry.group.develop.dependencies]
|
||||||
black = { extras = ["jupyter"], version = "^23.9.1" }
|
black = {extras = ["jupyter"], version = "^23.9.1"}
|
||||||
jupyterlab = "^4.0.6"
|
jupyterlab = "^4.0.6"
|
||||||
nbconvert = "^7.8.0"
|
nbconvert = "^7.8.0"
|
||||||
openpyxl = "^3.1.2"
|
openpyxl = "^3.1.2"
|
||||||
|
@ -1,7 +1,9 @@
|
|||||||
"""This module contains the data transfer and refinement functionalities between staging and production DB."""
|
"""This module contains the data transfer and refinement functionalities between staging and production DB."""
|
||||||
from datetime import date
|
from datetime import date
|
||||||
|
from functools import lru_cache
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
|
import pgeocode
|
||||||
import sqlalchemy as sa
|
import sqlalchemy as sa
|
||||||
from cachetools import LRUCache, cached
|
from cachetools import LRUCache, cached
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
@ -22,6 +24,8 @@ from aki_prj23_transparenzregister.utils.sql.connector import (
|
|||||||
)
|
)
|
||||||
from aki_prj23_transparenzregister.utils.string_tools import simplify_string
|
from aki_prj23_transparenzregister.utils.string_tools import simplify_string
|
||||||
|
|
||||||
|
nomi = pgeocode.Nominatim("de")
|
||||||
|
|
||||||
|
|
||||||
class DataInvalidError(ValueError):
|
class DataInvalidError(ValueError):
|
||||||
"""This error is thrown if a db entry can't be parsed for the production db."""
|
"""This error is thrown if a db entry can't be parsed for the production db."""
|
||||||
@ -192,6 +196,26 @@ def get_company_id(
|
|||||||
return company_id
|
return company_id
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(1000)
|
||||||
|
def get_geocodes(
|
||||||
|
zip_code: str,
|
||||||
|
) -> dict[str, float]:
|
||||||
|
"""Adds additional geo positioning data to locations.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
zip_code: The zipcode where the company is located.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
latitude, longitude and positional accuracy.
|
||||||
|
"""
|
||||||
|
if not zip_code:
|
||||||
|
return {}
|
||||||
|
zip_query = nomi.query_postal_code(zip_code)[["latitude", "longitude", "accuracy"]]
|
||||||
|
if zip_query.isna().any():
|
||||||
|
return {}
|
||||||
|
return dict(zip_query[["latitude", "longitude"]], pos_accuracy=zip_query.accuracy)
|
||||||
|
|
||||||
|
|
||||||
@logger.catch(level="WARNING", reraise=True)
|
@logger.catch(level="WARNING", reraise=True)
|
||||||
def add_company(company: dict[str, Any], db: Session) -> None:
|
def add_company(company: dict[str, Any], db: Session) -> None:
|
||||||
"""Add a company with all its data found in the mongodb company entry.
|
"""Add a company with all its data found in the mongodb company entry.
|
||||||
@ -218,6 +242,7 @@ def add_company(company: dict[str, Any], db: Session) -> None:
|
|||||||
zip_code=simplify_string(location.get("zip_code")),
|
zip_code=simplify_string(location.get("zip_code")),
|
||||||
street=simplify_string(location.get("street")),
|
street=simplify_string(location.get("street")),
|
||||||
last_update=last_update,
|
last_update=last_update,
|
||||||
|
**get_geocodes(location.get("zip_code")), # type: ignore
|
||||||
)
|
)
|
||||||
db.add(company_entry)
|
db.add(company_entry)
|
||||||
db.commit()
|
db.commit()
|
||||||
|
@ -44,6 +44,10 @@ class Company(Base):
|
|||||||
street = sa.Column(sa.String(100), nullable=True)
|
street = sa.Column(sa.String(100), nullable=True)
|
||||||
zip_code = sa.Column(sa.String(5), nullable=True)
|
zip_code = sa.Column(sa.String(5), nullable=True)
|
||||||
city = sa.Column(sa.String(100), nullable=True)
|
city = sa.Column(sa.String(100), nullable=True)
|
||||||
|
longitude = sa.Column(sa.Float, nullable=True)
|
||||||
|
latitude = sa.Column(sa.Float, nullable=True)
|
||||||
|
pos_accuracy = sa.Column(sa.Float, nullable=True)
|
||||||
|
|
||||||
last_update = sa.Column(sa.Date, nullable=False)
|
last_update = sa.Column(sa.Date, nullable=False)
|
||||||
sector = sa.Column(sa.String(100), nullable=True)
|
sector = sa.Column(sa.String(100), nullable=True)
|
||||||
|
|
||||||
|
@ -144,18 +144,24 @@ def full_db(empty_db: Session, finance_statements: list[dict[str, Any]]) -> Sess
|
|||||||
court_id=2,
|
court_id=2,
|
||||||
name="Some Company GmbH",
|
name="Some Company GmbH",
|
||||||
street="Sesamstr.",
|
street="Sesamstr.",
|
||||||
zip_code="12345",
|
zip_code="58644",
|
||||||
city="TV City",
|
city="TV City",
|
||||||
last_update=datetime.date.fromisoformat("2023-01-01"),
|
last_update=datetime.date.fromisoformat("2023-01-01"),
|
||||||
|
latitude=51.3246,
|
||||||
|
longitude=7.6968,
|
||||||
|
pos_accuracy=4.0,
|
||||||
),
|
),
|
||||||
entities.Company(
|
entities.Company(
|
||||||
hr="HRB 123",
|
hr="HRB 123",
|
||||||
court_id=1,
|
court_id=1,
|
||||||
name="Other Company GmbH",
|
name="Other Company GmbH",
|
||||||
street="Sesamstr.",
|
street="Sesamstr.",
|
||||||
zip_code="12345",
|
zip_code="58636",
|
||||||
city="TV City",
|
city="TV City",
|
||||||
last_update=datetime.date.fromisoformat("2023-01-01"),
|
last_update=datetime.date.fromisoformat("2023-01-01"),
|
||||||
|
latitude=51.38,
|
||||||
|
longitude=7.7032,
|
||||||
|
pos_accuracy=4.0,
|
||||||
),
|
),
|
||||||
entities.Company(
|
entities.Company(
|
||||||
hr="HRB 12",
|
hr="HRB 12",
|
||||||
|
@ -26,8 +26,11 @@ def test_get_company_data(full_db: Session) -> None:
|
|||||||
2: "Third Company GmbH",
|
2: "Third Company GmbH",
|
||||||
},
|
},
|
||||||
"company_street": {0: "Sesamstr.", 1: "Sesamstr.", 2: None},
|
"company_street": {0: "Sesamstr.", 1: "Sesamstr.", 2: None},
|
||||||
"company_zip_code": {0: "12345", 1: "12345", 2: None},
|
"company_zip_code": {0: "58644", 1: "58636", 2: None},
|
||||||
"company_city": {0: "TV City", 1: "TV City", 2: None},
|
"company_city": {0: "TV City", 1: "TV City", 2: None},
|
||||||
|
"company_longitude": {0: 7.6968, 1: 7.7032, 2: None},
|
||||||
|
"company_latitude": {0: 51.3246, 1: 51.38, 2: None},
|
||||||
|
"company_pos_accuracy": {0: 4.0, 1: 4.0, 2: None},
|
||||||
"company_last_update": {
|
"company_last_update": {
|
||||||
0: "2023-01-01",
|
0: "2023-01-01",
|
||||||
1: "2023-01-01",
|
1: "2023-01-01",
|
||||||
|
@ -177,12 +177,12 @@ def test_get_person_id_value_check(
|
|||||||
("name", "zip_code", "city", "id"),
|
("name", "zip_code", "city", "id"),
|
||||||
[
|
[
|
||||||
("Some Company GmbH", "", "", 1),
|
("Some Company GmbH", "", "", 1),
|
||||||
("Some Company GmbH", "12345", "", 1),
|
("Some Company GmbH", "58644", "", 1),
|
||||||
("Some Company GmbH", "12345", "TV City", 1),
|
("Some Company GmbH", "58644", "TV City", 1),
|
||||||
("Some Company GmbH", "", "TV City", 1),
|
("Some Company GmbH", "", "TV City", 1),
|
||||||
("Other Company GmbH", "", "", 2),
|
("Other Company GmbH", "", "", 2),
|
||||||
("Other Company GmbH", "12345", "", 2),
|
("Other Company GmbH", "58636", "", 2),
|
||||||
("Other Company GmbH", "12345", "TV City", 2),
|
("Other Company GmbH", "58636", "TV City", 2),
|
||||||
("Other Company GmbH", "", "TV City", 2),
|
("Other Company GmbH", "", "TV City", 2),
|
||||||
("Third Company GmbH", "", "", 3),
|
("Third Company GmbH", "", "", 3),
|
||||||
],
|
],
|
||||||
@ -672,8 +672,11 @@ def test_relationships(documents: list[dict[str, Any]], full_db: Session) -> Non
|
|||||||
2: "Third Company GmbH",
|
2: "Third Company GmbH",
|
||||||
},
|
},
|
||||||
"street": {0: "Sesamstr.", 1: "Sesamstr.", 2: None},
|
"street": {0: "Sesamstr.", 1: "Sesamstr.", 2: None},
|
||||||
"zip_code": {0: "12345", 1: "12345", 2: None},
|
"zip_code": {0: "58644", 1: "58636", 2: None},
|
||||||
"city": {0: "TV City", 1: "TV City", 2: None},
|
"city": {0: "TV City", 1: "TV City", 2: None},
|
||||||
|
"longitude": {0: 7.6968, 1: 7.7032, 2: None},
|
||||||
|
"latitude": {0: 51.3246, 1: 51.38, 2: None},
|
||||||
|
"pos_accuracy": {0: 4.0, 1: 4.0, 2: None},
|
||||||
"last_update": {
|
"last_update": {
|
||||||
0: pd.Timestamp("2023-01-01 00:00:00"),
|
0: pd.Timestamp("2023-01-01 00:00:00"),
|
||||||
1: pd.Timestamp("2023-01-01 00:00:00"),
|
1: pd.Timestamp("2023-01-01 00:00:00"),
|
||||||
@ -1014,3 +1017,18 @@ def test_add_annual_report_financial_key_error(full_db: Session) -> None:
|
|||||||
{"financials": {"something-strange": 123.12}, "auditors": {}},
|
{"financials": {"something-strange": 123.12}, "auditors": {}},
|
||||||
db=full_db,
|
db=full_db,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.working_on()
@pytest.mark.parametrize(
    ("zip_code", "results"),
    [
        ("44809", {"latitude": 51.4997, "longitude": 7.1944, "pos_accuracy": 4.0}),
        (None, {}),
        ("", {}),
        ("60547", {}),
        ("58590", {}),
    ],
)
def test_get_geocodes(zip_code: str | None, results: dict) -> None:
    """Checks that get_geocodes returns the expected dict for each parametrized zip code."""
    geocodes = data_transfer.get_geocodes(zip_code)
    assert geocodes == results
|
||||||
|
Reference in New Issue
Block a user