# News

## Tagesschau API

In [59]:
import json
import requests
from bs4 import BeautifulSoup


class TagesschauAPI:
    """Small client for the public Tagesschau JSON API (``api2``).

    All requests are sent with a timeout and HTTP error responses are raised
    instead of being parsed, so a failing endpoint is noticed immediately.
    """

    # The API numbers the German federal states 1..16; all of them are
    # requested so that no regional news item is filtered out.
    REGION_IDS = tuple(range(1, 17))

    def __init__(self, timeout: float = 10.0):
        """Create a client.

        Args:
            timeout: Seconds to wait for a server response before
                ``requests`` gives up. Without it a stalled connection would
                block the notebook forever.
        """
        self.base_url = "https://www.tagesschau.de/api2"
        self.timeout = timeout

    def get_news_for_sector(self, sector: str) -> dict:
        """Fetch the news feed of one section ("ressort") for all regions.

        Args:
            sector: Value sent as the ``ressort`` query parameter,
                e.g. ``"wirtschaft"``.

        Returns:
            The decoded JSON response body.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
        """
        url = f"{self.base_url}/news/"
        regions = ",".join(str(region_id) for region_id in self.REGION_IDS)
        result = requests.get(
            url=url,
            params={"regions": regions, "ressort": sector},
            timeout=self.timeout,
        )
        result.raise_for_status()
        return result.json()

    def custom_search(self, query: str) -> dict:
        """Run a free-text search.

        Args:
            query: Value sent as the ``searchText`` query parameter.

        Returns:
            The decoded JSON response body.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
        """
        url = f"{self.base_url}/search/"
        result = requests.get(
            url=url,
            params={"searchText": query},
            timeout=self.timeout,
        )
        result.raise_for_status()
        return result.json()

    def get_news_details_text(self, url: str) -> str:
        """Download a page and return the text of all its ``<p>`` elements.

        Args:
            url: Address of the HTML page to download.

        Returns:
            The text content of every ``<p>`` tag, joined by single spaces.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status,
                so that an error page is never mistaken for article text.
        """
        content = requests.get(url, timeout=self.timeout)
        content.raise_for_status()
        soup = BeautifulSoup(content.text, features="html.parser")

        return " ".join(elem.text for elem in soup.find_all("p"))


# One shared client instance; the following cells reuse `tagesschau`.
tagesschau = TagesschauAPI()

# Pull the "wirtschaft" feed and keep a raw copy of the response on disk.
data = tagesschau.get_news_for_sector(sector="wirtschaft")
with open(file="./data/temp.json", mode="w+", encoding="utf-8") as file:
    # ensure_ascii=False writes non-ASCII characters as-is instead of \uXXXX escapes.
    json.dump(obj=data, fp=file, ensure_ascii=False)

In [54]:
from tqdm import tqdm
import pandas as pd


# Build one flat record per article. The "text" field is obtained by
# downloading the page behind "detailsweb", i.e. one HTTP request per item.
news = [
    {
        "id": article["externalId"],
        "title": article["title"],
        "date": article["date"],
        "text": tagesschau.get_news_details_text(article["detailsweb"]),
    }
    for article in tqdm(data["news"])
]

# Assemble the table once from the finished list of records.
df = pd.DataFrame(news)
df.head()

100%|██████████| 51/51 [00:14<00:00,  3.64it/s]


Unnamed: 0,id,title,date,text
0,836120ce-9602-4296-9b04-8eff4da34be5,Befeuert Beyoncé die schwedische Inflation?,2023-06-15T16:31:26.141+02:00,Stand: 15.06.2023 16:31 Uhr \nDie Inflationsra...
1,5cc61bcb-d290-4114-b608-0d5aba426f27,EZB-Zinsentscheid mit fadem Beigeschmack,2023-06-15T16:18:19.844+02:00,Stand: 15.06.2023 16:18 Uhr \nNach der Zinserh...
2,091386f3-78b5-4180-8d5b-7c5b371aed93,Leitzins in der Eurozone steigt auf vier Prozent,2023-06-15T14:16:40.611+02:00,Stand: 15.06.2023 14:16 Uhr \nDie Europäische ...
3,add3a48e-d847-48eb-9e0d-2a1db3daf00c,"Überschwemmte Äcker, bedrohte Ernten",2023-06-15T14:05:59.591+02:00,Stand: 15.06.2023 14:05 Uhr \nVom Kachowka-Sta...
4,54453278-1e7d-4bc3-b33f-ea84f2daa7ac,Forscher erwarten 2023 schrumpfende Wirtschaft,2023-06-15T12:52:14.804+02:00,Stand: 15.06.2023 12:52 Uhr \nMehrere Wirtscha...


In [55]:
# Write the article table as a JSON array with one object per row;
# force_ascii=False keeps non-ASCII characters unescaped.
df.to_json(
    path_or_buf="./data/news.json",
    orient="records",
    force_ascii=False,
)

In [60]:
# Free-text search; the bare name on the last line renders the raw response.
custom_search_data = tagesschau.custom_search(query="Haltern am See")
custom_search_data

{'details': 'https://www.tagesschau.de/api2u/search',
 'searchText': 'Haltern am See',
 'pageSize': 25,
 'resultPage': 0,
 'totalItemCount': 3,
 'searchResults': [{'sophoraId': 'wdr-erneut-rettungsboot-motoren-der-dlrg-in-haltern-geklaut-100',
   'externalId': 'tagesschau_fm-story-WDR-c6323d1e-c262-4414-946c-e063dc4b9791',
   'title': 'Erneut Rettungsboot-Motoren der DLRG in Haltern geklaut',
   'date': '2023-06-14T13:26:00.000+02:00',
   'teaserImage': {'title': 'Diebe klauen Rettungsboot-Motoren der DLRG Haltern',
    'copyright': 'WDR/Markus Holtrichter',
    'alttext': 'Ein Mitglied der DLRG Haltern rudert im Paddelboot.',
    'imageVariants': {'1x1-144': 'https://images.tagesschau.de/image/c77cdc33-f678-4a0e-b411-3e98039c67d3/AAABiLnqOsk/AAABg8tMNQQ/1x1-144.jpg',
     '1x1-256': 'https://images.tagesschau.de/image/c77cdc33-f678-4a0e-b411-3e98039c67d3/AAABiLnqOsk/AAABg8tMOLk/1x1-256.jpg',
     '1x1-432': 'https://images.tagesschau.de/image/c77cdc33-f678-4a0e-b411-3e98039c67d3/AAABi