add include script
This commit is contained in:
parent
597eaadc5f
commit
1ec8088a06
1 changed files with 116 additions and 0 deletions
116
include/01_geodaten_holen.py
Normal file
116
include/01_geodaten_holen.py
Normal file
|
|
@ -0,0 +1,116 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
01_geodaten_holen.py
|
||||
--------------------
|
||||
Lädt die deutschen PLZ-Koordinaten (Geokodierungs-Referenz) herunter und
|
||||
erzeugt die Datei output/geo_plz_koordinaten.csv mit den Spalten:
|
||||
|
||||
plz, lat, lon, ort, bundesland
|
||||
|
||||
Primärquelle: GeoNames (https://download.geonames.org/export/zip/DE.zip)
|
||||
Lizenz CC-BY 4.0 — Quellenangabe "GeoNames" genügt.
|
||||
Fallback: OpenPLZ API (https://openplzapi.org), falls GeoNames
|
||||
nicht erreichbar ist (liefert keine Koordinaten, daher
|
||||
nur als Notnagel für Ort/Bundesland).
|
||||
|
||||
Aufruf: python 01_geodaten_holen.py
|
||||
"""
|
||||
|
||||
import io
|
||||
import sys
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import requests
|
||||
|
||||
# Download location of the GeoNames postal-code archive for Germany.
GEONAMES_URL = "https://download.geonames.org/export/zip/DE.zip"

# The result CSV is written to an "output" folder next to this script.
OUTPUT_DIR = Path(__file__).parent.joinpath("output")
OUTPUT_CSV = OUTPUT_DIR.joinpath("geo_plz_koordinaten.csv")

# Column names of the GeoNames postal-code file (tab-separated, no header row).
GEONAMES_COLS = [
    "country_code",
    "plz",
    "ort",
    "bundesland",
    "bundesland_code",
    "kreis",
    "kreis_code",
    "gemeinde",
    "gemeinde_code",
    "lat",
    "lon",
    "accuracy",
]
|
||||
|
||||
|
||||
def lade_geonames() -> pd.DataFrame:
    """Download the GeoNames postal-code archive and condense it to one row per PLZ.

    The archive is fetched from ``GEONAMES_URL``; its member ``DE.txt`` is
    parsed as a tab-separated file without header, using ``GEONAMES_COLS``
    as column names.

    Returns:
        A DataFrame with the columns ``plz``, ``lat``, ``lon``, ``ort`` and
        ``bundesland``. ``lat``/``lon`` are the mean over all rows sharing a
        PLZ, rounded to 5 decimals; ``ort``/``bundesland`` are taken with
        the pandas ``first`` aggregation.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    print(f"Lade {GEONAMES_URL} ...")
    antwort = requests.get(GEONAMES_URL, timeout=120)
    antwort.raise_for_status()

    # The archive is unpacked in memory; nothing is written to disk here.
    with zipfile.ZipFile(io.BytesIO(antwort.content)) as archiv, \
            archiv.open("DE.txt") as quelle:
        roh = pd.read_csv(
            quelle,
            sep="\t",
            header=None,
            names=GEONAMES_COLS,
            dtype={"plz": str},  # read as text so leading zeros survive
            encoding="utf-8",
        )
    print(f" {len(roh)} Zeilen geladen (eine Zeile je PLZ/Ortsteil).")

    # One PLZ can cover several places/districts -> collapse to a single
    # coordinate per PLZ (mean of the points) and keep the first place name.
    verdichtung = {
        "lat": "mean",
        "lon": "mean",
        "ort": "first",
        "bundesland": "first",
    }
    geo = roh.groupby("plz", as_index=False).agg(verdichtung)
    for achse in ("lat", "lon"):
        geo[achse] = geo[achse].round(5)
    return geo
|
||||
|
||||
|
||||
def lade_openplz_fallback() -> pd.DataFrame:
    """Fetch place names and federal states from the OpenPLZ API as a fallback.

    The rows carry no coordinates (``lat``/``lon`` are always ``None``); the
    function only exists so that the ETL run does not fail outright when
    GeoNames cannot be loaded.

    Returns:
        A DataFrame with the columns ``plz``, ``lat``, ``lon``, ``ort`` and
        ``bundesland``, one row per distinct ``plz`` (first occurrence wins).
        The columns are present even when the API returned no records.

    Raises:
        requests.HTTPError: if a page request answers with an error status.
    """
    columns = ["plz", "lat", "lon", "ort", "bundesland"]
    max_pages = 400  # safety brake against endless pagination

    print("GeoNames nicht erreichbar — Fallback auf OpenPLZ (ohne Koordinaten).")
    rows = []
    for page in range(1, max_pages + 1):
        r = requests.get(
            "https://openplzapi.org/de/Localities",
            params={"page": page, "pageSize": 50},
            timeout=60,
        )
        r.raise_for_status()
        data = r.json()
        if not data:
            # An empty page marks the end of the result set.
            break
        rows.extend(
            {
                "plz": item.get("postalCode"),
                "lat": None,
                "lon": None,
                "ort": item.get("name"),
                # federalState may be missing or null in a record.
                "bundesland": (item.get("federalState") or {}).get("name"),
            }
            for item in data
        )
    else:
        # Loop ended without hitting an empty page: the result may be cut off.
        print(
            f" Warnung: Sicherheitsbremse nach {max_pages} Seiten erreicht — "
            "Ergebnis möglicherweise unvollständig.",
            file=sys.stderr,
        )

    # Explicit columns keep the frame well-formed when no rows were collected;
    # without them drop_duplicates(subset="plz") raises KeyError on an empty frame.
    return pd.DataFrame(rows, columns=columns).drop_duplicates(subset="plz")
|
||||
|
||||
|
||||
def main() -> None:
    """Write the PLZ reference CSV, preferring GeoNames and falling back to OpenPLZ.

    Creates ``OUTPUT_DIR`` if needed, loads the data via ``lade_geonames()``
    (or ``lade_openplz_fallback()`` when that raises), writes the result to
    ``OUTPUT_CSV`` and prints a short plausibility report about postal codes
    with a leading zero.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    try:
        geo = lade_geonames()
    except Exception as exc:
        # Deliberate catch-all at the script boundary: any failure of the
        # primary source (network, archive, parsing) triggers the fallback.
        print(f" Fehler: {exc}", file=sys.stderr)
        geo = lade_openplz_fallback()

    geo.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
    print(f"OK: {len(geo)} PLZ geschrieben nach {OUTPUT_CSV}")

    # Plausibility check: were leading zeros preserved?
    # na=False treats a missing PLZ (possible in fallback rows) as "no match";
    # otherwise the mask contains NA and the boolean indexing raises ValueError.
    nullen = geo[geo["plz"].str.startswith("0", na=False)]
    print(f" davon {len(nullen)} PLZ mit führender Null (z. B. {nullen['plz'].head(3).tolist()})")


# Run the download only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue