"""Scrape kaiser-natron.at product pages → JSON.

stdlib-only by design (no requests/bs4) so the venv stays minimal. The site's
product pages all share the same HTML skeleton:

    <h1>{title}</h1>
    <h2>Produktinformation:</h2>
    <p>...</p>
    <h2>Erhältlich in:</h2>
    <p>...</p>
    <h2>Produkteigenschaften:</h2>
    <ul><li>...</li></ul>
    <h2>Anwendung:</h2>
    <p><strong>{useCaseTitle}:</strong> {body}</p>
    ...

So a tiny regex pass is robust enough — no need for a DOM parser.

NOTE(review): the tag names in the skeleton above (and in the regexes below)
were reconstructed; the exact heading level used by the site is an
assumption, which is why the heading patterns accept any of h1-h6. Confirm
against a live product page.

Output: scripts/output/products-content.json (one entry per slug).
Re-run the script any time the source site is updated.
"""

from __future__ import annotations

import html as html_lib
import json
import re
import urllib.parse
import urllib.request
from pathlib import Path

OUTPUT = Path(__file__).resolve().parent / "output" / "products-content.json"
UA = "kaiser-natron-shop-build/1.0 (+https://github.com/Zazawowow/kaiser-natron)"
BASE = "https://kaiser-natron.at/"

# (our slug used in src/api/products.js, their URL path).
PRODUCTS: list[tuple[str, str]] = [
    ("kaiser-natron-pulver-50-g-beutel", "kaiser-natron-pulver-50-g-beutel"),
    ("kaiser-natron-pulver-250-g-grosspackung", "kaiser-natron-pulver-250-g-großpackung"),
    ("kaiser-natron-pulver-3490-g-eimer", "kaiser-natron-pulver-3-490-g-eimer"),
    ("kaiser-natron-tabletten-100-g-dose", "kaiser-natron-tabletten-100-g-dose"),
    ("kaiser-natron-bad-500-g", "kaiser-natron-bad-gelb-500-g-dose"),
    ("kaiser-natron-fussbad-500-g", "kaiser-natron-fußbad-500-g-dose"),
    ("kaiser-natron-daunenwasch-250-ml", "kaiser-natron-daunen-wasch-250-ml-flasche"),
    ("kaiser-natron-sport-profi-250-ml", "kaiser-natron-sport-profi-250-ml-flasche"),
    ("kaiser-natron-spuelmittel-500-ml", "kaiser-natron-spülmittel-500-ml-flasche"),
    ("kaiser-natron-allzweck-reiniger-750-ml", "kaiser-natron-allzweckreiniger-750-ml-flasche"),
    ("kaiser-natron-allzweck-spray-500-ml", "kaiser-natron-allzweck-spray-500-ml-flasche"),
    ("holste-wasch-soda-500-g-beutel", "holste-wasch-soda-500-g-beutel"),
    ("holste-handwaschpaste-500-ml", "holste-handwaschpaste-500-ml"),
    ("holste-kalk-und-urinsteinloeser-750-ml", "holste-kalk-und-urinsteinlöser-750-ml"),
    ("holste-reisstaerke-250-g-faltschachtel", "holste-reisstärke-250-g-faltschachtel"),
    ("holste-schmierseife-fluessig-1-l-flasche", "holste-schmierseife-flüssig-1-l-flasche"),
    (
        "holste-zitronensaeure-entkalker-fluessig-500-ml",
        "holste-zitronensäure-entkalker-flüssig-500-ml",
    ),
    ("gazelle-waeschestaerke-1000-ml-flasche", "gazelle-wäschestärke-1000-ml-flasche"),
    ("gruene-tante-mit-quarzmehl-500-ml-dose", "grüne-tante-mit-quarzmehl-500-ml-dose"),
    ("linda-fleckenweg-200-ml-tube", "linda-fleckenweg-200-ml-tube"),
    (
        "linda-handreiniger-der-kraftvolle-200-g-tube",
        "linda-handreiniger-der-kraftvolle-200-g-tube",
    ),
    ("linda-neutral-375-ml-dose", "linda-neutral-375-ml-dose"),
]


def _build_url(path: str) -> str:
    """Return the absolute source URL for *path*, percent-encoding umlauts/ß."""
    return BASE + urllib.parse.quote(path, safe="-/")


def fetch(path: str) -> str:
    """Download the product page at *path* (relative to BASE) and return its HTML.

    The body is decoded with the charset declared in the response headers,
    falling back to UTF-8 when the server declares none. Network and decoding
    errors propagate to the caller.
    """
    req = urllib.request.Request(
        _build_url(path),
        headers={"User-Agent": UA, "Accept-Language": "de-AT,de;q=0.9"},
    )
    with urllib.request.urlopen(req, timeout=15) as r:
        charset = r.headers.get_content_charset() or "utf-8"
        return r.read().decode(charset)


# --- HTML helpers --------------------------------------------------------

TAG = re.compile(r"<[^>]+>")
WS = re.compile(r"\s+")

# Line breaks must become whitespace, otherwise "a<br>b" collapses to "ab".
_BR = re.compile(r"<br\s*/?>", re.IGNORECASE)

# NOTE(review): element names below are reconstructed assumptions about the
# site's markup (see module docstring) — confirm against a live page.
_BOLD = re.compile(r"<(strong|b)\b[^>]*>(.+?)</\1\s*>", re.DOTALL | re.IGNORECASE)
_BOLD_THEN_REST = re.compile(
    r"<(strong|b)\b[^>]*>(.+?)</\1\s*>(.*)", re.DOTALL | re.IGNORECASE
)
_PARAGRAPH = re.compile(r"<p\b[^>]*>(.*?)</p\s*>", re.DOTALL | re.IGNORECASE)
_LIST_ITEM = re.compile(r"<li\b[^>]*>(.*?)</li\s*>", re.DOTALL | re.IGNORECASE)
_H1 = re.compile(r"<h1\b[^>]*>(.*?)</h1\s*>", re.DOTALL | re.IGNORECASE)


def strip_tags(s: str) -> str:
    """Return *s* as plain text: tags removed, entities decoded, whitespace collapsed."""
    without_tags = TAG.sub("", _BR.sub(" ", s))
    return WS.sub(" ", html_lib.unescape(without_tags)).strip()


def extract_block(html: str, heading: str) -> str | None:
    """Return the HTML between the `{heading}:` heading and the next heading.

    The section ends at the next heading element, at the first closing
    `</div>` (end of the product-info div), or at the end of the document,
    whichever comes first. Matching is case-insensitive and the trailing
    colon in the heading is optional. Returns None when the heading is
    not present.
    """
    pattern = (
        rf"<h[1-6]\b[^>]*>\s*{re.escape(heading)}\s*:?\s*</h[1-6]\s*>"
        r"(.*?)(?=<h[1-6][\s>]|</div>|\Z)"
    )
    m = re.search(pattern, html, re.DOTALL | re.IGNORECASE)
    return m.group(1) if m else None


def normalise_tagline(s: str) -> str:
    """Trim trailing colon, but keep terminal punctuation (`.`, `!`, `?`)
    that the brand voice uses to land each tagline.

    A full stop is appended when the tagline has no terminal punctuation.
    An empty (or colon-only) input yields an empty string.
    """
    s = s.strip().rstrip(":")
    if not s:
        return s
    if s[-1] not in ".!?":
        s = s + "."
    return s


def parse_product_info(block: str) -> tuple[str | None, str | None, str | None]:
    """Pulls (tagline, lead, descriptionLong).

    Source pattern is `<p><strong>{TAGLINE}</strong> {body}</p>` — we treat
    the bolded sentence as the tagline and the rest of the first paragraph
    as the lead. Subsequent paragraphs (rare on this site) become the long
    description. Any element that cannot be found is returned as None.
    """
    if not block:
        return None, None, None

    # Pull the bold tagline straight out of the raw HTML before tags get
    # stripped; the *position* matters because lead = first paragraph
    # minus the tagline span.
    bold = _BOLD.search(block)
    tagline = (normalise_tagline(strip_tags(bold.group(2))) or None) if bold else None

    paragraphs = [text for text in map(strip_tags, _PARAGRAPH.findall(block)) if text]
    if not paragraphs:
        return tagline, None, None

    first = paragraphs[0]
    lead: str | None = first
    if tagline:
        # Tagline appears at start of paragraph; remove it plus any
        # joining punctuation (`:` / `.` / spaces) so the lead reads
        # cleanly without a leftover colon.
        bare = tagline.rstrip(".!?:")
        lead = re.sub(rf"^{re.escape(bare)}[\s\.:!?]*", "", first).strip()
    if not lead:
        lead = None

    rest = " ".join(paragraphs[1:]).strip() or None
    return tagline, lead, rest


def parse_properties(block: str | None) -> list[str]:
    """Return the non-empty plain-text list items found in *block*."""
    if not block:
        return []
    return [text for text in map(strip_tags, _LIST_ITEM.findall(block)) if text]


def parse_applications(block: str | None) -> list[dict]:
    """Return the use cases in *block* as `{"title", "body"}` dicts.

    Use cases come in two flavours on the source site:

    A) `<p><strong>Title:</strong> Body</p>` — separate label + body
    B) `<p>{single instruction}</p>` — body only, no label

    For (B) we leave `title` null so the page can render it as a plain
    instruction card without an h3 above. Anything else is dropped.
    """
    if not block:
        return []

    out = []
    for p_html in _PARAGRAPH.findall(block):
        m = _BOLD_THEN_REST.search(p_html)
        if m:
            title = strip_tags(m.group(2)).rstrip(":.").strip() or None
            # The colon may sit outside the bold span; drop it from the body.
            body = strip_tags(m.group(3)).lstrip(":").strip() or None
        else:
            title = None
            body = strip_tags(p_html) or None
        if title or body:
            out.append({"title": title, "body": body})
    return out


def parse_h1(html: str) -> str | None:
    """Return the plain text of the first `<h1>` in *html*, or None if absent."""
    m = _H1.search(html)
    return strip_tags(m.group(1)) if m else None


def scrape_one(slug: str, path: str) -> dict:
    """Fetch one product page and return its parsed content as a JSON-ready dict.

    *slug* is our identifier and is stored as-is; *path* is the page path on
    the source site. Sections missing from the page yield None / empty lists.
    Errors raised by `fetch` propagate.
    """
    html = fetch(path)

    info_block = extract_block(html, "Produktinformation")
    props_block = extract_block(html, "Produkteigenschaften")
    apps_block = extract_block(html, "Anwendung")
    avail_block = extract_block(html, "Erhältlich in")

    tagline, lead, long_desc = parse_product_info(info_block or "")
    props = parse_properties(props_block)
    apps = parse_applications(apps_block)

    available_in: list[str] = []
    if avail_block:
        available_in = [
            text for text in map(strip_tags, _PARAGRAPH.findall(avail_block)) if text
        ]

    return {
        "slug": slug,
        "sourceUrl": _build_url(path),
        "sourceTitle": parse_h1(html),
        "tagline": tagline,
        "lead": lead,
        "descriptionLong": long_desc,
        "properties": props,
        "applications": apps,
        "availableIn": available_in,
    }


def main() -> None:
    """Scrape every entry in PRODUCTS and write the combined JSON to OUTPUT.

    Scraping is best-effort: a product that fails is reported and skipped so
    one broken page does not abort the whole run. Skipped slugs are listed
    again at the end because they will be missing from the written file.
    """
    OUTPUT.parent.mkdir(parents=True, exist_ok=True)

    out: dict[str, dict] = {}
    failed: list[str] = []
    for slug, path in PRODUCTS:
        try:
            data = scrape_one(slug, path)
        except Exception as exc:  # top-level boundary: report and keep going
            print(f" FAIL {slug}: {exc}")
            failed.append(slug)
            continue
        out[slug] = data
        n_apps = len(data["applications"])
        n_props = len(data["properties"])
        print(f" ok {slug} ({n_props} props, {n_apps} apps)")

    OUTPUT.write_text(json.dumps(out, ensure_ascii=False, indent=2), encoding="utf-8")

    # relative_to() raises ValueError when the script is launched from a
    # directory that is not an ancestor of OUTPUT; show the full path then.
    try:
        shown = OUTPUT.relative_to(Path.cwd())
    except ValueError:
        shown = OUTPUT
    print(f"\nWrote {shown} ({len(out)} products)")
    if failed:
        print(f"{len(failed)} product(s) failed and were skipped: {', '.join(failed)}")


if __name__ == "__main__":
    main()