# kaiser-natron/scripts/scrape_kaiser_natron.py

"""Scrape kaiser-natron.at product pages → JSON.

stdlib-only by design (no requests/bs4) so the venv stays minimal. The
site's product pages all share the same HTML skeleton:

    <h1>{title}</h1>
    <h2>Produktinformation:</h2><p>...</p>
    <h2>Erhältlich in:</h2><p>...</p>
    <h2>Produkteigenschaften:</h2><ul><li>...</li></ul>
    <h2>Anwendung:</h2><p><strong>{useCaseTitle}: </strong>{body}</p>...

So a tiny regex pass is robust enough — no need for a DOM parser.

Output: scripts/output/products-content.json (one entry per slug).
Re-run the script any time the source site is updated.
"""

from __future__ import annotations

import html as html_lib
import json
import re
import urllib.parse
import urllib.request
from pathlib import Path

# Destination JSON file: `output/products-content.json` next to this script.
OUTPUT = Path(__file__).resolve().parent / "output" / "products-content.json"

# User-Agent header sent with every page request.
UA = "kaiser-natron-shop-build/1.0 (+https://github.com/Zazawowow/kaiser-natron)"
# Site root; product paths are percent-encoded and appended to it.
BASE = "https://kaiser-natron.at/"

# Pairs of (our slug used in src/api/products.js, their URL path).
# The slug becomes the key in the output JSON; the path is appended to BASE.
# Paths may contain non-ASCII characters (umlauts, ß) — they are
# percent-encoded at request time, so keep them as they appear on the site.
PRODUCTS: list[tuple[str, str]] = [
    ("kaiser-natron-pulver-50-g-beutel",                 "kaiser-natron-pulver-50-g-beutel"),
    ("kaiser-natron-pulver-250-g-grosspackung",          "kaiser-natron-pulver-250-g-großpackung"),
    ("kaiser-natron-pulver-3490-g-eimer",                "kaiser-natron-pulver-3-490-g-eimer"),
    ("kaiser-natron-tabletten-100-g-dose",               "kaiser-natron-tabletten-100-g-dose"),
    ("kaiser-natron-bad-500-g",                          "kaiser-natron-bad-gelb-500-g-dose"),
    ("kaiser-natron-fussbad-500-g",                      "kaiser-natron-fußbad-500-g-dose"),
    ("kaiser-natron-daunenwasch-250-ml",                 "kaiser-natron-daunen-wasch-250-ml-flasche"),
    ("kaiser-natron-sport-profi-250-ml",                 "kaiser-natron-sport-profi-250-ml-flasche"),
    ("kaiser-natron-spuelmittel-500-ml",                 "kaiser-natron-spülmittel-500-ml-flasche"),
    ("kaiser-natron-allzweck-reiniger-750-ml",           "kaiser-natron-allzweckreiniger-750-ml-flasche"),
    ("kaiser-natron-allzweck-spray-500-ml",              "kaiser-natron-allzweck-spray-500-ml-flasche"),
    ("holste-wasch-soda-500-g-beutel",                   "holste-wasch-soda-500-g-beutel"),
    ("holste-handwaschpaste-500-ml",                     "holste-handwaschpaste-500-ml"),
    ("holste-kalk-und-urinsteinloeser-750-ml",           "holste-kalk-und-urinsteinlöser-750-ml"),
    ("holste-reisstaerke-250-g-faltschachtel",           "holste-reisstärke-250-g-faltschachtel"),
    ("holste-schmierseife-fluessig-1-l-flasche",         "holste-schmierseife-flüssig-1-l-flasche"),
    ("holste-zitronensaeure-entkalker-fluessig-500-ml",  "holste-zitronensäure-entkalker-flüssig-500-ml"),
    ("gazelle-waeschestaerke-1000-ml-flasche",           "gazelle-wäschestärke-1000-ml-flasche"),
    ("gruene-tante-mit-quarzmehl-500-ml-dose",           "grüne-tante-mit-quarzmehl-500-ml-dose"),
    ("linda-fleckenweg-200-ml-tube",                     "linda-fleckenweg-200-ml-tube"),
    ("linda-handreiniger-der-kraftvolle-200-g-tube",     "linda-handreiniger-der-kraftvolle-200-g-tube"),
    ("linda-neutral-375-ml-dose",                        "linda-neutral-375-ml-dose"),
]


def fetch(path: str) -> str:
    """Download one product page and return its HTML as text.

    `path` is the site-relative page path. It may contain non-ASCII
    characters (umlauts, ß); everything except `-` and `/` is
    percent-encoded before the request is made.

    The body is decoded with the charset declared in the response's
    Content-Type header, falling back to UTF-8 when none is declared.

    Raises whatever `urllib.request.urlopen` raises (network errors, HTTP
    errors, timeout after 15 s) and `UnicodeDecodeError` if the body does
    not match the charset used.
    """
    url = BASE + urllib.parse.quote(path, safe="-/")
    req = urllib.request.Request(url, headers={"User-Agent": UA, "Accept-Language": "de-AT,de;q=0.9"})
    with urllib.request.urlopen(req, timeout=15) as r:
        # Honour the server-declared charset instead of assuming UTF-8.
        charset = r.headers.get_content_charset() or "utf-8"
        return r.read().decode(charset)


# --- HTML helpers --------------------------------------------------------

# Any single HTML tag: opening, closing, or self-closing.
TAG = re.compile(r"<[^>]+>")
# A run of whitespace, collapsed to one space by strip_tags.
WS = re.compile(r"\s+")
# Line-break tags (<br>, <br/>, <br />, any case). These separate words
# visually, so they must become whitespace rather than vanish.
_BR = re.compile(r"<br\b[^>]*>", re.IGNORECASE)


def strip_tags(s: str) -> str:
    """Return the plain text of an HTML fragment.

    `<br>` tags are replaced by a space so that the words on either side
    of a line break stay separated; every other tag is removed without a
    replacement. HTML entities are decoded *after* tag removal, so escaped
    markup such as `&lt;b&gt;` survives as literal text. Finally, runs of
    whitespace are collapsed to a single space and the result is trimmed.
    """
    s = _BR.sub(" ", s)
    return WS.sub(" ", html_lib.unescape(TAG.sub("", s))).strip()


def extract_block(html: str, heading: str) -> str | None:
    """Return the HTML that follows the `<h2>` section titled `heading`.

    The heading is matched case-insensitively, with optional surrounding
    whitespace and an optional trailing colon. The `<h2>` tag may carry
    attributes (e.g. `class`), matching the tolerance the other tag
    patterns in this file have.

    The returned fragment runs from just after `</h2>` up to (not
    including) the next `<h2>` or the next `</div>`, whichever comes first.
    Returns None when the heading is absent or when neither terminator
    follows it.
    """
    pattern = rf"<h2\b[^>]*>\s*{re.escape(heading)}\s*:?\s*</h2>(.*?)(?=<h2\b|</div>)"
    m = re.search(pattern, html, re.DOTALL | re.IGNORECASE)
    return m.group(1) if m else None


def normalise_tagline(s: str) -> str:
    """Trim trailing colons, but keep terminal punctuation (`.`, `!`, `?`)
    that the brand voice uses to land each tagline.

    Surrounding whitespace is removed, including any whitespace that sat
    between the text and a trailing colon (so "Foo :" becomes "Foo.", not
    "Foo ."). A full stop is appended when the result does not already end
    in `.`, `!` or `?`. An input that is empty after trimming is returned
    as the empty string.
    """
    s = s.strip()
    # Peel colons one at a time, re-trimming each time, so "Foo : :" is
    # fully cleaned.
    while s.endswith(":"):
        s = s[:-1].rstrip()
    if not s:
        return s
    if s[-1] not in ".!?":
        s += "."
    return s


def parse_product_info(block: str | None) -> tuple[str | None, str | None, str | None]:
    """Pulls (tagline, lead, descriptionLong).

    Source pattern is `<p><strong>{TAGLINE}</strong> {body}</p>` — we
    treat the bolded sentence as the tagline and the rest of the first
    paragraph as the lead. Subsequent paragraphs (rare on this site)
    become the long description, joined with single spaces.

    Each element is None when it has no content; an empty or missing
    `block` yields `(None, None, None)`.
    """
    if not block:
        return None, None, None

    # Pull the bold tagline straight out of the raw HTML before tags get
    # stripped; the *position* matters because lead = first paragraph
    # minus the tagline span.
    bold = re.search(r"<strong[^>]*>(.+?)</strong>", block, re.DOTALL)
    tagline = None
    if bold:
        # A bold span holding only whitespace or markup normalises to "";
        # report that as "no tagline" rather than an empty string.
        tagline = normalise_tagline(strip_tags(bold.group(1))) or None

    paragraphs = [strip_tags(p) for p in re.findall(r"<p[^>]*>(.*?)</p>", block, re.DOTALL)]
    paragraphs = [p for p in paragraphs if p]
    if not paragraphs:
        return tagline, None, None

    first = paragraphs[0]
    lead = first
    if tagline:
        # Tagline appears at start of paragraph; remove it plus any
        # joining punctuation (`:` / `.` / spaces) so the lead reads
        # cleanly without a leftover colon. The match is anchored, so a
        # tagline found elsewhere leaves the paragraph untouched.
        bare = tagline.rstrip(".!?:")
        lead = re.sub(rf"^{re.escape(bare)}[\s\.:!?]*", "", first).strip()

    rest = " ".join(paragraphs[1:]).strip() or None
    return tagline, lead or None, rest


def parse_properties(block: str | None) -> list[str]:
    """Return the plain text of each non-empty `<li>` in `block`, in
    document order. An empty or missing block yields an empty list."""
    if not block:
        return []
    item_texts = (strip_tags(raw) for raw in re.findall(r"<li[^>]*>(.*?)</li>", block, re.DOTALL))
    return [text for text in item_texts if text]


def parse_applications(block: str | None) -> list[dict]:
    """Turn the "Anwendung" section into a list of use-case dicts.

    Each `<p>` becomes `{"title": ..., "body": ...}`. Two shapes occur:

      A) `<p><strong>Title: </strong>Body</p>`  — label plus body; the
         label loses trailing `:`/`.`, the body loses a leading `:`
      B) `<p>{single instruction}</p>`           — body only, `title` is None
         so the page can render a plain instruction card

    Empty parts are stored as None, and a paragraph with neither title nor
    body is skipped. Text before the `<strong>` in shape (A) is not kept.
    """
    if not block:
        return []

    use_cases = []
    for paragraph in re.findall(r"<p[^>]*>(.*?)</p>", block, re.DOTALL):
        labelled = re.search(r"<strong[^>]*>(.+?)</strong>(.*)", paragraph, re.DOTALL)
        if labelled is None:
            title = None
            body = strip_tags(paragraph) or None
        else:
            label_html, body_html = labelled.groups()
            title = strip_tags(label_html).rstrip(":.").strip() or None
            body = strip_tags(body_html).lstrip(":").strip() or None

        # Both values are either None or non-empty at this point.
        if title is None and body is None:
            continue
        use_cases.append({"title": title, "body": body})
    return use_cases


def parse_h1(html: str) -> str | None:
    """Return the plain text of the first `<h1>` in `html`, or None if the
    page has none.

    The tag is matched case-insensitively and may carry attributes (e.g.
    `class`), in line with how the other tag patterns in this file behave.
    """
    m = re.search(r"<h1\b[^>]*>(.*?)</h1>", html, re.DOTALL | re.IGNORECASE)
    return strip_tags(m.group(1)) if m else None


def scrape_one(slug: str, path: str) -> dict:
    """Fetch one product page and return its structured content.

    `slug` is stored as-is in the result; `path` is the site-relative URL
    path that gets downloaded. The returned dict holds the slug, the
    source URL and `<h1>` title, the tagline / lead / long description
    from "Produktinformation", the bullet list from "Produkteigenschaften",
    the use cases from "Anwendung" and the paragraphs of "Erhältlich in".
    Sections missing from the page come back as None or an empty list.
    Any exception raised while fetching propagates to the caller.
    """
    page = fetch(path)

    tagline, lead, long_desc = parse_product_info(extract_block(page, "Produktinformation") or "")
    properties = parse_properties(extract_block(page, "Produkteigenschaften"))
    applications = parse_applications(extract_block(page, "Anwendung"))

    available_in = []
    avail_html = extract_block(page, "Erhältlich in")
    if avail_html:
        texts = (strip_tags(p) for p in re.findall(r"<p[^>]*>(.*?)</p>", avail_html, re.DOTALL))
        available_in = [text for text in texts if text]

    source_url = BASE + urllib.parse.quote(path, safe="-/")
    # Key order is preserved in the JSON output, so keep it stable.
    return {
        "slug": slug,
        "sourceUrl": source_url,
        "sourceTitle": parse_h1(page),
        "tagline": tagline,
        "lead": lead,
        "descriptionLong": long_desc,
        "properties": properties,
        "applications": applications,
        "availableIn": available_in,
    }


def main() -> None:
    """Scrape every entry in PRODUCTS and write the combined JSON to OUTPUT.

    Scraping is best-effort: a product whose page fails to download or
    parse is reported on stdout and left out of the file, and the run
    continues with the next one. The output directory is created if
    needed, and the file is rewritten in full on every run.
    """
    OUTPUT.parent.mkdir(parents=True, exist_ok=True)
    out: dict[str, dict] = {}
    for slug, path in PRODUCTS:
        try:
            data = scrape_one(slug, path)
        except Exception as exc:
            # Deliberately broad: one bad page must not abort the whole run.
            print(f"  FAIL  {slug}: {exc}")
            continue
        out[slug] = data
        n_apps = len(data["applications"])
        n_props = len(data["properties"])
        print(f"  ok    {slug}  ({n_props} props, {n_apps} apps)")
    OUTPUT.write_text(json.dumps(out, ensure_ascii=False, indent=2), encoding="utf-8")

    # `relative_to` raises ValueError when the script is run from a
    # directory that is not an ancestor of OUTPUT; show the absolute path
    # in that case instead of crashing after the file has been written.
    try:
        shown = OUTPUT.relative_to(Path.cwd())
    except ValueError:
        shown = OUTPUT
    print(f"\nWrote {shown} ({len(out)} products)")


# Script entry point: run the scrape only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()