import math
import re
from datetime import datetime, timezone
from typing import Any

# Names of the columns every normalised row carries, in output order.
STANDARD_FIELDS = [
    # Retailer and store
    "retailer_name",
    "retailer_brand",
    "store_location",
    # Product identity
    "product_id",
    "retailer_product_code",
    "product_name",
    "brand",
    # Category hierarchy
    "category_level_1",
    "category_level_2",
    "category_level_3",
    # Pack, price and promotion
    "pack_size",
    "unit_of_measure",
    "price",
    "promo_price",
    "promotion_text",
    # Barcode
    "barcode_or_gtin",
    "barcode_source",
    # Links and availability
    "image_url",
    "product_url",
    "availability_status",
    # Provenance of the record
    "source_type",
    "source_url",
    "date_captured",
    "confidence_score",
]

# Alternative column names accepted for each standard field. Aliases are
# written in the lower-case, underscore-separated form that clean_key()
# produces before it performs the lookup.
_ALIASES_BY_FIELD = {
    "product_name": ("name", "title", "product"),
    "retailer_name": ("retailer", "merchant", "store"),
    "product_url": ("url", "link", "product_link"),
    "image_url": ("image", "image_link", "img"),
    "category_level_1": ("category", "department"),
    "category_level_2": ("subcategory", "sub_category"),
    "pack_size": ("size", "pack", "packsize"),
    "unit_of_measure": ("uom", "unit"),
    "promo_price": ("special_price", "promo", "sale_price"),
    "price": ("old_price", "regular_price", "current_price"),
    "barcode_or_gtin": ("gtin", "barcode", "ean"),
    "availability_status": ("availability", "status"),
}

# Flat alias -> standard field lookup derived from the table above.
FIELD_SYNONYMS = {
    alias: field
    for field, aliases in _ALIASES_BY_FIELD.items()
    for alias in aliases
}

# Matches every character that is not a digit, '.', ',' or '-'; substituting
# these away strips currency symbols, letters and whitespace from price text.
PRICE_RE = re.compile(r"[^0-9.,-]")

# Finds a quantity followed by a unit, e.g. "500g", "1,5 kg" or "2 Litres".
PACK_RE = re.compile(
    r"(?P<size>\d+(?:[.,]\d+)?)"  # integer or decimal amount, '.' or ',' as separator
    r"\s?"  # at most one whitespace character before the unit
    r"(?P<uom>kg|g|ml|l|lt|litre|litres|x|pack|ea|each|s|tabs|caps|rolls|sachets)\b",
    re.IGNORECASE,
)


def clean_key(key: str) -> str:
    """Normalise a raw column name and resolve it to a standard field name.

    The name is stringified, trimmed and lower-cased, and each run of
    characters outside ``a-z0-9`` becomes a single underscore (leading and
    trailing underscores are dropped). If the result is a known alias in
    ``FIELD_SYNONYMS`` the mapped field name is returned; otherwise the
    normalised name itself is returned. Falsy keys yield ``""``.
    """
    lowered = str(key or "").strip().lower()
    slug = re.sub(r"[^a-z0-9]+", "_", lowered).strip("_")
    if slug in FIELD_SYNONYMS:
        return FIELD_SYNONYMS[slug]
    return slug


# Text made only of three-digit groups: comma-grouped ("1,299", "1,299,000")
# or dot-grouped with at least two groups ("1.299.000"). A single dot group
# ("1.299") is deliberately excluded because it is also a valid decimal.
_GROUPED_INTEGER_RE = re.compile(r"-?[1-9]\d{0,2}(?:(?:,\d{3})+|(?:\.\d{3}){2,})")


def parse_price(value: Any) -> float | None:
    """Convert a raw price value to a float, or None when it cannot be parsed.

    Args:
        value: A number, or text such as ``"R 12,99"``, ``"1,299.50"`` or
            ``"1.299,50"``.

    Returns:
        * ``None`` for ``None``, ``""``, unparseable text and non-finite
          numbers (NaN / infinity).
        * ``float(value)`` for ``int`` / ``float`` input (not rounded).
        * For text, the parsed amount rounded to two decimals.

    Text handling: every character other than digits, ``.``, ``,`` and ``-``
    is removed. Separators left dangling at the end (``"12.99."``) are
    ignored. Text that consists purely of three-digit groups is read as a
    whole number (``"1,299"`` -> ``1299.0``). Otherwise the last separator is
    the decimal point and any earlier ones are grouping marks. A hyphen is
    kept so a leading minus sign survives; a hyphen anywhere else makes the
    text unparseable and yields ``None``.
    """
    if value is None or value == "":
        return None
    if isinstance(value, (int, float)):
        number = float(value)
        return number if math.isfinite(number) else None

    # Only trailing separators are stripped: a leading one may be a real
    # decimal point (".99").
    text = PRICE_RE.sub("", str(value)).rstrip(".,")
    if _GROUPED_INTEGER_RE.fullmatch(text):
        text = text.replace(",", "").replace(".", "")
    else:
        text = text.replace(",", ".")
        if text.count(".") > 1:
            whole, _, fraction = text.rpartition(".")
            text = whole.replace(".", "") + "." + fraction

    try:
        number = float(text)
    except ValueError:
        return None
    return round(number, 2) if math.isfinite(number) else None


def derive_pack_uom(product_name: str, pack_size: str = "", unit_of_measure: str = "") -> tuple[str, str]:
    """Fill in a missing pack size and/or unit of measure from free text.

    When both ``pack_size`` and ``unit_of_measure`` are supplied they are
    returned untouched. Otherwise the first ``PACK_RE`` match in
    ``"<pack_size> <product_name>"`` provides the missing piece(s); values the
    caller did supply always take precedence over extracted ones.

    Extracted sizes use ``.`` as the decimal separator. Extracted units are
    lower-cased, with ``lt`` / ``litre`` / ``litres`` reported as ``l`` and
    ``each`` as ``ea``.

    Returns:
        A ``(pack_size, unit_of_measure)`` tuple; a piece that is neither
        supplied nor found is ``""``.
    """
    if pack_size and unit_of_measure:
        return pack_size, unit_of_measure

    known_size = pack_size or ""
    known_uom = unit_of_measure or ""

    found = PACK_RE.search(f"{pack_size} {product_name}")
    if found is None:
        return known_size, known_uom

    unit_aliases = {"lt": "l", "litre": "l", "litres": "l", "each": "ea"}
    parsed_unit = found.group("uom").lower()
    parsed_size = found.group("size").replace(",", ".")
    return known_size or parsed_size, known_uom or unit_aliases.get(parsed_unit, parsed_unit)


def canonical_key(row: dict[str, Any]) -> str:
    """Build a lower-case matching key from a row's identifying fields.

    The key joins ``brand``, ``product_name``, ``pack_size`` and
    ``unit_of_measure`` (in that order), lower-cases the result and replaces
    each run of characters outside ``a-z0-9`` with a single space. Missing or
    ``None`` fields contribute nothing.

    Note that characters outside ASCII letters and digits (including accented
    letters) act as separators, and a decimal point splits a number
    (``"1.5"`` becomes ``"1 5"``).
    """
    field_names = ("brand", "product_name", "pack_size", "unit_of_measure")
    # None means "no value"; str(None) would leak the word "none" into the key.
    parts = ("" if row.get(name) is None else str(row.get(name)) for name in field_names)
    joined = " ".join(parts).lower()
    return re.sub(r"[^a-z0-9]+", " ", joined).strip()


def _is_blank_value(value: Any) -> bool:
    """Return True for None and for empty or whitespace-only strings."""
    return value is None or (isinstance(value, str) and not value.strip())


def _coerce_confidence(value: Any) -> float:
    """Return ``value`` as a float, or 1.0 when it is blank or not numeric."""
    if _is_blank_value(value):
        return 1.0
    try:
        return float(value)
    except (TypeError, ValueError):
        return 1.0


def _coerce_date_captured(value: Any) -> Any:
    """Return a naive UTC datetime for ``value`` where possible.

    Falsy values and unparseable strings become the current UTC time. ISO 8601
    strings and ``datetime`` objects that carry a UTC offset are converted to
    UTC before the offset is dropped. Any other type is returned unchanged.
    """
    if not value:
        return datetime.now(timezone.utc).replace(tzinfo=None)
    if isinstance(value, str):
        try:
            # A trailing "Z" is rewritten as an explicit UTC offset before parsing.
            value = datetime.fromisoformat(value.strip().replace("Z", "+00:00"))
        except ValueError:
            return datetime.now(timezone.utc).replace(tzinfo=None)
    if isinstance(value, datetime):
        if value.utcoffset() is not None:
            # Shift to UTC first so dropping tzinfo does not change the instant.
            value = value.astimezone(timezone.utc)
        return value.replace(tzinfo=None)
    return value


def normalise_row(raw: dict[str, Any], default_retailer: str = "Unknown", source_type: str = "manual_import") -> dict[str, Any]:
    """Map one raw record onto the standard schema and clean its values.

    Args:
        raw: Source record. Keys are passed through ``clean_key``; keys that
            do not resolve to a standard field are ignored.
        default_retailer: Retailer name used when the record has none.
        source_type: Source type used when the record has none.

    Returns:
        A dict holding every name in ``STANDARD_FIELDS`` plus
        ``canonical_key``. Fields the record does not provide are ``""``
        unless noted here:

        * ``retailer_name``, ``product_name``, ``brand`` and
          ``barcode_or_gtin`` are stripped strings; a blank retailer falls
          back to ``default_retailer``, then ``"Unknown"``.
        * ``price`` and ``promo_price`` are the result of ``parse_price``.
        * ``availability_status`` defaults to ``"unknown"``.
        * ``barcode_source`` defaults to a label that depends on whether a
          barcode is present.
        * ``pack_size`` and ``unit_of_measure`` are completed from the product
          name by ``derive_pack_uom`` when missing.
        * ``confidence_score`` is a float; blank or non-numeric input becomes
          1.0, while an explicit 0 is kept.
        * ``date_captured`` is a naive UTC datetime when it is missing, an ISO
          string or a ``datetime``; other types pass through unchanged.
    """
    row: dict[str, Any] = {field: "" for field in STANDARD_FIELDS}
    for key, value in raw.items():
        mapped = clean_key(key)
        if mapped not in row:
            continue
        # Several raw columns can resolve to the same field; a blank one must
        # not wipe out a real value that was already collected.
        if _is_blank_value(value) and not _is_blank_value(row[mapped]):
            continue
        row[mapped] = "" if value is None else value

    row["retailer_name"] = (
        str(row["retailer_name"] or "").strip()
        or str(default_retailer or "").strip()
        or "Unknown"
    )
    row["product_name"] = str(row["product_name"] or "").strip()
    row["brand"] = str(row["brand"] or "").strip()
    row["price"] = parse_price(row["price"])
    row["promo_price"] = parse_price(row["promo_price"])
    row["source_type"] = str(row["source_type"] or source_type)
    row["availability_status"] = str(row["availability_status"] or "unknown")
    row["barcode_or_gtin"] = str(row["barcode_or_gtin"] or "").strip()
    row["barcode_source"] = str(
        row["barcode_source"]
        or ("public/authorised source" if row["barcode_or_gtin"] else "not publicly available")
    )
    row["pack_size"], row["unit_of_measure"] = derive_pack_uom(
        row["product_name"],
        str(row["pack_size"] or ""),
        str(row["unit_of_measure"] or ""),
    )
    row["confidence_score"] = _coerce_confidence(row["confidence_score"])
    row["date_captured"] = _coerce_date_captured(row["date_captured"])

    row["canonical_key"] = canonical_key(row)
    return row
