import re
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser
import requests
from app.core.config import settings

# Lower-case regex fragments that inspect_response() searches for at the start
# of a response body; a match on any of them marks the page as blocked.
BLOCK_PATTERNS = [
    r"captcha",
    r"cloudflare",
    r"access denied",
    r"bot protection",
    r"verify you are human",
    r"login required",
    r"sign in",
    r"forbidden",
    r"unusual traffic",
]


def robots_allowed(
    url: str,
    user_agent: str = "RetailCatalogueIntelligenceSA/1.0",
    timeout: float = 10.0,
) -> tuple[bool, str]:
    """Report whether the host's robots.txt lets ``user_agent`` fetch ``url``.

    Args:
        url: Absolute URL of the page that is about to be fetched.
        user_agent: Agent name matched against the robots.txt rules.
        timeout: Seconds to wait for the robots.txt download before giving up.

    Returns:
        ``(allowed, reason)`` where ``reason`` is one of ``"invalid_url"``,
        ``"robots_allowed"``, ``"robots_disallowed"`` or
        ``"robots_unavailable:<ExceptionClassName>"``. This function never
        raises for a bad URL or a failed download.
    """
    # Imported locally so the block carries the names it newly depends on.
    from urllib.error import HTTPError
    from urllib.request import urlopen

    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse raises on malformed netlocs, e.g. an unclosed IPv6 bracket.
        return False, "invalid_url"
    if not parsed.scheme or not parsed.netloc:
        return False, "invalid_url"

    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
    parser = RobotFileParser()
    try:
        # RobotFileParser.read() downloads without a timeout, so the file is
        # fetched here with an explicit one and handed to parse() instead.
        with urlopen(robots_url, timeout=timeout) as response:
            raw = response.read()
        parser.parse(raw.decode("utf-8").splitlines())
        allowed = parser.can_fetch(user_agent, url)
    except HTTPError as err:
        err.close()
        # Same status policy as RobotFileParser.read(): 401/403 deny everything,
        # any other 4xx allows everything, and all remaining error statuses
        # leave the rules unknown, which is treated as disallowed.
        allowed = 400 <= err.code < 500 and err.code not in (401, 403)
    except Exception as exc:
        # If robots cannot be read, stay conservative but do not fail imports.
        return False, f"robots_unavailable:{exc.__class__.__name__}"
    return allowed, "robots_allowed" if allowed else "robots_disallowed"


def inspect_response(status_code: int, text: str) -> tuple[bool, str]:
    """Classify an HTTP response as a usable public page or a blocked/failed one.

    Args:
        status_code: HTTP status code of the response.
        text: Response body; ``None`` or an empty string is treated as empty.

    Returns:
        ``(ok, reason)``. ``ok`` is ``False`` for any status of 400 or above
        and for bodies whose first 5000 characters match an entry of
        ``BLOCK_PATTERNS``; otherwise ``(True, "public_page_accessible")``.
    """
    if status_code in (401, 403):
        return False, f"blocked_http_{status_code}"
    if status_code == 429:
        return False, "rate_limited_429"
    if status_code >= 500:
        return False, f"server_error_{status_code}"
    if status_code >= 400:
        # Remaining 4xx codes (404, 410, ...) are error pages, not catalogue
        # content, so they must not be reported as an accessible public page.
        return False, f"client_error_{status_code}"

    # Only the start of the body is scanned, lower-cased so the lower-case
    # patterns match regardless of the page's capitalisation.
    sample = (text or "")[:5000].lower()
    for pattern in BLOCK_PATTERNS:
        if re.search(pattern, sample):
            return False, f"blocked_indicator:{pattern}"
    return True, "public_page_accessible"


def safe_public_get(url: str) -> tuple[bool, str, str]:
    """Fetch a public page, honouring robots.txt when compliance is enforced.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        ``(ok, body, reason)``. On success ``body`` is the response text and
        ``reason`` is the value reported by ``inspect_response``. On any
        failure ``ok`` is ``False``, ``body`` is ``""`` and ``reason`` names
        the cause (robots verdict, ``request_failed:<ExceptionClassName>``,
        or the ``inspect_response`` reason).
    """
    # The robots verdict only matters when compliance is enforced, so the
    # extra robots.txt round-trip is skipped entirely otherwise.
    if settings.compliance_enforced:
        allowed, robots_reason = robots_allowed(url)
        if not allowed:
            return False, "", robots_reason

    headers = {
        "User-Agent": "RetailCatalogueIntelligenceSA/1.0 compliance-contact=internal",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }
    try:
        response = requests.get(url, headers=headers, timeout=settings.public_fetch_timeout_seconds)
    except requests.RequestException as exc:
        return False, "", f"request_failed:{exc.__class__.__name__}"

    ok, reason = inspect_response(response.status_code, response.text)
    if not ok:
        return False, "", reason
    return True, response.text, reason
