"""
Audit translation quality — run inside bDS Python runtime.

Flags posts where translation content is suspicious:
  - Canonical body is empty/tiny but translation has substantial content (hallucination)
  - Translation is much longer than canonical (ratio > 3x)
  - Canonical is macro-only but translation contains prose
"""

from bds_api import bds

# Matches a body that consists solely of one or more [[...]] macros separated
# by optional whitespace (intended for use with re.DOTALL so a macro may span
# lines). The tempered dot `(?:(?!\]\]).)*` ends each macro at its first
# closing `]]`; a plain lazy `.*?` could backtrack across `]] prose [[` and
# treat "[[a]] prose [[b]]" as macro-only, and would backtrack exponentially
# on long macro lists followed by non-macro text.
MACRO_PATTERN = r'^\s*(?:\[\[(?:(?!\]\]).)*\]\]\s*)+$'
# A translation more than this many times longer than its canonical is flagged.
SIZE_RATIO_THRESHOLD = 3.0
MIN_HALLUCINATION_SIZE = 200  # translation must be at least this big to flag


# Canonical bodies whose stripped length is below this count as empty/tiny.
_EMPTY_CANONICAL_LEN = 20


async def _fetch_all_posts(page_size=100):
    """Collect every post returned by ``bds.posts.get_all``, page by page.

    Two response shapes are handled: a dict carrying ``items`` and
    ``hasMore``, or a bare list (where a page shorter than ``page_size``
    marks the end).
    """
    posts = []
    offset = 0
    while True:
        result = await bds.posts.get_all(options={'limit': page_size, 'offset': offset})
        if isinstance(result, dict):
            page = result.get('items') or []
            has_more = bool(result.get('hasMore', False))
        else:
            page = list(result or [])
            has_more = len(page) >= page_size
        posts.extend(page)
        # An empty page ends the walk even if the response still reports
        # hasMore; otherwise the loop would never terminate.
        if not page or not has_more:
            break
        offset += page_size
    return posts


def _translation_reasons(canonical_len, canonical_is_macro, tr_text, macro_re):
    """Return the reasons one (already stripped) translation looks suspicious.

    An empty list means the translation passed every check.
    """
    tr_len = len(tr_text)
    canonical_is_empty = canonical_len < _EMPTY_CANONICAL_LEN
    substantial = tr_len > MIN_HALLUCINATION_SIZE
    reasons = []

    # Check 1: empty/tiny canonical but substantial translation
    if canonical_is_empty and substantial:
        reasons.append(f"empty canonical ({canonical_len}b) but translation has {tr_len}b")

    # Check 2: macro-only canonical but translation has prose
    if canonical_is_macro and substantial and not macro_re.match(tr_text):
        reasons.append(f"canonical is macro-only but translation has prose ({tr_len}b)")

    # Check 3: translation is disproportionately longer. Every canonical that
    # is not "empty" is ratio-checked, so a body of exactly the threshold
    # length is no longer skipped by both Check 1 and Check 3.
    if not canonical_is_empty:
        ratio = tr_len / canonical_len
        if ratio > SIZE_RATIO_THRESHOLD:
            reasons.append(f"translation is {ratio:.1f}x longer ({canonical_len}b → {tr_len}b)")

    return reasons


async def audit():
    """Scan all posts and print translations whose content looks suspicious.

    For every post that lists a language other than its own in
    ``availableLanguages``, the translations are fetched and each one is
    checked for: a substantial translation of an empty/tiny canonical, prose
    in the translation of a macro-only canonical, and a translation more than
    ``SIZE_RATIO_THRESHOLD`` times longer than the canonical.

    Sizes are ``len()`` of the stripped text (character counts), although the
    report labels them with a ``b`` suffix. Results are printed; nothing is
    returned.
    """
    import re

    # Compile once instead of re-resolving the pattern for every post/translation.
    macro_re = re.compile(MACRO_PATTERN, re.DOTALL)

    all_posts = await _fetch_all_posts()
    print(f"Total posts: {len(all_posts)}")

    suspicious = []

    for post in all_posts:
        langs = post.get('availableLanguages', []) or []
        canonical_lang = post.get('language', '')

        # Only check posts that have translations
        if not any(lang != canonical_lang for lang in langs):
            continue

        canonical_text = (post.get('content', '') or '').strip()
        canonical_len = len(canonical_text)
        canonical_is_macro = bool(canonical_text) and bool(macro_re.match(canonical_text))

        # Get translations for this post
        translations = await bds.posts.get_translations(post_id=post['id'])
        if not translations:
            continue

        for tr in translations:
            tr_text = (tr.get('content', '') or '').strip()
            reasons = _translation_reasons(canonical_len, canonical_is_macro, tr_text, macro_re)
            if not reasons:
                continue
            suspicious.append({
                'slug': post.get('slug', ''),
                'title': post.get('title', ''),
                'canonical_lang': canonical_lang,
                'translation_lang': tr.get('language', '?'),
                'canonical_len': canonical_len,
                'translation_len': len(tr_text),
                'reasons': reasons,
            })

    # Print results
    print(f"\nChecked {len(all_posts)} posts")
    print(f"Found {len(suspicious)} suspicious translations:\n")

    for s in suspicious:
        print(f"  {s['slug']}  ({s['canonical_lang']}→{s['translation_lang']})")
        print(f"    title: {s['title']}")
        print(f"    sizes: canonical={s['canonical_len']}b, translation={s['translation_len']}b")
        for r in s['reasons']:
            print(f"    ⚠ {r}")
        print()


# Top-level `await`: a plain `python` run rejects this as a syntax error, so
# the script relies on a host that permits awaiting at module level
# (presumably the bDS Python runtime named in the module docstring — confirm).
await audit()