* chore: updated todo with translation ideas * feat: first take at the implementation of translations * fix: small addition for the translation feature * feat: support language switching in the editor and preview * feat: better handling of long bodies by not running them through a json envelope * fix: unknown macros have better fallback * feat: api for python to get translations * fix: strip dumb prefix of content in translation * feat: extend meta diff for translations * feat: hook up translations to rebuild-from-disk * feat: generation of the website prefers project language, falling back to canonical language * fix: crashes during rendering * feat: translation validation report * fix: made the translation validation actually work * chore: reorganization of menu * fix: some topics cleanup * chore: updated doc * feat: translations for media * feat: more aligned in UI/UX * feat: edit translations possible * chore: added full multi-language todo * chore: updated todo for clarity * feat: implementation of full multi-linguality * fix: page creation creates pages * fix: flags on every page * fix: better prompt * feat: made MCP server aware of language content * feat: python tools for translations * fix: better fill-in-translations * fix: better prompt for translation. maybe. * fix: losing posts from search due to translation process * fix: translation validation handles in-db content and fill-in of missing translations fixed to flush * fix: faster scanning for infilling of missing translations * chore: updated agent instructions * feat: calendar and tag cloud respect current language now * fix: retries going up * fix: got metadata-diff and rebuild into sync * fix: extended meta-diff for timestamps * fix: made website validation look at translated content, too * fix: multi-lingual search * chore: refactor Editor.tsx into two separate editors * feat: do language detection when no explicit language given --------- Co-authored-by: hugo <hugoms@me.com>
107 lines
3.9 KiB
Python
107 lines
3.9 KiB
Python
"""
|
|
Audit translation quality — run inside bDS Python runtime.
|
|
|
|
Flags posts where translation content is suspicious:
|
|
- Canonical body is empty/tiny but translation has substantial content (hallucination)
|
|
- Translation is much longer than canonical (ratio > 3x)
|
|
- Canonical is macro-only but translation contains prose
|
|
"""
|
|
|
|
from bds_api import bds
|
|
|
|
# Matches text that consists solely of one or more [[...]] macros separated by
# optional whitespace. It is applied with re.DOTALL, so a macro may span lines.
# Each macro body stops at its first "]]": with a plain lazy ".*?" the body
# could run across "]] prose [[" (classifying prose between two macros as
# macro-only) and the nested quantifier backtracked exponentially on inputs
# with many macros followed by non-macro text.
MACRO_PATTERN = r'^\s*(\[\[(?:(?!\]\]).)*\]\]\s*)+$'

# A translation more than this many times longer than its canonical text is flagged.
SIZE_RATIO_THRESHOLD = 3.0

MIN_HALLUCINATION_SIZE = 200  # translation must be at least this big to flag
|
|
|
|
|
|
# Canonical bodies shorter than this (after stripping) are treated as empty/tiny.
_MIN_CANONICAL_SIZE = 20


async def _fetch_all_posts(page_size=100):
    """Collect every post returned by ``bds.posts.get_all``, page by page.

    The response is accepted in two shapes: a dict carrying ``items`` and
    ``hasMore``, or a plain list (in which case a short page ends the scan).
    NOTE(review): whether the API returns only published posts is not visible
    from this script — confirm against the bDS API.
    """
    posts = []
    offset = 0
    while True:
        result = await bds.posts.get_all(options={'limit': page_size, 'offset': offset})
        if isinstance(result, dict):
            page = result.get('items') or []
            has_more = bool(result.get('hasMore', False))
        else:
            page = result or []
            has_more = len(page) >= page_size
        posts.extend(page)
        # An empty page always ends the scan, so a stuck ``hasMore`` flag
        # cannot turn this into an endless loop.
        if not page or not has_more:
            break
        # Advance by what was actually received so short pages skip nothing.
        offset += len(page)
    return posts


def _suspicion_reasons(canonical_len, canonical_is_macro, tr_text, macro_re):
    """Return the reasons (possibly none) why a stripped translation looks suspicious."""
    tr_len = len(tr_text)
    substantial = tr_len > MIN_HALLUCINATION_SIZE
    canonical_is_empty = canonical_len < _MIN_CANONICAL_SIZE
    reasons = []

    # Check 1: empty/tiny canonical but substantial translation
    if canonical_is_empty and substantial:
        reasons.append(f"empty canonical ({canonical_len}b) but translation has {tr_len}b")

    # Check 2: macro-only canonical but translation has prose
    if canonical_is_macro and substantial and not macro_re.match(tr_text):
        reasons.append(f"canonical is macro-only but translation has prose ({tr_len}b)")

    # Check 3: translation is disproportionately longer. Every canonical that
    # is not "empty" is ratio-checked, so a body of exactly
    # _MIN_CANONICAL_SIZE characters no longer escapes both checks.
    if not canonical_is_empty and tr_len > 0:
        ratio = tr_len / canonical_len
        if ratio > SIZE_RATIO_THRESHOLD:
            reasons.append(f"translation is {ratio:.1f}x longer ({canonical_len}b → {tr_len}b)")

    return reasons


def _print_report(total_posts, suspicious):
    """Print the summary followed by one entry per suspicious translation."""
    print(f"\nChecked {total_posts} posts")
    print(f"Found {len(suspicious)} suspicious translations:\n")

    for s in suspicious:
        print(f" {s['slug']} ({s['canonical_lang']}→{s['translation_lang']})")
        print(f" title: {s['title']}")
        print(f" sizes: canonical={s['canonical_len']}b, translation={s['translation_len']}b")
        for r in s['reasons']:
            print(f" ⚠ {r}")
        print()


async def audit():
    """Scan all posts and report translations whose size or shape looks wrong.

    For every post that lists a language other than its own in
    ``availableLanguages``, the translations are fetched with
    ``bds.posts.get_translations`` and each one is compared against the
    canonical ``content``. Findings are printed; nothing is returned.
    """
    import re

    # Compile once; the pattern is reused for every post and translation.
    macro_re = re.compile(MACRO_PATTERN, re.DOTALL)

    all_posts = await _fetch_all_posts()
    print(f"Total posts: {len(all_posts)}")

    suspicious = []
    for post in all_posts:
        canonical_lang = post.get('language', '')
        langs = post.get('availableLanguages') or []

        # Only check posts that have translations
        if not any(lang != canonical_lang for lang in langs):
            continue

        canonical_text = (post.get('content', '') or '').strip()
        canonical_len = len(canonical_text)
        canonical_is_macro = bool(canonical_text) and bool(macro_re.match(canonical_text))

        translations = await bds.posts.get_translations(post_id=post['id'])
        if not translations:
            continue

        for tr in translations:
            tr_text = (tr.get('content', '') or '').strip()
            reasons = _suspicion_reasons(canonical_len, canonical_is_macro, tr_text, macro_re)
            if reasons:
                suspicious.append({
                    'slug': post.get('slug', ''),
                    'title': post.get('title', ''),
                    'canonical_lang': canonical_lang,
                    'translation_lang': tr.get('language', '?'),
                    'canonical_len': canonical_len,
                    'translation_len': len(tr_text),
                    'reasons': reasons,
                })

    _print_report(len(all_posts), suspicious)
|
|
|
|
|
|
# Entry point. A top-level `await` is not valid in a plain Python module, so
# this line relies on the host executing the script in an async context —
# presumably the bDS Python runtime named in the module docstring; confirm
# before running it anywhere else.
await audit()
|