Feature/post media translations (#42)
* chore: updated todo with translation ideas * feat: first take at the implementation of translations * fix: small addition for the translation feature * feat: support language switching in the editor and preview * feat: better handling of long bodies by not running them through a json envelope * fix: unknown macros have better fallback * feat: api for python to get translations * fix: strip dumb prefix of content in translation * feat: extend meta diff for translations * feat: hook up translations to rebuild-from-disk * feat: generation of the website prefers project language, falling back to canonical language * fix: crashes during rendering * feat: translation validation report * fix: made the translation validation actually work * chore: reorganization of menu * fix: some topics cleanup * chore: updated doc * feat: translations for media * feat: more aligned in UI/UX * feat: edit translations possible * chore: added full multi-language todo * chore: updated todo for clarity * feat: implementation of full multi-linguality * fix: page creation creates pages * fix: flags on every page * fix: better prompt * feat: made MCP server aware of language content * feat: python tools for translations * fix: better fill-in-translations * fix: better prompt for translation. maybe. * fix: losing posts from search due to translation process * fix: translation validation handles in-db content and fill-in of missing translations fixed to flush * fix: faster scanning for infilling of missing translations * chore: updated agent instructions * feat: calendar and tag cloud respect current language now * fix: retries going up * fix: got metadata-diff and rebuild into sync * fix: extended meta-diff for timestamps * fix: made website validation look at translated content, too * fix: multi-lingual search * chore: refactor Editor.tsx into two separate editors * feat: do language detection when no explicit language given --------- Co-authored-by: hugo <hugoms@me.com>
This commit is contained in:
106
scripts/audit-translations.py
Normal file
106
scripts/audit-translations.py
Normal file
@@ -0,0 +1,106 @@
|
||||
"""
|
||||
Audit translation quality — run inside bDS Python runtime.
|
||||
|
||||
Flags posts where translation content is suspicious:
|
||||
- Canonical body is empty/tiny but translation has substantial content (hallucination)
|
||||
- Translation is much longer than canonical (ratio > 3x)
|
||||
- Canonical is macro-only but translation contains prose
|
||||
"""
|
||||
|
||||
from bds_api import bds
|
||||
|
||||
# Matches text that consists solely of one or more ``[[...]]`` macros separated
# by optional whitespace. The macro body is "tempered": it may not run across a
# closing ``]]`` (unless that ``]]`` is directly followed by another ``]``, so
# bodies such as ``[[a[1]]]`` still work). A plain lazy ``.*?`` would happily
# swallow ``]] prose [[`` and classify ``[[a]] prose [[b]]`` as macro-only.
MACRO_PATTERN = r'^\s*(?:\[\[(?:(?!\]\](?!\])).)*?\]\]\s*)+$'

# A translation more than this many times longer than its canonical body is flagged.
SIZE_RATIO_THRESHOLD = 3.0

# A translation must be longer than this (stripped length) before the
# "empty canonical" and "macro-only canonical" checks will flag it.
MIN_HALLUCINATION_SIZE = 200
|
||||
|
||||
|
||||
# Canonical bodies shorter than this (after stripping) count as "empty".
_MIN_CANONICAL_SIZE = 20


async def _fetch_all_posts(page_size=100):
    """Collect every post returned by ``bds.posts.get_all``, page by page.

    The response is handled in both shapes the caller has to cope with:
    a dict carrying ``items`` and ``hasMore``, or a plain list of posts.

    Args:
        page_size: Number of posts requested per call.

    Returns:
        A list with the posts of all pages, in the order received.
    """
    posts = []
    offset = 0
    while True:
        result = await bds.posts.get_all(
            options={'limit': page_size, 'offset': offset}
        )
        if isinstance(result, dict):
            page = result.get('items') or []
            has_more = bool(result.get('hasMore', False))
        else:
            page = result or []
            # A short page means the listing is exhausted.
            has_more = len(page) >= page_size
        posts.extend(page)
        # An empty page always ends the loop, even if the API still claims
        # ``hasMore``; otherwise we would spin forever.
        if not page or not has_more:
            break
        offset += page_size
    return posts


def _suspicion_reasons(canonical_len, canonical_is_macro, tr_content, macro_re):
    """Return the list of reasons why one translation looks suspicious.

    Args:
        canonical_len: Stripped length of the canonical body.
        canonical_is_macro: True when the canonical body is macro-only.
        tr_content: Stripped translation body.
        macro_re: Compiled macro-only pattern.

    Returns:
        Human-readable reason strings; empty when nothing is suspicious.
    """
    tr_len = len(tr_content)
    canonical_is_empty = canonical_len < _MIN_CANONICAL_SIZE
    substantial = tr_len > MIN_HALLUCINATION_SIZE
    reasons = []

    # Check 1: empty/tiny canonical but substantial translation
    if canonical_is_empty and substantial:
        reasons.append(
            f"empty canonical ({canonical_len}b) but translation has {tr_len}b"
        )

    # Check 2: macro-only canonical but translation has prose
    if canonical_is_macro and substantial and not macro_re.match(tr_content):
        reasons.append(
            f"canonical is macro-only but translation has prose ({tr_len}b)"
        )

    # Check 3: translation is disproportionately longer. Every canonical that
    # is not "empty" is ratio-checked, so a body of exactly
    # ``_MIN_CANONICAL_SIZE`` characters no longer falls between the checks.
    if not canonical_is_empty and tr_len > 0:
        ratio = tr_len / canonical_len
        if ratio > SIZE_RATIO_THRESHOLD:
            reasons.append(
                f"translation is {ratio:.1f}x longer ({canonical_len}b → {tr_len}b)"
            )

    return reasons


async def audit():
    """Scan all posts and print translations whose content looks suspicious.

    For every post that lists at least one language other than its canonical
    one, the translations are fetched via ``bds.posts.get_translations`` and
    compared against the canonical body. A translation is reported when:

    - the canonical body is (nearly) empty but the translation is substantial,
    - the canonical body is macro-only but the translation is not, or
    - the translation is more than ``SIZE_RATIO_THRESHOLD`` times longer.

    Results are printed to stdout; nothing is returned.
    """
    import re

    # Compile once instead of re-evaluating the pattern per post/translation.
    macro_re = re.compile(MACRO_PATTERN, re.DOTALL)

    all_posts = await _fetch_all_posts()
    print(f"Total posts: {len(all_posts)}")

    suspicious = []

    for post in all_posts:
        canonical_lang = post.get('language', '')
        langs = post.get('availableLanguages') or []

        # Only check posts that have translations
        if not any(lang != canonical_lang for lang in langs):
            continue

        canonical_content = (post.get('content', '') or '').strip()
        canonical_len = len(canonical_content)
        canonical_is_macro = bool(canonical_content) and bool(
            macro_re.match(canonical_content)
        )

        # Get translations for this post
        translations = await bds.posts.get_translations(post_id=post['id'])
        if not translations:
            continue

        for tr in translations:
            tr_content = (tr.get('content', '') or '').strip()
            reasons = _suspicion_reasons(
                canonical_len, canonical_is_macro, tr_content, macro_re
            )
            if not reasons:
                continue
            suspicious.append({
                'slug': post.get('slug', ''),
                'title': post.get('title', ''),
                'canonical_lang': canonical_lang,
                'translation_lang': tr.get('language', '?'),
                'canonical_len': canonical_len,
                'translation_len': len(tr_content),
                'reasons': reasons,
            })

    # Print results
    print(f"\nChecked {len(all_posts)} posts")
    print(f"Found {len(suspicious)} suspicious translations:\n")

    for s in suspicious:
        print(f"  {s['slug']} ({s['canonical_lang']}→{s['translation_lang']})")
        print(f"    title: {s['title']}")
        print(f"    sizes: canonical={s['canonical_len']}b, translation={s['translation_len']}b")
        for r in s['reasons']:
            print(f"    ⚠ {r}")
        print()
|
||||
|
||||
|
||||
# Run the audit as soon as the script is loaded. Top-level ``await`` is not
# valid in a plain Python script; the module docstring says this file runs
# inside the bDS Python runtime — presumably that runtime executes it in an
# async context (TODO confirm).
await audit()
|
||||
Reference in New Issue
Block a user