"""
Fix double-encoded Arabic strings in ar.json.

Root cause: the file was saved with each UTF-8 byte of Arabic text treated as a
Windows-1252 character and then re-encoded as UTF-8, producing Mojibake like
"Ø§Ù„Ù‚Ø§Ø¦Ù…Ø©" instead of "القائمة".

The cp1252 mapping means:
  - Bytes 0x00-0x7F: identical to ASCII
  - Bytes 0x80-0x9F: map to special Unicode chars (€, ‚, „, …, etc.)
    EXCEPT bytes 0x81, 0x8D, 0x8F, 0x90, 0x9D which are UNDEFINED in cp1252
    and were stored as their raw Latin-1 equivalents (U+0081, etc.)
  - Bytes 0xA0-0xFF: direct Latin-1 mapping

Fix algorithm for each string value:
  1. For each character, try to encode it with cp1252.
     - If successful, we get the original byte value back.
  2. If cp1252 encoding fails AND ord(ch) <= 0xFF:
     - The char is a C1 control character (U+0080-U+009F), normally one of the
       5 undefined cp1252 bytes stored as its Latin-1 raw value.
     - Use ord(ch) directly as the byte.
  3. If cp1252 encoding fails AND ord(ch) > 0xFF:
     - The char is a genuine Unicode code point outside the Latin-1 range
       (real Arabic, CJK, emoji, etc.) — the string is already correct.
     - Return the original string untouched.
  4. After building the byte array, decode as UTF-8.
     - If that succeeds, return the decoded Arabic text.
     - If it fails, return the original (not Mojibake — leave untouched).

Run from the project root:
    python apps/frontend/fix-ar-encoding.py
"""

import json
import sys

# JSON file of Arabic strings to repair; the script reads it and then
# overwrites it in place. The path is relative to the project root.
SRC = "apps/frontend/messages/ar.json"


def fix(s: str) -> str:
    """Undo UTF-8 -> cp1252 -> UTF-8 double encoding of a single string.

    Every character is mapped back to the byte it stood for, and the
    resulting byte sequence is decoded as UTF-8.

    Args:
        s: The string to repair.

    Returns:
        The repaired text when the recovered bytes are valid UTF-8.
        Otherwise *s* itself, unchanged: either it contains a character
        that is above U+00FF and not part of cp1252 (so it is taken to be
        correct already), or the recovered bytes are not valid UTF-8.
    """
    raw = bytearray()
    for char in s:
        try:
            encoded = char.encode("cp1252")
        except UnicodeEncodeError:
            code_point = ord(char)
            if code_point > 0xFF:
                # A character cp1252 cannot represent and that is no single
                # byte either: the string is not Mojibake, keep it as is.
                return s
            # cp1252 has no encoding for this character, but it fits in one
            # byte (e.g. U+0081, U+008D, U+008F, U+0090, U+009D): its code
            # point is the original byte value.
            raw.append(code_point)
        else:
            raw += encoded

    try:
        repaired = raw.decode("utf-8")
    except ValueError:
        # UnicodeDecodeError is a ValueError: the bytes are not UTF-8, so
        # the input was not double-encoded text.
        return s
    return repaired


def walk(obj):
    """Return a copy of a JSON-like structure with every string repaired.

    Strings are passed through ``fix``. Lists and dicts are rebuilt
    recursively as a plain ``list`` / ``dict``; dict keys are kept as they
    are and only the values are visited. Any other object is returned
    unchanged.
    """
    if isinstance(obj, str):
        return fix(obj)
    if isinstance(obj, list):
        return list(map(walk, obj))
    if isinstance(obj, dict):
        repaired = {}
        for key, value in obj.items():
            repaired[key] = walk(value)
        return repaired
    # Numbers, booleans, None and anything else carry no text to repair.
    return obj


# Load the existing file; "utf-8-sig" also accepts a leading BOM.
with open(SRC, encoding="utf-8-sig") as source_file:
    loaded = json.load(source_file)

# Repair every string value before the file is reopened for writing.
repaired = walk(loaded)
serialized = json.dumps(repaired, ensure_ascii=False, indent=4)

# Overwrite the same file with the repaired content (written without a BOM).
with open(SRC, "w", encoding="utf-8") as target_file:
    target_file.write(serialized)

print(f"Done — {SRC} rewritten with correct Arabic text.")
