clean_translations.py 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687
  1. import sys
  2. import re
  3. def clean_translations(file_path):
  4. with open(file_path, 'r', encoding='utf-8') as f:
  5. content = f.read()
  6. # Split into blocks
  7. blocks = re.split(r'(\s+\w+: \{)', content)
  8. # Header is blocks[0]
  9. # Then blocks[1] is " zh: {", blocks[2] is content of zh
  10. # blocks[3] is " en: {", blocks[4] is content of en
  11. # blocks[5] is " ja: {", blocks[6] is content of ja
  12. header = blocks[0]
  13. processed_blocks = []
  14. # Missing keys to ensure (with basic English values)
  15. missing_keys = [
  16. "kbSettingsSaved", "failedToSaveSettings", "actionFailed", "userAddedToOrganization",
  17. "featureUpdated", "roleTenantAdmin", "roleRegularUser", "creatingRegularUser",
  18. "editUserRole", "targetRole", "editCategory", "totalTenants", "systemUsers",
  19. "systemHealth", "operational", "orgManagement", "globalTenantControl",
  20. "newTenant", "domainOptional", "saveChanges", "modelConfiguration",
  21. "defaultLLMModel", "selectLLM", "selectEmbedding", "rerankModel", "none",
  22. "indexingChunkingConfig", "chatHyperparameters", "temperature", "precise",
  23. "creative", "maxResponseTokens", "retrievalSearchSettings", "topK",
  24. "similarityThreshold", "enableHybridSearch", "hybridSearchDesc", "hybridWeight",
  25. "pureText", "pureVector", "enableQueryExpansion", "queryExpansionDesc",
  26. "enableHyDE", "hydeDesc", "enableReranking", "rerankingDesc", "broad",
  27. "strict", "maxInput", "dimensions", "defaultBadge", "dims", "ctx",
  28. "baseApi", "configured", "groupUpdated", "groupDeleted", "groupCreated",
  29. "navCatalog", "allDocuments", "categories", "uncategorizedFiles", "category",
  30. "statusReadyDesc", "statusIndexingDesc", "selectCategory", "noneUncategorized",
  31. "previous", "next", "createCategory", "categoryDesc", "categoryName",
  32. "createCategoryBtn", "newGroup", "noKnowledgeGroups", "createGroupDesc",
  33. "noDescriptionProvided", "browseManageFiles", "filterGroupFiles"
  34. ]
  35. for i in range(1, len(blocks), 2):
  36. block_header = blocks[i]
  37. block_content = blocks[i+1]
  38. # Parse keys and values
  39. lines = block_content.split('\n')
  40. keys_seen = set()
  41. new_lines = []
  42. # Regex to match "key: value," or "key: `value`,"
  43. # Support multiline strings too? Let's be careful.
  44. # Most are single line: " key: \"value\","
  45. for line in lines:
  46. match = re.search(r'^\s+([a-zA-Z0-9_-]+):', line)
  47. if match:
  48. key = match.group(1)
  49. if key in keys_seen:
  50. continue # Skip duplicate
  51. keys_seen.add(key)
  52. new_lines.append(line)
  53. # Add missing keys if they are not in keys_seen
  54. # Remove trailing " }," or "}," to append
  55. if new_lines and re.search(r'^\s+},?$', new_lines[-1]):
  56. last_line = new_lines.pop()
  57. elif new_lines and re.search(r'^\s+},?$', new_lines[-2]): # Check if last is empty
  58. last_line = new_lines.pop(-2)
  59. else:
  60. last_line = " },"
  61. for key in missing_keys:
  62. if key not in keys_seen:
  63. # Add a descriptive placeholder or common translation
  64. val = f'"{key}"' # Default to key name
  65. new_lines.append(f' {key}: {val},')
  66. new_lines.append(last_line)
  67. processed_blocks.append(block_header + '\n'.join(new_lines))
  68. new_content = header + ''.join(processed_blocks)
  69. with open(file_path, 'w', encoding='utf-8') as f:
  70. f.write(new_content)
  71. if __name__ == "__main__":
  72. clean_translations(sys.argv[1])