# extract_cjk.py (1.3 KB)

  1. import os
  2. import re
  3. import json
  4. directories = ['d:/workspace/AuraK/web', 'd:/workspace/AuraK/server/src']
  5. exclude_dirs = ['node_modules', '.git', 'dist', '.next']
  6. extensions = ['.ts', '.tsx', '.js', '.jsx']
  7. cjk_pattern = re.compile(r'[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff]+')
  8. cjk_lines = {}
  9. for d in directories:
  10. for root, dirs, files in os.walk(d):
  11. dirs[:] = [dir for dir in dirs if dir not in exclude_dirs]
  12. for file in files:
  13. if any(file.endswith(ext) for ext in extensions):
  14. file_path = os.path.join(root, file)
  15. try:
  16. with open(file_path, 'r', encoding='utf-8') as f:
  17. lines = f.readlines()
  18. for i, line in enumerate(lines):
  19. if cjk_pattern.search(line):
  20. if file_path not in cjk_lines:
  21. cjk_lines[file_path] = []
  22. cjk_lines[file_path].append({"line": i + 1, "text": line.strip()})
  23. except Exception as e:
  24. print(f"Error reading {file_path}: {e}")
  25. with open('cjk_extract.json', 'w', encoding='utf-8') as f:
  26. json.dump(cjk_lines, f, ensure_ascii=False, indent=2)
  27. print("Extracted to cjk_extract.json")