// vision.service.ts
  1. import { Injectable, Logger } from '@nestjs/common';
  2. import { I18nService } from '../i18n/i18n.service';
  3. import { ConfigService } from '@nestjs/config';
  4. import { ChatOpenAI } from '@langchain/openai';
  5. import { HumanMessage } from '@langchain/core/messages';
  6. import * as fs from 'fs/promises';
  7. import { VisionAnalysisResult, VisionModelConfig, BatchAnalysisResult, ImageDescription } from './vision.interface';
  8. @Injectable()
  9. export class VisionService {
  10. private readonly logger = new Logger(VisionService.name);
  11. constructor(
  12. private configService: ConfigService,
  13. private i18nService: I18nService,
  14. ) { }
  15. /**
  16. * Analyze single image (document page)
  17. */
  18. async analyzeImage(
  19. imagePath: string,
  20. modelConfig: VisionModelConfig,
  21. pageIndex?: number,
  22. ): Promise<VisionAnalysisResult> {
  23. const maxRetries = 3;
  24. const baseDelay = 3000; // 3 second base delay
  25. for (let attempt = 1; attempt <= maxRetries; attempt++) {
  26. try {
  27. return await this.performAnalysis(imagePath, modelConfig, pageIndex);
  28. } catch (error) {
  29. const isRetryableError = this.isRetryableError(error);
  30. if (attempt === maxRetries || !isRetryableError) {
  31. throw new Error(this.i18nService.formatMessage('visionAnalysisFailed', { message: error.message }));
  32. }
  33. const delay = baseDelay + Math.random() * 2000; // 3-5 second random delay
  34. this.logger.warn(
  35. `⚠️ Failed to analyze page ${pageIndex || '?'} (${attempt}/${maxRetries}), retrying in ${delay.toFixed(0)}ms: ${error.message}`
  36. );
  37. await this.sleep(delay);
  38. }
  39. }
  40. // This line theoretically should not execute, but included to satisfy TypeScript
  41. throw new Error(this.i18nService.getMessage('retryMechanismError'));
  42. }
  43. /**
  44. * Perform actual image analysis
  45. */
  46. private async performAnalysis(
  47. imagePath: string,
  48. modelConfig: VisionModelConfig,
  49. pageIndex?: number,
  50. ): Promise<VisionAnalysisResult> {
  51. try {
  52. // Load image and convert to base64
  53. const imageBuffer = await fs.readFile(imagePath);
  54. const base64Image = imageBuffer.toString('base64');
  55. const mimeType = this.getMimeType(imagePath);
  56. // Create vision model instance
  57. const model = new ChatOpenAI({
  58. apiKey: modelConfig.apiKey,
  59. model: modelConfig.modelId,
  60. configuration: {
  61. baseURL: modelConfig.baseUrl,
  62. },
  63. temperature: 0.1, // Reduce randomness, increase consistency
  64. });
  65. // Build professional document analysis prompt
  66. const systemPrompt = this.i18nService.getMessage('visionSystemPrompt');
  67. const message = new HumanMessage({
  68. content: [
  69. {
  70. type: 'text',
  71. text: systemPrompt,
  72. },
  73. {
  74. type: 'image_url',
  75. image_url: {
  76. url: `data:${mimeType};base64,${base64Image}`,
  77. },
  78. },
  79. ],
  80. });
  81. // Call model
  82. this.logger.log(this.i18nService.formatMessage('visionModelCall', { model: modelConfig.modelId, page: pageIndex || 'single' }));
  83. const response = await model.invoke([message]);
  84. let content = response.content as string;
  85. // Try to parse JSON
  86. let result: VisionAnalysisResult;
  87. try {
  88. // Clean up markdown code block tags
  89. content = content.replace(/```json/g, '').replace(/```/g, '').trim();
  90. const parsed = JSON.parse(content);
  91. result = {
  92. text: parsed.text || '',
  93. images: parsed.images || [],
  94. layout: parsed.layout || 'unknown',
  95. confidence: parsed.confidence ?? 0.8,
  96. pageIndex,
  97. };
  98. } catch (parseError) {
  99. // If parsing fails, treat entire content as text
  100. this.logger.warn(`Failed to parse JSON response for ${imagePath}, using raw text`);
  101. result = {
  102. text: content,
  103. images: [],
  104. layout: 'unknown',
  105. confidence: 0.5,
  106. pageIndex,
  107. };
  108. }
  109. this.logger.log(
  110. this.i18nService.formatMessage('visionAnalysisSuccess', {
  111. path: imagePath,
  112. page: pageIndex ? ` (page ${pageIndex})` : '',
  113. textLen: result.text.length,
  114. imgCount: result.images.length,
  115. layout: result.layout,
  116. confidence: (result.confidence * 100).toFixed(1)
  117. })
  118. );
  119. return result;
  120. } catch (error) {
  121. this.logger.error(
  122. this.i18nService.formatMessage('visionAnalysisFailed', {
  123. message: error.message
  124. })
  125. );
  126. this.logger.error(`Vision analysis error details: ${error.stack}`);
  127. throw error; // Re-throw error for retry mechanism
  128. }
  129. }
  130. /**
  131. * Determine if error is retryable
  132. */
  133. private isRetryableError(error: any): boolean {
  134. const errorMessage = error.message?.toLowerCase() || '';
  135. const errorCode = error.status || error.code;
  136. // 429 rate limit error
  137. if (errorCode === 429 || errorMessage.includes('rate limit') || errorMessage.includes('too many requests')) {
  138. return true;
  139. }
  140. // 5xx server error
  141. if (errorCode >= 500 && errorCode < 600) {
  142. return true;
  143. }
  144. // Network related error
  145. if (errorMessage.includes('timeout') || errorMessage.includes('network') || errorMessage.includes('connection')) {
  146. return true;
  147. }
  148. return false;
  149. }
  150. /**
  151. * Sleep function
  152. */
  153. private sleep(ms: number): Promise<void> {
  154. return new Promise(resolve => setTimeout(resolve, ms));
  155. }
  156. /**
  157. * Batch analyze multiple images
  158. */
  159. async batchAnalyze(
  160. imagePaths: string[],
  161. modelConfig: VisionModelConfig,
  162. options: {
  163. startIndex?: number;
  164. skipQualityCheck?: boolean;
  165. onProgress?: (current: number, total: number, pageResult?: VisionAnalysisResult) => void;
  166. } = {},
  167. ): Promise<BatchAnalysisResult> {
  168. const { startIndex = 1, skipQualityCheck = false, onProgress } = options;
  169. const results: VisionAnalysisResult[] = [];
  170. let successCount = 0;
  171. let failedCount = 0;
  172. this.logger.log(this.i18nService.formatMessage('batchAnalysisStarted', { count: imagePaths.length }));
  173. this.logger.log(`🔧 Model config: ${modelConfig.modelId} (${modelConfig.baseUrl || 'OpenAI'})`);
  174. for (let i = 0; i < imagePaths.length; i++) {
  175. const imagePath = imagePaths[i];
  176. const pageIndex = startIndex + i;
  177. const progress = Math.round(((i + 1) / imagePaths.length) * 100);
  178. this.logger.log(`🖼️ Analyzing page ${pageIndex} (${i + 1}/${imagePaths.length}, ${progress}%)`);
  179. // Call progress callback
  180. if (onProgress) {
  181. onProgress(i + 1, imagePaths.length);
  182. }
  183. // Quality check(skip analysis if skipped)
  184. if (!skipQualityCheck) {
  185. const quality = await this.checkImageQuality(imagePath);
  186. if (!quality.isGood) {
  187. this.logger.warn(`⚠️ Skipped page ${pageIndex} (poor quality): ${quality.reason}`);
  188. failedCount++;
  189. continue;
  190. } else {
  191. this.logger.log(`✅ Page ${pageIndex} quality check passed (score: ${(quality.score || 0).toFixed(2)})`);
  192. }
  193. }
  194. try {
  195. this.logger.log(`🔍 Analyzing page ${pageIndex} with Vision model...`);
  196. const startTime = Date.now();
  197. const result = await this.analyzeImage(imagePath, modelConfig, pageIndex);
  198. const duration = ((Date.now() - startTime) / 1000).toFixed(1);
  199. results.push(result);
  200. successCount++;
  201. this.logger.log(
  202. `✅ Page ${pageIndex} analysis completed (time: ${duration}s, ` +
  203. `text: ${result.text.length} chars, ` +
  204. `images: ${result.images.length}, ` +
  205. `confidence: ${(result.confidence * 100).toFixed(1)}%)`
  206. );
  207. // Call progress callback with result
  208. if (onProgress) {
  209. onProgress(i + 1, imagePaths.length, result);
  210. }
  211. } catch (error) {
  212. this.logger.error(this.i18nService.formatMessage('pageAnalysisFailed', { page: pageIndex }) + `: ${error.message}`);
  213. failedCount++;
  214. }
  215. }
  216. // Calculate estimated cost (assuming $0.01 per image)
  217. const estimatedCost = successCount * 0.01;
  218. this.logger.log(
  219. `🎉 Vision batch analysis completed! ` +
  220. `✅ Success: ${successCount} pages, ❌ Failed: ${failedCount} pages, ` +
  221. `💰 Estimated cost: $${estimatedCost.toFixed(2)}`
  222. );
  223. return {
  224. results,
  225. totalPages: imagePaths.length,
  226. successCount,
  227. failedCount,
  228. estimatedCost,
  229. };
  230. }
  231. /**
  232. * Check image quality
  233. */
  234. async checkImageQuality(imagePath: string): Promise<{ isGood: boolean; reason?: string; score?: number }> {
  235. try {
  236. const stats = await fs.stat(imagePath);
  237. const sizeKB = stats.size / 1024;
  238. // Check file size(5KB+)
  239. if (sizeKB < 5) {
  240. return { isGood: false, reason: `File too small (${sizeKB.toFixed(2)}KB)`, score: 0 };
  241. }
  242. // Check file size limit(10MB)
  243. if (sizeKB > 10240) {
  244. return { isGood: false, reason: `File too large (${sizeKB.toFixed(2)}KB)`, score: 0 };
  245. }
  246. // Simple quality scoring
  247. let score = 0.5;
  248. if (sizeKB > 50) score += 0.2;
  249. if (sizeKB > 100) score += 0.2;
  250. if (sizeKB > 500) score += 0.1;
  251. score = Math.min(score, 1.0);
  252. return { isGood: true, score };
  253. } catch (error) {
  254. return { isGood: false, reason: this.i18nService.formatMessage('imageLoadError', { message: error.message }), score: 0 };
  255. }
  256. }
  257. /**
  258. * Check if file is a supported image format
  259. */
  260. isImageFile(mimetype: string): boolean {
  261. const imageMimeTypes = [
  262. 'image/jpeg',
  263. 'image/jpg',
  264. 'image/png',
  265. 'image/gif',
  266. 'image/bmp',
  267. 'image/webp',
  268. ];
  269. return imageMimeTypes.includes(mimetype);
  270. }
  271. /**
  272. * Get MIME type
  273. */
  274. private getMimeType(filePath: string): string {
  275. const ext = filePath.toLowerCase().split('.').pop();
  276. if (!ext) return 'image/jpeg';
  277. const mimeTypes: Record<string, string> = {
  278. jpg: 'image/jpeg',
  279. jpeg: 'image/jpeg',
  280. png: 'image/png',
  281. gif: 'image/gif',
  282. bmp: 'image/bmp',
  283. webp: 'image/webp',
  284. };
  285. return mimeTypes[ext] || 'image/jpeg';
  286. }
  287. /**
  288. * Legacy interface compatibility: extract content from single image
  289. */
  290. async extractImageContent(
  291. imagePath: string,
  292. modelConfig: { baseUrl: string; apiKey: string; modelId: string },
  293. ): Promise<string> {
  294. const result = await this.analyzeImage(imagePath, modelConfig);
  295. return result.text;
  296. }
  297. }