// knowledge-base.service.ts
  1. import { Injectable, Logger, NotFoundException, Inject, forwardRef } from '@nestjs/common';
  2. import { ConfigService } from '@nestjs/config';
  3. import { I18nService } from '../i18n/i18n.service';
  4. import { InjectRepository } from '@nestjs/typeorm';
  5. import { Repository } from 'typeorm';
  6. import { FileStatus, KnowledgeBase, ProcessingMode } from './knowledge-base.entity';
  7. import { ElasticsearchService } from '../elasticsearch/elasticsearch.service';
  8. import { TikaService } from '../tika/tika.service';
  9. import * as fs from 'fs';
  10. import * as path from 'path';
  11. import { EmbeddingService } from './embedding.service';
  12. import { TextChunkerService } from './text-chunker.service';
  13. import { ModelConfigService } from '../model-config/model-config.service';
  14. import { RagService } from '../rag/rag.service';
  15. import { VisionService } from '../vision/vision.service';
  16. import { UserSettingService } from '../user-setting/user-setting.service';
  17. import { MemoryMonitorService } from './memory-monitor.service';
  18. import { ChunkConfigService } from './chunk-config.service';
  19. import { VisionPipelineService } from '../vision-pipeline/vision-pipeline.service';
  20. import { LibreOfficeService } from '../libreoffice/libreoffice.service';
  21. import { Pdf2ImageService } from '../pdf2image/pdf2image.service';
  22. import { DOC_EXTENSIONS, IMAGE_EXTENSIONS } from '../common/file-support.constants';
  23. import { ChatService } from '../chat/chat.service';
  24. @Injectable()
  25. export class KnowledgeBaseService {
  26. private readonly logger = new Logger(KnowledgeBaseService.name);
  /**
   * Receives every collaborator of the knowledge-base workflows through
   * NestJS dependency injection.
   *
   * `ElasticsearchService`, `RagService` and `ChatService` are injected via
   * `forwardRef` (presumably because those providers depend back on this
   * service - confirm against their modules). All dependencies are declared
   * `readonly`, matching `logger`, so they cannot be reassigned after
   * construction.
   */
  constructor(
    @InjectRepository(KnowledgeBase)
    private readonly kbRepository: Repository<KnowledgeBase>,
    @Inject(forwardRef(() => ElasticsearchService))
    private readonly elasticsearchService: ElasticsearchService,
    private readonly tikaService: TikaService,
    private readonly embeddingService: EmbeddingService,
    private readonly textChunkerService: TextChunkerService,
    private readonly modelConfigService: ModelConfigService,
    @Inject(forwardRef(() => RagService))
    private readonly ragService: RagService,
    private readonly visionService: VisionService,
    private readonly userSettingService: UserSettingService,
    private readonly memoryMonitor: MemoryMonitorService,
    private readonly chunkConfigService: ChunkConfigService,
    private readonly visionPipelineService: VisionPipelineService,
    private readonly libreOfficeService: LibreOfficeService,
    private readonly pdf2ImageService: Pdf2ImageService,
    private readonly configService: ConfigService,
    private readonly i18nService: I18nService,
    @Inject(forwardRef(() => ChatService))
    private readonly chatService: ChatService,
  ) {}
  /**
   * Persists a knowledge-base record for an uploaded file, moves the file into
   * its per-record storage directory and starts content processing.
   *
   * @param fileInfo - Upload descriptor; `originalname`, `path`, `size`,
   *   `mimetype` and `filename` are read from it.
   * @param userId - Owner stored on the new record.
   * @param tenantId - Tenant stored on the new record; `'default'` is used as
   *   the directory name when it is empty.
   * @param config - Optional settings: `mode` (`'precise'`, anything else means
   *   fast), `chunkSize` (default 200), `chunkOverlap` (default 40),
   *   `embeddingModelId` and `waitForCompletion`.
   * @returns The saved record; its `storagePath` points at the new location
   *   when the move succeeded.
   */
  async createAndIndex(
    fileInfo: any,
    userId: string,
    tenantId: string,
    config?: any,
  ): Promise<KnowledgeBase> {
    const mode = config?.mode || 'fast';
    const processingMode = mode === 'precise' ? ProcessingMode.PRECISE : ProcessingMode.FAST;

    const kb = this.kbRepository.create({
      originalName: fileInfo.originalname,
      storagePath: fileInfo.path,
      size: fileInfo.size,
      mimetype: fileInfo.mimetype,
      status: FileStatus.PENDING,
      userId: userId,
      tenantId: tenantId,
      chunkSize: config?.chunkSize || 200,
      chunkOverlap: config?.chunkOverlap || 40,
      embeddingModelId: config?.embeddingModelId || null,
      processingMode: processingMode,
    });
    const savedKb = await this.kbRepository.save(kb);
    this.logger.log(
      `Created KB record: ${savedKb.id}, mode: ${mode}, file: ${fileInfo.originalname}`
    );

    // ---------------------------------------------------------
    // Move the file to the final partitioned directory
    // source: uploads/{tenantId}/{filename} (or wherever it was)
    // target: uploads/{tenantId}/{savedKb.id}/{filename}
    // ---------------------------------------------------------
    // The module-level `fs` and `path` imports are used here; no dynamic import is needed.
    const uploadPath = process.env.UPLOAD_FILE_PATH || './uploads';
    const targetDir = path.join(uploadPath, tenantId || 'default', savedKb.id);
    const targetPath = path.join(targetDir, fileInfo.filename);

    try {
      // `recursive: true` makes this a no-op when the directory already exists.
      fs.mkdirSync(targetDir, { recursive: true });

      if (fs.existsSync(fileInfo.path)) {
        try {
          fs.renameSync(fileInfo.path, targetPath);
        } catch (renameError) {
          // rename cannot cross filesystem boundaries (EXDEV); fall back to
          // copy + delete. Any other failure is handled by the outer catch.
          if ((renameError as { code?: string } | null)?.code !== 'EXDEV') {
            throw renameError;
          }
          fs.copyFileSync(fileInfo.path, targetPath);
          fs.unlinkSync(fileInfo.path);
        }

        // Update the DB record with the new path
        savedKb.storagePath = targetPath;
        await this.kbRepository.save(savedKb);
        this.logger.log(`Moved file to partitioned storage: ${targetPath}`);
      }
    } catch (fsError) {
      this.logger.error(`Failed to move file ${savedKb.id} to partitioned storage`, fsError);
      // We will let it continue, but the file might be stuck in the temp/root folder
    }

    if (config?.waitForCompletion) {
      // The caller asked to wait until processing has finished.
      await this.processFile(savedKb.id, userId, tenantId, config);
    } else {
      // Default: fire and forget; failures are only logged.
      this.processFile(savedKb.id, userId, tenantId, config).catch((err) => {
        this.logger.error(`Error processing file ${savedKb.id}`, err);
      });
    }

    return savedKb;
  }
  /**
   * Lists knowledge-base records, newest first, with their `groups` relation
   * loaded.
   *
   * @param userId - Used as the filter only when no tenant is given.
   * @param tenantId - When provided, records are filtered by tenant instead of
   *   by user.
   * @returns The matching records ordered by `createdAt` descending.
   */
  async findAll(userId: string, tenantId?: string): Promise<KnowledgeBase[]> {
    // The tenant filter takes precedence; the user filter is the fallback.
    const scope: any = tenantId ? { tenantId } : { userId };

    return this.kbRepository.find({
      where: scope,
      relations: ['groups'], // load the group relation
      order: { createdAt: 'DESC' },
    });
  }
  /**
   * Runs a vector search in Elasticsearch and decorates each hit with basic
   * information about the file it belongs to.
   *
   * NOTE(review): the query vector is filled with random numbers and `query`
   * itself is only echoed back in the response, so hits are not related to the
   * query text. A real embedding of `query` is needed here - confirm which
   * embedding model should be used before wiring it in.
   *
   * @param userId - Passed to the Elasticsearch search.
   * @param tenantId - Passed to the Elasticsearch search (tenant-wide visibility).
   * @param query - Echoed back in the response.
   * @param topK - Maximum number of hits requested (default 5).
   * @returns `{ query, results, total }` where each result carries a `file`
   *   summary, or `null` when the file record was not found.
   * @throws Rethrows any error after logging it.
   */
  async searchKnowledge(userId: string, tenantId: string, query: string, topK: number = 5) {
    try {
      // 1. Build a placeholder vector with the configured default dimensionality.
      // An invalid or non-positive env value falls back to 2560 instead of
      // producing a NaN/zero-length vector.
      const fallbackDimensions = 2560;
      const configuredDimensions = Number.parseInt(
        process.env.DEFAULT_VECTOR_DIMENSIONS || String(fallbackDimensions),
        10,
      );
      const dimensions =
        Number.isInteger(configuredDimensions) && configuredDimensions > 0
          ? configuredDimensions
          : fallbackDimensions;
      const queryVector = Array.from({ length: dimensions }, () => Math.random() - 0.5);

      // 2. Search in Elasticsearch
      const searchResults = await this.elasticsearchService.searchSimilar(
        queryVector,
        userId,
        topK,
        tenantId, // Ensure shared visibility within tenant
      );

      // 3. Get file information from database
      const fileIds = [...new Set(searchResults.map((r) => r.fileId))];
      const files = await this.kbRepository.findByIds(fileIds);
      const fileMap = new Map(files.map((f) => [f.id, f]));

      // 4. Combine results with file info
      const results = searchResults.map((result) => {
        const file = fileMap.get(result.fileId);
        return {
          ...result,
          file: file
            ? {
                id: file.id,
                name: file.originalName,
                mimetype: file.mimetype,
                size: file.size,
                createdAt: file.createdAt,
              }
            : null,
        };
      });

      return {
        query,
        results,
        total: results.length,
      };
    } catch (error) {
      // Thrown values are not guaranteed to be Error instances.
      const detail = error instanceof Error ? error.stack || error.message : String(error);
      this.logger.error(`Metadata search failed for tenant ${tenantId}:`, detail);
      throw error;
    }
  }
  /**
   * Performs a RAG lookup for `query` and builds the prompt material from the
   * hits.
   *
   * Failures are logged and converted into an empty result (with the raw query
   * as prompt) rather than rethrown, so callers keep working when retrieval
   * is unavailable.
   *
   * @param userId - User on whose behalf the search runs.
   * @param tenantId - Tenant passed to the RAG service for tenant-wide visibility.
   * @param query - The user's question.
   * @param settings - Search options read here: `topK`, `similarityThreshold`,
   *   `selectedEmbeddingId`, `enableFullTextSearch`, `enableRerank`,
   *   `selectedRerankId`, `rerankSimilarityThreshold`, `language` (default `'ja'`).
   * @returns `{ searchResults, sources, ragPrompt, hasRelevantContent }`.
   */
  async ragSearch(userId: string, tenantId: string, query: string, settings: any) {
    this.logger.log(
      `RAG search request: userId=${userId}, query="${query}", settings=${JSON.stringify(settings)}`,
    );

    try {
      const hits = await this.ragService.searchKnowledge(
        query,
        userId,
        settings.topK,
        settings.similarityThreshold,
        settings.selectedEmbeddingId,
        settings.enableFullTextSearch,
        settings.enableRerank,
        settings.selectedRerankId,
        undefined,
        undefined,
        settings.rerankSimilarityThreshold,
        tenantId, // Ensure shared visibility within tenant for RAG
      );

      const sources = this.ragService.extractSources(hits);
      const ragPrompt = this.ragService.buildRagPrompt(query, hits, settings.language || 'ja');
      const hasRelevantContent = hits.length > 0;

      this.logger.log(`RAG search completed: found ${hits.length} results`);

      return { searchResults: hits, sources, ragPrompt, hasRelevantContent };
    } catch (error) {
      this.logger.error(`RAG search failed for user ${userId}:`, error.stack || error.message);

      // Return an empty result instead of throwing so the system keeps running.
      return {
        searchResults: [],
        sources: [],
        ragPrompt: query, // fall back to the original query
        hasRelevantContent: false,
      };
    }
  }
  /**
   * Deletes one knowledge-base file: the stored file, its Elasticsearch
   * documents, its group associations and finally the database record.
   *
   * Filesystem and Elasticsearch cleanup are best effort: failures there are
   * logged as warnings and do not stop the deletion of the record.
   *
   * @param fileId - Id of the record to delete.
   * @param userId - Used for logging and passed to the Elasticsearch cleanup.
   * @param tenantId - The record must belong to this tenant.
   * @throws NotFoundException when no record with `fileId` exists in the tenant.
   * @throws Rethrows any other error after logging it.
   */
  async deleteFile(fileId: string, userId: string, tenantId: string): Promise<void> {
    this.logger.log(`Deleting file ${fileId} for user ${userId}`);

    try {
      // 1. Get file info (tenant-scoped). The groups are loaded in the same
      // query so step 4 does not need a second round trip.
      const file = await this.kbRepository.findOne({
        where: { id: fileId, tenantId },
        relations: ['groups'],
      });
      if (!file) {
        throw new NotFoundException(this.i18nService.getMessage('fileNotFound'));
      }

      // 2. Delete file from filesystem (best effort)
      try {
        if (fs.existsSync(file.storagePath)) {
          fs.unlinkSync(file.storagePath);
          this.logger.log(`Deleted file: ${file.storagePath}`);
        }

        // Uploads are stored in a directory named after the record id
        // (see createAndIndex). Remove that directory once it is empty; it is
        // left in place if it still holds other files.
        if (file.storagePath) {
          const parentDir = path.dirname(file.storagePath);
          if (
            path.basename(parentDir) === file.id &&
            fs.existsSync(parentDir) &&
            fs.readdirSync(parentDir).length === 0
          ) {
            fs.rmdirSync(parentDir);
            this.logger.log(`Removed empty storage directory: ${parentDir}`);
          }
        }
      } catch (error) {
        this.logger.warn(`Failed to delete file ${file.storagePath}:`, error);
      }

      // 3. Delete from Elasticsearch (best effort)
      try {
        await this.elasticsearchService.deleteByFileId(fileId, userId, tenantId);
        this.logger.log(`Deleted ES documents for file ${fileId}`);
      } catch (error) {
        this.logger.warn(
          `Failed to delete ES documents for file ${fileId}:`,
          error,
        );
      }

      // 4. Remove from all groups (cleanup M2M relations)
      if (file.groups && file.groups.length > 0) {
        // Clear groups to remove entries from join table
        file.groups = [];
        await this.kbRepository.save(file);
        this.logger.log(`Cleared group associations for file ${fileId}`);
      }

      // 5. Delete the database record
      await this.kbRepository.delete({ id: fileId });
      this.logger.log(`Deleted database record for file ${fileId}`);
    } catch (error) {
      this.logger.error(`Failed to delete file ${fileId}`, error);
      throw error;
    }
  }
  /**
   * Deletes every knowledge-base file visible to the caller, one by one.
   *
   * The records are selected with the same scoping rule as `findAll`: by tenant
   * when `tenantId` is set, otherwise by user. Previously all records of all
   * tenants were loaded, which made the tenant-scoped `deleteFile` throw
   * `NotFoundException` on the first foreign record and abort the run.
   *
   * @param userId - Used for logging, as the fallback filter, and passed to `deleteFile`.
   * @param tenantId - Tenant whose records are cleared.
   * @throws Rethrows the first deletion error after logging it.
   */
  async clearAll(userId: string, tenantId: string): Promise<void> {
    this.logger.log(`Clearing all knowledge base data for user ${userId} in tenant ${tenantId}`);

    try {
      // Only fetch records that deleteFile will be able to find.
      const files = await this.kbRepository.find({
        where: tenantId ? { tenantId } : { userId },
      });

      for (const file of files) {
        await this.deleteFile(file.id, userId, tenantId);
      }

      this.logger.log(`Cleared all knowledge base data for user ${userId}`);
    } catch (error) {
      this.logger.error(
        `Failed to clear knowledge base for user ${userId}`,
        error,
      );
      throw error;
    }
  }
  /**
   * Runs the processing pipeline for one stored record.
   *
   * The record is flagged as INDEXING first, then handed to the precise or
   * the fast pipeline depending on `config.mode` (default `'fast'`). Errors
   * raised after the status update are logged and the record is flagged as
   * FAILED instead of being rethrown. When the record cannot be found the
   * method only logs an error and returns.
   */
  private async processFile(kbId: string, userId: string, tenantId: string, config?: any) {
    this.logger.log(`Starting processing for file ${kbId}, mode: ${config?.mode || 'fast'}`);
    await this.updateStatus(kbId, FileStatus.INDEXING);

    try {
      const record = await this.kbRepository.findOne({ where: { id: kbId } });
      if (!record) {
        this.logger.error(`KB not found: ${kbId}`);
        return;
      }

      // Memory monitoring: snapshot before processing starts.
      const heap = this.memoryMonitor.getMemoryUsage();
      this.logger.log(`メモリ状態 - 処理前: ${heap.heapUsed}/${heap.heapTotal}MB`);

      // Pick the pipeline: precise mode uses the vision pipeline, fast mode uses Tika.
      const mode = config?.mode || 'fast';
      await (mode === 'precise'
        ? this.processPreciseMode(record, userId, tenantId, config)
        : this.processFastMode(record, userId, tenantId, config));

      this.logger.log(`File ${kbId} processed successfully in ${mode} mode.`);
    } catch (failure) {
      this.logger.error(`Failed to process file ${kbId}`, failure);
      await this.updateStatus(kbId, FileStatus.FAILED);
    }
  }
  /**
   * Fast-mode processing (the original flow): extract text with Tika, or with
   * the user's vision model for image files, store it, then vectorize it.
   *
   * Vectorization is awaited, but its errors are only logged. Title generation
   * and PDF conversion are started without awaiting and likewise only log
   * their failures.
   *
   * @param kb - Record being processed.
   * @param userId - Used to resolve the user's vision model and for indexing.
   * @param tenantId - Tenant used for model lookup and indexing.
   * @param config - Optional processing settings, forwarded to vectorization.
   */
  private async processFastMode(kb: KnowledgeBase, userId: string, tenantId: string, config?: any) {
    // 1. Extract text with Tika
    let text = await this.tikaService.extractText(kb.storagePath);

    // For image files, use the vision model when one is configured, valid and enabled.
    if (this.visionService.isImageFile(kb.mimetype)) {
      const visionModelId = await this.userSettingService.getVisionModelId(userId);
      if (visionModelId) {
        const visionModel = await this.modelConfigService.findOne(
          visionModelId,
          userId,
          tenantId,
        );
        if (visionModel && visionModel.type === 'vision' && visionModel.isEnabled !== false) {
          text = await this.visionService.extractImageContent(kb.storagePath, {
            baseUrl: visionModel.baseUrl || '',
            apiKey: visionModel.apiKey || '',
            modelId: visionModel.modelId,
          });
        }
      }
    }

    if (!text || text.trim().length === 0) {
      this.logger.warn(this.i18nService.getMessage('noTextExtracted'));
    }
    // Normalize a missing extraction result: the size check, persistence and
    // chunking below would otherwise dereference null/undefined.
    text = text ?? '';

    // Check the size of the extracted text
    const textSizeMB = Math.round(text.length / 1024 / 1024);
    if (textSizeMB > 50) {
      this.logger.warn(this.i18nService.formatMessage('extractedTextTooLarge', { size: textSizeMB }));
    }

    // Store the text in the database
    await this.kbRepository.update(kb.id, { content: text });
    await this.updateStatus(kb.id, FileStatus.EXTRACTED);

    // Vectorize (awaited; failures are logged, not rethrown)
    await this.vectorizeToElasticsearch(kb.id, userId, tenantId, text, config).catch((err) => {
      this.logger.error(`Error vectorizing file ${kb.id}`, err);
    });

    // Automatic title generation (runs in the background)
    this.generateTitle(kb.id).catch((err) => {
      this.logger.error(`Error generating title for file ${kb.id}`, err);
    });

    // Trigger PDF conversion in the background (for document files)
    this.ensurePDFExists(kb.id, userId, tenantId).catch((err) => {
      this.logger.warn(this.i18nService.formatMessage('pdfConversionFailedDetail', { id: kb.id }), err);
    });
  }
  /**
   * Precise-mode processing (the newer flow): run the file through the vision
   * pipeline, store the combined page text plus per-page metadata, then index
   * each page in the background.
   *
   * Falls back to fast mode when the extension is not one of
   * `.pdf/.doc/.docx/.ppt/.pptx`, when no usable vision model is configured,
   * when the pipeline reports failure, or when anything in the pipeline step
   * throws.
   *
   * @param kb - Record being processed.
   * @param userId - Used to resolve the vision model and for indexing.
   * @param tenantId - Tenant used for model lookup, the pipeline and indexing.
   * @param config - Optional processing settings, forwarded on fallback.
   */
  private async processPreciseMode(kb: KnowledgeBase, userId: string, tenantId: string, config?: any) {
    // Single place for the fast-mode fallback used by every exit below.
    const fallBackToFast = () => this.processFastMode(kb, userId, tenantId, config);

    // Check that precise mode supports this file type.
    const supportedExtensions = ['.pdf', '.doc', '.docx', '.ppt', '.pptx'];
    const dotPosition = kb.originalName.lastIndexOf('.');
    const extension = kb.originalName.toLowerCase().substring(dotPosition);
    if (!supportedExtensions.includes(extension)) {
      this.logger.warn(this.i18nService.formatMessage('preciseModeUnsupported', { ext: extension }));
      return fallBackToFast();
    }

    // Check that a vision model is configured.
    const visionModelId = await this.userSettingService.getVisionModelId(userId);
    if (!visionModelId) {
      this.logger.warn(this.i18nService.getMessage('visionModelNotConfiguredFallback'));
      return fallBackToFast();
    }

    const visionModel = await this.modelConfigService.findOne(visionModelId, userId, tenantId);
    const modelUsable =
      Boolean(visionModel) && visionModel.type === 'vision' && visionModel.isEnabled !== false;
    if (!modelUsable) {
      this.logger.warn(this.i18nService.getMessage('visionModelInvalidFallback'));
      return fallBackToFast();
    }

    // Call the vision pipeline.
    try {
      const pipelineResult = await this.visionPipelineService.processPreciseMode(kb.storagePath, {
        userId,
        tenantId, // New
        modelId: visionModelId,
        fileId: kb.id,
        fileName: kb.originalName,
        skipQualityCheck: false,
      });

      if (!pipelineResult.success) {
        this.logger.error(`Vision pipeline failed, falling back to fast mode`);
        this.logger.warn(this.i18nService.getMessage('visionPipelineFailed'));
        return fallBackToFast();
      }

      // Store the text content and a per-page summary in the database.
      const pages = pipelineResult.results;
      const combinedText = pages.map((page) => page.text).join('\n\n');
      const pageSummaries = pages.map((page) => ({
        pageIndex: page.pageIndex,
        confidence: page.confidence,
        layout: page.layout,
        imageCount: page.images.length,
      }));
      const metadata = {
        processedPages: pipelineResult.processedPages,
        failedPages: pipelineResult.failedPages,
        cost: pipelineResult.cost,
        duration: pipelineResult.duration,
        results: pageSummaries,
      };

      await this.kbRepository.update(kb.id, {
        content: combinedText,
        metadata: metadata as any,
      });
      await this.updateStatus(kb.id, FileStatus.EXTRACTED);

      this.logger.log(
        this.i18nService.formatMessage('preciseModeComplete', {
          pages: pipelineResult.processedPages,
          cost: pipelineResult.cost.toFixed(2),
        })
      );

      // Vectorize and index in the background; each page becomes its own
      // document and keeps its metadata.
      this.indexPreciseResults(kb, userId, tenantId, kb.embeddingModelId, pages).catch((err) => {
        this.logger.error(`Error indexing precise results for ${kb.id}`, err);
      });

      // Trigger PDF conversion in the background.
      this.ensurePDFExists(kb.id, userId, tenantId).catch((err) => {
        this.logger.warn(`Initial PDF conversion failed for ${kb.id}`, err);
      });

      // Automatic title generation (runs in the background).
      this.generateTitle(kb.id).catch((err) => {
        this.logger.error(`Error generating title for file ${kb.id}`, err);
      });
    } catch (error) {
      this.logger.error(`Vision pipeline error: ${error.message}, falling back to fast mode`);
      return fallBackToFast();
    }
  }
  /**
   * Indexes precise-mode results: every page is embedded and stored in
   * Elasticsearch as its own document (`{kb.id}_page_{pageIndex}`).
   *
   * Pages are processed in batches of `CHUNK_BATCH_SIZE` (default 50). A failing
   * batch is logged and skipped. The record ends up VECTORIZED unless batches
   * failed and not a single page could be indexed, in which case it is marked
   * FAILED.
   *
   * @param kb - Record the pages belong to.
   * @param userId - Passed to the embedding service and stored on each document.
   * @param tenantId - Stored on each document and used for the dimension lookup.
   * @param embeddingModelId - Embedding model used for the vectors.
   * @param results - Per-page results; `text`, `pageIndex`, `images`, `layout`
   *   and `confidence` are read from each entry.
   */
  private async indexPreciseResults(
    kb: KnowledgeBase,
    userId: string,
    tenantId: string,
    embeddingModelId: string,
    results: any[]
  ): Promise<void> {
    this.logger.log(`Indexing ${results.length} precise results for ${kb.id}`);

    // Make sure the index exists, sized to the model's actual dimensions.
    const actualDimensions = await this.getActualModelDimensions(embeddingModelId, userId, tenantId);
    await this.elasticsearchService.createIndexIfNotExists(actualDimensions);

    // Validate the batch size: 0 would loop forever and NaN would index nothing.
    const defaultBatchSize = 50;
    const configuredBatchSize = Number.parseInt(
      process.env.CHUNK_BATCH_SIZE || String(defaultBatchSize),
      10,
    );
    const batchSize =
      Number.isInteger(configuredBatchSize) && configuredBatchSize > 0
        ? configuredBatchSize
        : defaultBatchSize;

    let indexedPages = 0;
    let failedBatches = 0;

    // Embed and index batch by batch.
    for (let i = 0; i < results.length; i += batchSize) {
      const batch = results.slice(i, i + batchSize);
      const texts = batch.map(r => r.text);

      try {
        // Generate the vectors for this batch.
        const embeddings = await this.embeddingService.getEmbeddings(
          texts,
          userId,
          embeddingModelId
        );

        // Index each page of the batch.
        for (let j = 0; j < batch.length; j++) {
          const result = batch[j];
          const embedding = embeddings[j];

          if (!embedding || embedding.length === 0) {
            this.logger.warn(this.i18nService.formatMessage('skippingEmptyVectorPage', { page: result.pageIndex }));
            continue;
          }

          await this.elasticsearchService.indexDocument(
            `${kb.id}_page_${result.pageIndex}`,
            result.text,
            embedding,
            {
              fileId: kb.id,
              originalName: kb.originalName,
              mimetype: kb.mimetype,
              userId: userId,
              tenantId: tenantId, // New
              pageNumber: result.pageIndex,
              images: result.images,
              layout: result.layout,
              confidence: result.confidence,
              source: 'precise',
              mode: 'vision',
            }
          );
          indexedPages++;
        }

        this.logger.log(`バッチ ${Math.floor(i / batchSize) + 1} 完了: ${batch.length} ページ`);
      } catch (error) {
        failedBatches++;
        this.logger.error(`バッチ ${Math.floor(i / batchSize) + 1} の処理に失敗しました`, error);
      }
    }

    // Do not report success when batches failed and nothing was indexed.
    if (failedBatches > 0 && indexedPages === 0) {
      this.logger.error(
        `Precise indexing failed for ${kb.id}: ${failedBatches} batch(es) failed and no page was indexed`
      );
      await this.updateStatus(kb.id, FileStatus.FAILED);
      return;
    }
    if (failedBatches > 0) {
      this.logger.warn(
        `Precise indexing for ${kb.id} is partial: ${failedBatches} batch(es) failed, ${indexedPages} page(s) indexed`
      );
    }

    await this.updateStatus(kb.id, FileStatus.VECTORIZED);
    this.logger.log(`精密モードのインデックス完了: ${results.length} ページ`);
  }
  /**
   * Returns the path of the rendered image for one page of a file's PDF.
   *
   * The PDF is obtained through `ensurePDFExists` and converted to JPEG images
   * (density 150, quality 75); the image whose `pageIndex` equals
   * `pageIndex + 1` is selected.
   *
   * @param fileId - Record whose PDF is rendered.
   * @param pageIndex - Page to fetch; matched against image `pageIndex` after adding 1.
   * @param userId - Forwarded to `ensurePDFExists`.
   * @param tenantId - Forwarded to `ensurePDFExists`.
   * @returns The `path` of the matching image.
   * @throws NotFoundException when no image matches the requested page.
   */
  async getPageAsImage(fileId: string, pageIndex: number, userId: string, tenantId: string): Promise<string> {
    const pdfPath = await this.ensurePDFExists(fileId, userId, tenantId);

    // Render the PDF to images.
    const conversion = await this.pdf2ImageService.convertToImages(pdfPath, {
      density: 150,
      quality: 75,
      format: 'jpeg',
    });

    // Look for the image that carries the requested page number.
    const pageNumber = pageIndex + 1;
    for (const image of conversion.images) {
      if (image.pageIndex === pageNumber) {
        return image.path;
      }
    }

    throw new NotFoundException(
      this.i18nService.formatMessage('pageImageNotFoundDetail', { page: pageNumber })
    );
  }
  534. private async vectorizeToElasticsearch(
  535. kbId: string,
  536. userId: string,
  537. tenantId: string,
  538. text: string,
  539. config?: any,
  540. ) {
  541. try {
  542. const kb = await this.kbRepository.findOne({ where: { id: kbId, tenantId } });
  543. if (!kb) return;
  544. // メモリ監視 - ベクトル化前チェック
  545. const memBeforeChunk = this.memoryMonitor.getMemoryUsage();
  546. this.logger.log(
  547. `ベクトル化前メモリ: ${memBeforeChunk.heapUsed}/${memBeforeChunk.heapTotal}MB`,
  548. );
  549. this.logger.debug(`File ${kbId}: Validating chunk config...`);
  550. // 1. チャンク設定の検証と修正(モデルの制限と環境変数に基づく)
  551. const validatedConfig = await this.chunkConfigService.validateChunkConfig(
  552. kb.chunkSize,
  553. kb.chunkOverlap,
  554. kb.embeddingModelId,
  555. userId,
  556. );
  557. this.logger.debug(`File ${kbId}: Chunk config validated.`);
  558. // 設定が修正された場合、警告を記録しデータベースを更新
  559. if (validatedConfig.warnings.length > 0) {
  560. this.logger.warn(
  561. this.i18nService.formatMessage('chunkConfigCorrection', { warnings: validatedConfig.warnings.join(', ') })
  562. );
  563. // データベース内の設定を更新
  564. if (validatedConfig.chunkSize !== kb.chunkSize ||
  565. validatedConfig.chunkOverlap !== kb.chunkOverlap) {
  566. await this.kbRepository.update(kbId, {
  567. chunkSize: validatedConfig.chunkSize,
  568. chunkOverlap: validatedConfig.chunkOverlap,
  569. });
  570. }
  571. }
  572. // 設定サマリーを表示(実際に適用される上限を含む)
  573. this.logger.debug(`File ${kbId}: Getting config summary...`);
  574. const configSummary = await this.chunkConfigService.getConfigSummary(
  575. validatedConfig.chunkSize,
  576. validatedConfig.chunkOverlap,
  577. kb.embeddingModelId,
  578. userId,
  579. );
  580. this.logger.log(`チャンク設定: ${configSummary}`);
  581. this.logger.log(`設定上限: チャンク=${validatedConfig.effectiveMaxChunkSize}, 重複=${validatedConfig.effectiveMaxOverlapSize}`);
  582. // 2. 検証済みの設定を使用してチャンク分割
  583. const chunks = this.textChunkerService.chunkText(
  584. text,
  585. validatedConfig.chunkSize,
  586. validatedConfig.chunkOverlap,
  587. );
  588. this.logger.log(`ファイル ${kbId} から ${chunks.length} 個のテキストブロックを分割しました`);
  589. if (chunks.length === 0) {
  590. this.logger.warn(this.i18nService.formatMessage('noChunksGenerated', { id: kbId }));
  591. await this.updateStatus(kbId, FileStatus.VECTORIZED);
  592. return;
  593. }
  594. // 3. チャンク数が妥当か確認
  595. const estimatedChunkCount = this.chunkConfigService.estimateChunkCount(
  596. text.length,
  597. validatedConfig.chunkSize,
  598. );
  599. if (chunks.length > estimatedChunkCount * 1.2) {
  600. this.logger.warn(
  601. this.i18nService.formatMessage('chunkCountAnomaly', { actual: chunks.length, estimated: estimatedChunkCount })
  602. );
  603. }
  604. // 4. 推奨バッチサイズを取得(モデルの制限に基づく)
  605. const recommendedBatchSize = await this.chunkConfigService.getRecommendedBatchSize(
  606. kb.embeddingModelId,
  607. userId,
  608. tenantId,
  609. parseInt(process.env.CHUNK_BATCH_SIZE || '100'),
  610. );
  611. // 5. メモリ使用量を推定
  612. const avgChunkSize = chunks.reduce((sum, c) => sum + c.content.length, 0) / chunks.length;
  613. const estimatedMemory = this.memoryMonitor.estimateMemoryUsage(
  614. chunks.length,
  615. avgChunkSize,
  616. parseInt(process.env.DEFAULT_VECTOR_DIMENSIONS || '2560'),
  617. );
  618. this.logger.log(`推定メモリ使用量: ${estimatedMemory}MB (バッチサイズ: ${recommendedBatchSize})`);
  619. // 6. 実際のモデル次元数を取得し、インデックスの存在を確認
  620. const actualDimensions = await this.getActualModelDimensions(kb.embeddingModelId, userId, tenantId);
  621. await this.elasticsearchService.createIndexIfNotExists(actualDimensions);
  622. // 7. ベクトル化とインデックス作成をバッチ処理
  623. const useBatching = this.memoryMonitor.shouldUseBatching(
  624. chunks.length,
  625. avgChunkSize,
  626. actualDimensions,
  627. );
  628. if (useBatching) {
  629. try {
  630. await this.processInBatches(
  631. chunks,
  632. async (batch, batchIndex) => {
  633. // バッチサイズがモデルの制限を超えていないか検証
  634. if (batch.length > recommendedBatchSize) {
  635. this.logger.warn(
  636. this.i18nService.formatMessage('batchSizeExceeded', { index: batchIndex, actual: batch.length, limit: recommendedBatchSize })
  637. );
  638. }
  639. const chunkTexts = batch.map((chunk) => chunk.content);
  640. const embeddings = await this.embeddingService.getEmbeddings(
  641. chunkTexts,
  642. userId,
  643. kb.embeddingModelId,
  644. );
  645. // 次元の整合性を検証
  646. if (embeddings.length > 0 && embeddings[0].length !== actualDimensions) {
  647. this.logger.warn(
  648. `ベクトル次元が不一致です: 期待値 ${actualDimensions}, 実際 ${embeddings[0].length}`
  649. );
  650. }
  651. // このバッチデータを即座にインデックス
  652. for (let i = 0; i < batch.length; i++) {
  653. const chunk = batch[i];
  654. const embedding = embeddings[i];
  655. if (!embedding || embedding.length === 0) {
  656. this.logger.warn(this.i18nService.formatMessage('skippingEmptyVectorChunk', { index: chunk.index }));
  657. continue;
  658. }
  659. await this.elasticsearchService.indexDocument(
  660. `${kb.id}_chunk_${chunk.index}`,
  661. chunk.content,
  662. embedding,
  663. {
  664. fileId: kb.id,
  665. originalName: kb.originalName,
  666. mimetype: kb.mimetype,
  667. userId: userId,
  668. chunkIndex: chunk.index,
  669. startPosition: chunk.startPosition,
  670. tenantId, // Passing tenantId to ES
  671. }
  672. );
  673. }
  674. this.logger.log(`バッチ ${batchIndex} 完了: ${batch.length} チャンク`);
  675. },
  676. {
  677. batchSize: recommendedBatchSize,
  678. onBatchComplete: (batchIndex, totalBatches) => {
  679. const mem = this.memoryMonitor.getMemoryUsage();
  680. this.logger.log(
  681. `バッチ ${batchIndex}/${totalBatches} 完了, メモリ: ${mem.heapUsed}MB`,
  682. );
  683. },
  684. },
  685. );
  686. } catch (error) {
  687. // コンテキスト長エラーを検出(日本語・中国語・英語に対応)
  688. if (error.message && (error.message.includes('context length') || error.message.includes('コンテキスト長が上限を超えています') || error.message.includes('コンテキスト長が上限を超えています'))) {
  689. this.logger.warn(this.i18nService.getMessage('contextLengthErrorFallback'));
  690. // 単一テキスト処理にダウングレード
  691. for (let i = 0; i < chunks.length; i++) {
  692. const chunk = chunks[i];
  693. try {
  694. const embeddings = await this.embeddingService.getEmbeddings(
  695. [chunk.content], // 単一テキスト
  696. userId,
  697. kb.embeddingModelId,
  698. );
  699. if (!embeddings[0] || embeddings[0].length === 0) {
  700. this.logger.warn(this.i18nService.formatMessage('skippingEmptyVectorChunk', { index: chunk.index }));
  701. continue;
  702. }
  703. await this.elasticsearchService.indexDocument(
  704. `${kb.id}_chunk_${chunk.index}`,
  705. chunk.content,
  706. embeddings[0],
  707. {
  708. fileId: kb.id,
  709. originalName: kb.originalName,
  710. mimetype: kb.mimetype,
  711. userId: userId,
  712. chunkIndex: chunk.index,
  713. startPosition: chunk.startPosition,
  714. endPosition: chunk.endPosition,
  715. tenantId,
  716. }
  717. );
  718. if ((i + 1) % 10 === 0) {
  719. this.logger.log(`単一処理進捗: ${i + 1}/${chunks.length}`);
  720. }
  721. } catch (chunkError) {
  722. this.logger.error(
  723. `テキストブロック ${chunk.index} の処理に失敗しました。スキップします: ${chunkError.message}`
  724. );
  725. continue;
  726. }
  727. }
  728. this.logger.log(`単一テキスト処理完了: ${chunks.length} チャンク`);
  729. } else {
  730. // その他のエラーは直接スロー
  731. throw error;
  732. }
  733. }
  734. } else {
  735. // 小さなファイル、一括処理(ただしバッチ制限の確認が必要)
  736. const chunkTexts = chunks.map((chunk) => chunk.content);
  737. // チャンク数がモデルのバッチ制限を超える場合は、強制的にバッチ処理
  738. if (chunks.length > recommendedBatchSize) {
  739. this.logger.warn(
  740. this.i18nService.formatMessage('chunkLimitExceededForceBatch', { actual: chunks.length, limit: recommendedBatchSize })
  741. );
  742. try {
  743. await this.processInBatches(
  744. chunks,
  745. async (batch, batchIndex) => {
  746. const batchTexts = batch.map((c) => c.content);
  747. const embeddings = await this.embeddingService.getEmbeddings(
  748. batchTexts,
  749. userId,
  750. kb.embeddingModelId,
  751. );
  752. for (let i = 0; i < batch.length; i++) {
  753. const chunk = batch[i];
  754. const embedding = embeddings[i];
  755. if (!embedding || embedding.length === 0) {
  756. this.logger.warn(`空ベクトルのテキストブロック ${chunk.index} をスキップします`);
  757. continue;
  758. }
  759. await this.elasticsearchService.indexDocument(
  760. `${kb.id}_chunk_${chunk.index}`,
  761. chunk.content,
  762. embedding,
  763. {
  764. fileId: kb.id,
  765. originalName: kb.originalName,
  766. mimetype: kb.mimetype,
  767. userId: userId,
  768. chunkIndex: chunk.index,
  769. startPosition: chunk.startPosition,
  770. endPosition: chunk.endPosition,
  771. tenantId, // Passing tenantId to ES metadata
  772. }
  773. );
  774. }
  775. },
  776. );
  777. } catch (error) {
  778. // コンテキスト長エラーを検出(日本語・中国語・英語に対応)
  779. if (error.message && (error.message.includes('context length') || error.message.includes('コンテキスト長が上限を超えています') || error.message.includes('コンテキスト長が上限を超えています'))) {
  780. this.logger.warn(this.i18nService.getMessage('batchContextLengthErrorFallback'));
  781. // 単一テキスト処理にダウングレード
  782. for (let i = 0; i < chunks.length; i++) {
  783. const chunk = chunks[i];
  784. try {
  785. const embeddings = await this.embeddingService.getEmbeddings(
  786. [chunk.content], // 単一テキスト
  787. userId,
  788. kb.embeddingModelId,
  789. );
  790. if (!embeddings[0] || embeddings[0].length === 0) {
  791. this.logger.warn(this.i18nService.formatMessage('skippingEmptyVectorChunk', { index: chunk.index }));
  792. continue;
  793. }
  794. await this.elasticsearchService.indexDocument(
  795. `${kb.id}_chunk_${chunk.index}`,
  796. chunk.content,
  797. embeddings[0],
  798. {
  799. fileId: kb.id,
  800. originalName: kb.originalName,
  801. mimetype: kb.mimetype,
  802. userId: userId,
  803. chunkIndex: chunk.index,
  804. startPosition: chunk.startPosition,
  805. endPosition: chunk.endPosition,
  806. },
  807. );
  808. if ((i + 1) % 10 === 0) {
  809. this.logger.log(`単一処理進捗: ${i + 1}/${chunks.length}`);
  810. }
  811. } catch (chunkError) {
  812. this.logger.error(
  813. this.i18nService.formatMessage('chunkProcessingFailed', { index: chunk.index, message: chunkError.message })
  814. );
  815. continue;
  816. }
  817. }
  818. this.logger.log(this.i18nService.formatMessage('singleTextProcessingComplete', { count: chunks.length }));
  819. } else {
  820. // その他のエラー、直接スロー
  821. throw error;
  822. }
  823. }
  824. } else {
  825. // 十分に小さいファイルの場合は一括で処理
  826. try {
  827. const embeddings = await this.embeddingService.getEmbeddings(
  828. chunkTexts,
  829. userId,
  830. kb.embeddingModelId,
  831. );
  832. for (let i = 0; i < chunks.length; i++) {
  833. const chunk = chunks[i];
  834. const embedding = embeddings[i];
  835. if (!embedding || embedding.length === 0) {
  836. this.logger.warn(this.i18nService.formatMessage('skippingEmptyVectorChunk', { index: chunk.index }));
  837. continue;
  838. }
  839. await this.elasticsearchService.indexDocument(
  840. `${kb.id}_chunk_${chunk.index}`,
  841. chunk.content,
  842. embedding,
  843. {
  844. fileId: kb.id,
  845. originalName: kb.originalName,
  846. mimetype: kb.mimetype,
  847. userId: userId,
  848. chunkIndex: chunk.index,
  849. startPosition: chunk.startPosition,
  850. endPosition: chunk.endPosition,
  851. },
  852. );
  853. }
  854. } catch (error) {
  855. // コンテキスト長エラーを検出(日本語・中国語・英語に対応)
  856. if (error.message && (error.message.includes('context length') || error.message.includes('コンテキスト長が上限を超えています') || error.message.includes('コンテキスト長が上限を超えています'))) {
  857. this.logger.warn(this.i18nService.getMessage('batchContextLengthErrorFallback'));
  858. // 単一テキスト処理にダウングレード
  859. for (let i = 0; i < chunks.length; i++) {
  860. const chunk = chunks[i];
  861. try {
  862. const embeddings = await this.embeddingService.getEmbeddings(
  863. [chunk.content], // 単一テキスト
  864. userId,
  865. kb.embeddingModelId,
  866. );
  867. if (!embeddings[0] || embeddings[0].length === 0) {
  868. this.logger.warn(`空ベクトルのテキストブロック ${chunk.index} をスキップします`);
  869. continue;
  870. }
  871. await this.elasticsearchService.indexDocument(
  872. `${kb.id}_chunk_${chunk.index}`,
  873. chunk.content,
  874. embeddings[0],
  875. {
  876. fileId: kb.id,
  877. originalName: kb.originalName,
  878. mimetype: kb.mimetype,
  879. userId: userId,
  880. chunkIndex: chunk.index,
  881. startPosition: chunk.startPosition,
  882. endPosition: chunk.endPosition,
  883. },
  884. );
  885. if ((i + 1) % 10 === 0) {
  886. this.logger.log(`単一処理進捗: ${i + 1}/${chunks.length}`);
  887. }
  888. } catch (chunkError) {
  889. this.logger.error(
  890. `テキストブロック ${chunk.index} の処理に失敗しました。スキップします: ${chunkError.message}`
  891. );
  892. continue;
  893. }
  894. }
  895. this.logger.log(this.i18nService.formatMessage('singleTextProcessingComplete', { count: chunks.length }));
  896. } else {
  897. // その他のエラー、直接スロー
  898. throw error;
  899. }
  900. }
  901. }
  902. }
  903. await this.updateStatus(kbId, FileStatus.VECTORIZED);
  904. const memAfter = this.memoryMonitor.getMemoryUsage();
  905. this.logger.log(
  906. this.i18nService.formatMessage('fileVectorizationComplete', { id: kbId, count: chunks.length, memory: memAfter.heapUsed })
  907. );
  908. } catch (error) {
  909. this.logger.error(this.i18nService.formatMessage('fileVectorizationFailed', { id: kbId }), error);
  910. // エラー情報を metadata に保存
  911. try {
  912. const kb = await this.kbRepository.findOne({ where: { id: kbId } });
  913. if (kb) {
  914. const metadata = kb.metadata || {};
  915. metadata.lastError = error.message;
  916. metadata.failedAt = new Date().toISOString();
  917. await this.kbRepository.update(kbId, { metadata });
  918. }
  919. } catch (e) {
  920. this.logger.warn(`Failed to update metadata for failed file ${kbId}`, e);
  921. }
  922. await this.updateStatus(kbId, FileStatus.FAILED);
  923. }
  924. }
  /**
   * Processes `items` sequentially in batches, with memory-based throttling.
   *
   * Before every batch the memory monitor is awaited (`waitForMemoryAvailable`) and asked
   * for a dynamic batch size; that size is capped by the configured limit so a caller-supplied
   * limit (e.g. an embedding model's maximum batch) is never exceeded.
   *
   * @param items     Items to process; an empty array is a no-op.
   * @param processor Invoked once per batch with the batch and its 1-based index. A rejection
   *                  aborts the remaining batches and propagates to the caller.
   * @param options   `batchSize` is the upper bound per batch (falls back to the
   *                  `CHUNK_BATCH_SIZE` environment variable, then to 100);
   *                  `onBatchComplete` is called after each successfully processed batch.
   */
  private async processInBatches<T>(
    items: T[],
    processor: (batch: T[], batchIndex: number) => Promise<void>,
    options?: {
      batchSize?: number;
      onBatchComplete?: (batchIndex: number, totalBatches: number) => void;
    },
  ): Promise<void> {
    const totalItems = items.length;
    if (totalItems === 0) return;

    const startTime = Date.now();
    this.logger.log(this.i18nService.formatMessage('batchProcessingStarted', { count: totalItems }));

    // Use the provided batch size or fall back to env/default. An invalid value (NaN, zero,
    // negative) must not reach the loop: NaN would end the loop without processing anything
    // and a non-positive size would never advance the cursor.
    const fallbackBatchSize = 100;
    const requestedBatchSize =
      options?.batchSize || parseInt(process.env.CHUNK_BATCH_SIZE || String(fallbackBatchSize), 10);
    const initialBatchSize =
      Number.isFinite(requestedBatchSize) && requestedBatchSize >= 1
        ? Math.floor(requestedBatchSize)
        : fallbackBatchSize;

    // heapUsed reading above which a GC is requested after the batch.
    // NOTE(review): presumably megabytes (the file logs heapUsed with an "MB" suffix) - confirm.
    const gcThreshold = 800;

    // Counted explicitly: deriving the index from offset / batchSize skips or repeats numbers
    // whenever the dynamic batch size changes between iterations.
    let batchIndex = 0;

    for (let offset = 0; offset < totalItems; ) {
      // Wait until the memory monitor reports that memory is available.
      await this.memoryMonitor.waitForMemoryAvailable();

      // The monitor may suggest a larger size than the configured limit; the limit wins.
      const currentMem = this.memoryMonitor.getMemoryUsage().heapUsed;
      const dynamicBatchSize = this.memoryMonitor.getDynamicBatchSize(currentMem);
      const cappedBatchSize = Number.isFinite(dynamicBatchSize)
        ? Math.min(Math.floor(dynamicBatchSize), initialBatchSize)
        : initialBatchSize;
      // Always make progress, even if the monitor suggests zero or a negative size.
      const batchSize = Math.max(1, cappedBatchSize);

      const batch = items.slice(offset, offset + batchSize);
      batchIndex += 1;
      // Batches done before this one plus the batches still needed at the current size.
      // With a constant batch size this equals ceil(totalItems / batchSize).
      const totalBatches = batchIndex - 1 + Math.ceil((totalItems - offset) / batchSize);

      this.logger.log(
        this.i18nService.formatMessage('batchProcessingProgress', { index: batchIndex, total: totalBatches, count: batch.length })
      );

      await processor(batch, batchIndex);

      if (options?.onBatchComplete) {
        options.onBatchComplete(batchIndex, totalBatches);
      }

      // Request a GC when the reading taken before this batch was above the threshold.
      if (currentMem > gcThreshold) {
        this.memoryMonitor.forceGC();
      }

      // Empty the slice so its elements are no longer referenced through it.
      batch.length = 0;
      offset += batchSize;
    }

    const duration = ((Date.now() - startTime) / 1000).toFixed(2);
    this.logger.log(this.i18nService.formatMessage('batchProcessingComplete', { count: totalItems, duration }));
  }
  /**
   * Retries vectorization for a file whose previous attempt failed.
   *
   * The file is looked up by id within the given tenant. It must currently be in
   * `FileStatus.FAILED` and have non-blank stored content. Its status is reset to
   * `INDEXING` and vectorization is started in the background (not awaited).
   *
   * @param fileId   Id of the knowledge-base file to retry.
   * @param userId   User on whose behalf vectorization runs.
   * @param tenantId Tenant the file must belong to.
   * @returns The record as re-read right after the status reset.
   * @throws NotFoundException when no file matches the id/tenant pair.
   * @throws Error when the file is not in FAILED status or has no content.
   */
  async retryFailedFile(fileId: string, userId: string, tenantId: string): Promise<KnowledgeBase> {
    this.logger.log(`Retrying failed file ${fileId} for user ${userId} in tenant ${tenantId}`);

    // 1. Load the file, restricted to the caller's tenant.
    const kb = await this.kbRepository.findOne({
      where: { id: fileId, tenantId },
    });
    if (!kb) {
      // Localized like the other lookups in this service instead of a hard-coded literal.
      throw new NotFoundException(this.i18nService.getMessage('fileNotFound'));
    }

    if (kb.status !== FileStatus.FAILED) {
      throw new Error(this.i18nService.formatMessage('onlyFailedFilesRetryable', { status: kb.status }));
    }

    if (!kb.content || kb.content.trim().length === 0) {
      throw new Error(this.i18nService.getMessage('emptyFileRetryFailed'));
    }

    // 2. Reset the status to INDEXING.
    await this.updateStatus(fileId, FileStatus.INDEXING);

    // 3. Trigger vectorization without awaiting it; a rejection is only logged here.
    void this.vectorizeToElasticsearch(
      fileId,
      userId,
      tenantId,
      kb.content,
      {
        chunkSize: kb.chunkSize,
        chunkOverlap: kb.chunkOverlap,
        embeddingModelId: kb.embeddingModelId,
      }
    ).catch((err) => {
      this.logger.error(`Retry vectorization failed for file ${fileId}`, err);
    });

    // 4. Return the record as it looks after the status reset.
    const updatedKb = await this.kbRepository.findOne({ where: { id: fileId, tenantId } });
    if (!updatedKb) {
      throw new NotFoundException(this.i18nService.getMessage('fileNotFound'));
    }
    return updatedKb;
  }
  /**
   * Returns all indexed chunks of a file together with the file's chunking settings.
   *
   * @param fileId   Id of the knowledge-base file.
   * @param userId   Passed through to the Elasticsearch chunk query.
   * @param tenantId Tenant the file must belong to; also passed to the chunk query.
   * @returns File id and name, chunk size/overlap, the chunk count and, per chunk, its index,
   *          content, content length and start/end positions.
   * @throws NotFoundException when no file matches the id/tenant pair.
   */
  async getFileChunks(fileId: string, userId: string, tenantId: string) {
    this.logger.log(`Getting chunks for file ${fileId}, user ${userId}, tenant ${tenantId}`);

    // 1. Load the file, restricted to the caller's tenant.
    const kb = await this.kbRepository.findOne({
      where: { id: fileId, tenantId },
    });
    if (!kb) {
      // Localized like the other lookups in this service instead of a hard-coded literal.
      throw new NotFoundException(this.i18nService.getMessage('fileNotFound'));
    }

    // 2. Fetch every chunk of the file from Elasticsearch.
    const chunks = await this.elasticsearchService.getFileChunks(fileId, userId, tenantId);

    // 3. Shape the response.
    return {
      fileId: kb.id,
      fileName: kb.originalName,
      totalChunks: chunks.length,
      chunkSize: kb.chunkSize,
      chunkOverlap: kb.chunkOverlap,
      chunks: chunks.map(chunk => ({
        index: chunk.chunkIndex,
        content: chunk.content,
        contentLength: chunk.content.length,
        startPosition: chunk.startPosition,
        endPosition: chunk.endPosition,
      })),
    };
  }
  /**
   * Writes a new processing status to the knowledge-base record with the given id.
   *
   * @param id     Id of the record to update.
   * @param status Status value to store.
   */
  private async updateStatus(id: string, status: FileStatus): Promise<void> {
    const statusPatch = { status };
    await this.kbRepository.update(id, statusPatch);
  }
  // PDF preview related methods

  /**
   * Returns the filesystem path of a PDF rendition of a stored file, converting it on demand.
   *
   * Files whose mimetype is `application/pdf` are returned as-is. Other files are converted
   * with LibreOffice when their extension is listed in DOC_EXTENSIONS or IMAGE_EXTENSIONS.
   * The PDF is written next to the stored file under the same base name, and its path is
   * saved to the record's `pdfPath`.
   *
   * @param fileId   Id of the knowledge-base file.
   * @param userId   Not used by this method.
   * @param tenantId Tenant the file must belong to.
   * @param force    When true, an existing converted PDF is deleted and regenerated.
   * @returns Path of the PDF file.
   * @throws NotFoundException when no file matches the id/tenant pair.
   * @throws Error when the format cannot be previewed or the conversion fails.
   */
  async ensurePDFExists(fileId: string, userId: string, tenantId: string, force: boolean = false): Promise<string> {
    const kb = await this.kbRepository.findOne({
      where: { id: fileId, tenantId },
    });
    if (!kb) {
      throw new NotFoundException(this.i18nService.getMessage('fileNotFound'));
    }

    // The original file already is a PDF: serve it directly.
    if (kb.mimetype === 'application/pdf') {
      return kb.storagePath;
    }

    // Only document and image formats can be converted for preview.
    const ext = kb.originalName.toLowerCase().split('.').pop() || '';
    const isConvertible = [...DOC_EXTENSIONS, ...IMAGE_EXTENSIONS].includes(ext);
    if (!isConvertible) {
      this.logger.log(`Skipping PDF conversion for unsupported format: .${ext} (${kb.originalName})`);
      throw new Error(this.i18nService.getMessage('pdfPreviewNotSupported'));
    }

    // Derive the PDF path: same directory and base name as the stored file, ".pdf" extension.
    const path = await import('path');
    const fs = await import('fs');
    const uploadDir = path.dirname(kb.storagePath);
    const baseName = path.basename(kb.storagePath, path.extname(kb.storagePath));
    const pdfPath = path.join(uploadDir, `${baseName}.pdf`);

    // If the stored file itself is named "<base>.pdf" (mimetype not reported as PDF), the
    // derived path IS the source file. Forced regeneration must never delete it, otherwise
    // the upload is lost and the conversion has no input.
    const pdfIsSourceFile = path.resolve(pdfPath) === path.resolve(kb.storagePath);
    const regenerate = force && !pdfIsSourceFile;

    // Forced regeneration: remove the previously converted PDF first.
    if (regenerate && fs.existsSync(pdfPath)) {
      try {
        fs.unlinkSync(pdfPath);
        this.logger.log(`Forced regeneration: Deleted existing PDF ${pdfPath}`);
      } catch (e) {
        const reason = e instanceof Error ? e.message : String(e);
        this.logger.warn(`Failed to delete existing PDF for regeneration: ${reason}`);
      }
    }

    // Already converted and no regeneration requested: reuse the existing file.
    if (fs.existsSync(pdfPath) && !regenerate) {
      if (!kb.pdfPath) {
        await this.kbRepository.update(kb.id, { pdfPath: pdfPath });
      }
      return pdfPath;
    }

    // A conversion is required.
    try {
      this.logger.log(`Starting PDF conversion for ${kb.originalName} at ${kb.storagePath}`);

      await this.libreOfficeService.convertToPDF(kb.storagePath);

      // Verify the converter actually produced a non-empty file at the expected path.
      if (!fs.existsSync(pdfPath)) {
        throw new Error(`PDF conversion completed but file not found at ${pdfPath}`);
      }
      const stats = fs.statSync(pdfPath);
      if (stats.size === 0) {
        fs.unlinkSync(pdfPath);
        throw new Error(`PDF conversion failed: output file is empty`);
      }

      await this.kbRepository.update(kb.id, { pdfPath: pdfPath });
      this.logger.log(`PDF conversion successful: ${pdfPath}`);
      return pdfPath;
    } catch (error) {
      // The detailed cause is logged; callers receive a localized, generic failure message.
      const cause = error instanceof Error ? error : new Error(String(error));
      this.logger.error(`PDF conversion failed for ${fileId}: ${cause.message}`, cause.stack);
      throw new Error(this.i18nService.formatMessage('pdfConversionFailedDetail', { id: fileId }));
    }
  }
  /**
   * Reports whether a PDF preview of the file is available, without triggering a conversion.
   *
   * @param fileId   Id of the knowledge-base file.
   * @param userId   Embedded in the temporary access token.
   * @param tenantId Tenant the file must belong to; also embedded in the token.
   * @returns `{ status: 'ready', url }` with a tokenized download URL when the file is a PDF
   *          or a converted PDF exists on disk; otherwise `{ status: 'pending' }`.
   * @throws NotFoundException when no file matches the id/tenant pair.
   */
  async getPDFStatus(fileId: string, userId: string, tenantId: string) {
    const kb = await this.kbRepository.findOne({
      where: { id: fileId, tenantId },
    });
    if (!kb) {
      throw new NotFoundException(this.i18nService.getMessage('fileNotFound'));
    }

    // Builds the "ready" response with a freshly signed temporary token.
    const readyStatus = () => {
      const token = this.generateTempToken(fileId, userId, tenantId);
      return {
        status: 'ready',
        url: `/api/knowledge-bases/${fileId}/pdf?token=${token}`,
      };
    };

    // The original file already is a PDF.
    if (kb.mimetype === 'application/pdf') {
      return readyStatus();
    }

    // Derive the PDF path: same directory and base name as the stored file, ".pdf" extension.
    const path = await import('path');
    const fs = await import('fs');
    const uploadDir = path.dirname(kb.storagePath);
    const baseName = path.basename(kb.storagePath, path.extname(kb.storagePath));
    const pdfPath = path.join(uploadDir, `${baseName}.pdf`);

    // A converted PDF already exists on disk.
    if (fs.existsSync(pdfPath)) {
      if (!kb.pdfPath) {
        // Targeted column update (as in ensurePDFExists) rather than saving the whole entity
        // read above, so columns changed concurrently are not written back with stale values.
        await this.kbRepository.update(kb.id, { pdfPath });
      }
      return readyStatus();
    }

    // Not converted yet.
    return {
      status: 'pending',
    };
  }
  /**
   * Signs a JWT that grants temporary access to a file's PDF.
   *
   * The payload carries the file, user and tenant ids plus `type: 'pdf-access'`; the token is
   * signed with the `JWT_SECRET` environment variable and expires after one hour.
   *
   * @throws Error when `JWT_SECRET` is not set.
   */
  private generateTempToken(fileId: string, userId: string, tenantId: string): string {
    const jsonwebtoken = require('jsonwebtoken');
    const signingSecret = process.env.JWT_SECRET;

    if (signingSecret) {
      const claims = { fileId, userId, tenantId, type: 'pdf-access' };
      const signOptions = { expiresIn: '1h' };
      return jsonwebtoken.sign(claims, signingSecret, signOptions);
    }

    throw new Error('JWT_SECRET environment variable is required but not set');
  }
  /**
   * Resolves the vector dimension count of an embedding model.
   *
   * Resolution order:
   *  1. the `dimensions` value stored in the model configuration;
   *  2. the length of a vector obtained by embedding the probe text `'probe'` (the detected
   *     value is then written back to the model configuration, best-effort);
   *  3. the `DEFAULT_VECTOR_DIMENSIONS` environment variable, or 2560 when it is unset or
   *     not a positive integer.
   *
   * Never throws: any failure during steps 1-2 is logged and the default is returned.
   *
   * @param embeddingModelId Id of the embedding model configuration.
   * @param userId           User passed to the model-config and embedding services.
   * @param tenantId         Tenant passed to the model-config service.
   * @returns The resolved dimension count.
   */
  private async getActualModelDimensions(embeddingModelId: string, userId: string, tenantId: string): Promise<number> {
    // Guard against a malformed env value: parseInt would yield NaN (or a non-positive number),
    // which must not be handed out as a dimension count.
    const builtinDimensions = 2560;
    const envDimensions = parseInt(
      process.env.DEFAULT_VECTOR_DIMENSIONS || String(builtinDimensions),
      10,
    );
    const defaultDimensions =
      Number.isFinite(envDimensions) && envDimensions > 0 ? envDimensions : builtinDimensions;

    try {
      // 1. Prefer the value stored in the model configuration.
      const modelConfig = await this.modelConfigService.findOne(
        embeddingModelId,
        userId,
        tenantId,
      );

      if (modelConfig && modelConfig.dimensions) {
        this.logger.log(`設定から ${modelConfig.name} の次元数を取得しました: ${modelConfig.dimensions}`);
        return modelConfig.dimensions;
      }

      // 2. Otherwise probe the model with a tiny input and measure the vector length.
      this.logger.log(`モデル次元数をプローブ中: ${embeddingModelId}`);
      const probeEmbeddings = await this.embeddingService.getEmbeddings(
        ['probe'],
        userId,
        embeddingModelId,
      );

      const probeVector = probeEmbeddings[0];
      const actualDimensions = probeVector ? probeVector.length : 0;

      // An empty or missing vector is not a usable measurement; fall through to the default.
      if (actualDimensions > 0) {
        this.logger.log(`モデルの実際の次元数を検出しました: ${actualDimensions}`);

        // Persist the detected value so the next call can skip the probe. Best-effort only.
        if (modelConfig) {
          try {
            await this.modelConfigService.update(userId, tenantId, modelConfig.id, {
              dimensions: actualDimensions,
            });
            this.logger.log(`モデル ${modelConfig.name} の次元数設定を ${actualDimensions} に更新しました`);
          } catch (updateErr) {
            const reason = updateErr instanceof Error ? updateErr.message : String(updateErr);
            this.logger.warn(`モデル次元数設定の更新に失敗しました: ${reason}`);
          }
        }

        return actualDimensions;
      }

      this.logger.warn(
        `プローブで有効なベクトルが得られませんでした。デフォルト次元数を使用します: ${defaultDimensions}`,
      );
    } catch (err) {
      // The reason is embedded in the message itself.
      // NOTE(review): as a separate trailing string argument it is likely taken as the
      // logger context by NestJS's Logger rather than printed as detail - confirm.
      const reason = err instanceof Error ? err.message : String(err);
      this.logger.warn(
        `次元数の取得に失敗しました。デフォルト次元数を使用します: ${defaultDimensions} (${reason})`,
      );
    }

    return defaultDimensions;
  }
  /**
   * Generates a document title with the LLM and stores it on the record.
   *
   * The first 2500 characters of the stored content are sent to the chat service with a
   * prompt in the owner's configured language (default `'ja'`). The answer is cleaned up
   * (whitespace collapsed, one leading/trailing ASCII quote removed, capped at 100
   * characters), saved to the record and propagated to the file's Elasticsearch chunks.
   *
   * @param kbId Id of the knowledge-base file.
   * @returns The existing title if one is already set, the newly generated title, or `null`
   *          when the file is missing or empty, the LLM call fails, the answer is empty
   *          after cleanup, or any other error occurs (errors are logged, never thrown).
   */
  async generateTitle(kbId: string): Promise<string | null> {
    this.logger.log(`Generating automatic title for file ${kbId}`);

    try {
      const kb = await this.kbRepository.findOne({ where: { id: kbId } });
      if (!kb || !kb.content || kb.content.trim().length === 0) {
        return null;
      }
      const tenantId = kb.tenantId;

      // Skip generation when a title already exists.
      if (kb.title) {
        return kb.title;
      }

      // Use only the beginning of the content as the sample (at most 2500 characters).
      const contentSample = kb.content.substring(0, 2500);

      // Language comes from the owner's settings, falling back to Japanese.
      const settings = await this.userSettingService.findOrCreate(kb.userId);
      const language = settings.language || 'ja';

      const prompt = this.i18nService.getDocumentTitlePrompt(language, contentSample);

      // Ask the LLM for a title; a failure here skips title generation without failing the caller.
      let generatedTitle: string | undefined;
      try {
        generatedTitle = await this.chatService.generateSimpleChat(
          [{ role: 'user', content: prompt }],
          kb.userId,
          tenantId
        );
      } catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        this.logger.warn(`Failed to generate title for document ${kbId} due to LLM configuration issue: ${reason}`);
        return null; // Skip title generation if LLM is not configured for this tenant
      }

      // Clean up the answer: collapse newlines and whitespace runs into single spaces, strip
      // one wrapping ASCII quote on each side, then cap the length. Trim after each step so
      // whitespace exposed by the previous step does not survive.
      const cleanedTitle = (generatedTitle ?? '')
        .replace(/\s+/g, ' ')
        .trim()
        .replace(/^["']|["']$/g, '')
        .trim()
        .substring(0, 100)
        .trim();

      // An answer such as `""` is empty after cleanup; do not persist an empty title.
      if (cleanedTitle.length === 0) {
        this.logger.warn(`Generated title for ${kbId} was empty after cleanup; skipping`);
        return null;
      }

      await this.kbRepository.update(kbId, { title: cleanedTitle });

      // Propagate the title to the Elasticsearch chunks; a failure is logged but not fatal.
      try {
        await this.elasticsearchService.updateTitleByFileId(kbId, cleanedTitle, tenantId);
      } catch (err) {
        this.logger.error(`Failed to update title in Elasticsearch for ${kbId}`, err);
      }

      this.logger.log(`Successfully generated title for ${kbId}: ${cleanedTitle}`);
      return cleanedTitle;
    } catch (error) {
      this.logger.error(`Failed to generate title for ${kbId}`, error);
    }

    return null;
  }
  1262. }