knowledge-base.service.ts 54 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467
  1. import { Injectable, Logger, NotFoundException, Inject, forwardRef } from '@nestjs/common';
  2. import { ConfigService } from '@nestjs/config';
  3. import { I18nService } from '../i18n/i18n.service';
  4. import { InjectRepository } from '@nestjs/typeorm';
  5. import { Repository, In } from 'typeorm';
  6. import { FileStatus, KnowledgeBase, ProcessingMode } from './knowledge-base.entity';
  7. import { KnowledgeGroup } from '../knowledge-group/knowledge-group.entity';
  8. import { ElasticsearchService } from '../elasticsearch/elasticsearch.service';
  9. import { TikaService } from '../tika/tika.service';
  10. import * as fs from 'fs';
  11. import * as path from 'path';
  12. import { EmbeddingService } from './embedding.service';
  13. import { TextChunkerService } from './text-chunker.service';
  14. import { ModelConfigService } from '../model-config/model-config.service';
  15. import { RagService } from '../rag/rag.service';
  16. import { VisionService } from '../vision/vision.service';
  17. import { UserSettingService } from '../user-setting/user-setting.service';
  18. import { MemoryMonitorService } from './memory-monitor.service';
  19. import { ChunkConfigService } from './chunk-config.service';
  20. import { VisionPipelineService } from '../vision-pipeline/vision-pipeline.service';
  21. import { LibreOfficeService } from '../libreoffice/libreoffice.service';
  22. import { Pdf2ImageService } from '../pdf2image/pdf2image.service';
  23. import { DOC_EXTENSIONS, IMAGE_EXTENSIONS } from '../common/file-support.constants';
  24. import { ChatService } from '../chat/chat.service';
  25. @Injectable()
  26. export class KnowledgeBaseService {
  27. private readonly logger = new Logger(KnowledgeBaseService.name);
  28. constructor(
  29. @InjectRepository(KnowledgeBase)
  30. private kbRepository: Repository<KnowledgeBase>,
  31. @InjectRepository(KnowledgeGroup)
  32. private groupRepository: Repository<KnowledgeGroup>,
  33. @Inject(forwardRef(() => ElasticsearchService))
  34. private elasticsearchService: ElasticsearchService,
  35. private tikaService: TikaService,
  36. private embeddingService: EmbeddingService,
  37. private textChunkerService: TextChunkerService,
  38. private modelConfigService: ModelConfigService,
  39. @Inject(forwardRef(() => RagService))
  40. private ragService: RagService,
  41. private visionService: VisionService,
  42. private userSettingService: UserSettingService,
  43. private memoryMonitor: MemoryMonitorService,
  44. private chunkConfigService: ChunkConfigService,
  45. private visionPipelineService: VisionPipelineService,
  46. private libreOfficeService: LibreOfficeService,
  47. private pdf2ImageService: Pdf2ImageService,
  48. private configService: ConfigService,
  49. private i18nService: I18nService,
  50. @Inject(forwardRef(() => ChatService))
  51. private chatService: ChatService,
  52. ) { }
  53. async createAndIndex(
  54. fileInfo: any,
  55. userId: string,
  56. tenantId: string,
  57. config?: any,
  58. ): Promise<KnowledgeBase> {
  59. const mode = config?.mode || 'fast';
  60. const processingMode = mode === 'precise' ? ProcessingMode.PRECISE : ProcessingMode.FAST;
  61. const kb = this.kbRepository.create({
  62. originalName: fileInfo.originalname,
  63. storagePath: fileInfo.path,
  64. size: fileInfo.size,
  65. mimetype: fileInfo.mimetype,
  66. status: FileStatus.PENDING,
  67. userId: userId,
  68. tenantId: tenantId,
  69. chunkSize: config?.chunkSize || 200,
  70. chunkOverlap: config?.chunkOverlap || 40,
  71. embeddingModelId: config?.embeddingModelId || null,
  72. processingMode: processingMode,
  73. });
  74. // 分類(グループ)の関連付け
  75. if (config?.groupIds && config.groupIds.length > 0) {
  76. const groups = await this.groupRepository.find({
  77. where: { id: In(config.groupIds), tenantId: tenantId }
  78. });
  79. kb.groups = groups;
  80. }
  81. const savedKb = await this.kbRepository.save(kb);
  82. this.logger.log(
  83. `Created KB record: ${savedKb.id}, mode: ${mode}, file: ${fileInfo.originalname}`
  84. );
  85. // ---------------------------------------------------------
  86. // Move the file to the final partitioned directory
  87. // source: uploads/{tenantId}/{filename} (or wherever it was)
  88. // target: uploads/{tenantId}/{savedKb.id}/{filename}
  89. // ---------------------------------------------------------
  90. const fs = await import('fs');
  91. const path = await import('path');
  92. const uploadPath = process.env.UPLOAD_FILE_PATH || './uploads';
  93. const targetDir = path.join(uploadPath, tenantId || 'default', savedKb.id);
  94. const targetPath = path.join(targetDir, fileInfo.filename);
  95. try {
  96. if (!fs.existsSync(targetDir)) {
  97. fs.mkdirSync(targetDir, { recursive: true });
  98. }
  99. if (fs.existsSync(fileInfo.path)) {
  100. fs.renameSync(fileInfo.path, targetPath);
  101. // Update the DB record with the new path
  102. savedKb.storagePath = targetPath;
  103. await this.kbRepository.save(savedKb);
  104. this.logger.log(`Moved file to partitioned storage: ${targetPath}`);
  105. }
  106. } catch (fsError) {
  107. this.logger.error(`Failed to move file ${savedKb.id} to partitioned storage`, fsError);
  108. // We will let it continue, but the file might be stuck in the temp/root folder
  109. }
  110. // If queue processing is requested, await completion
  111. if (config?.waitForCompletion) {
  112. await this.processFile(savedKb.id, userId, tenantId, config);
  113. } else {
  114. // Otherwise trigger asynchronously (default)
  115. this.processFile(savedKb.id, userId, tenantId, config).catch((err) => {
  116. this.logger.error(`Error processing file ${savedKb.id}`, err);
  117. });
  118. }
  119. return savedKb;
  120. }
  121. async findAll(userId: string, tenantId?: string): Promise<KnowledgeBase[]> {
  122. const where: any = {};
  123. if (tenantId) {
  124. where.tenantId = tenantId;
  125. } else {
  126. where.userId = userId;
  127. }
  128. return this.kbRepository.find({
  129. where,
  130. relations: ['groups'], // グループリレーションをロード
  131. order: { createdAt: 'DESC' },
  132. });
  133. }
  134. async searchKnowledge(userId: string, tenantId: string, query: string, topK: number = 5) {
  135. try {
  136. // 環境変数のデフォルト次元数を使用してシミュレーションベクトルを生成
  137. const defaultDimensions = parseInt(
  138. process.env.DEFAULT_VECTOR_DIMENSIONS || '2560',
  139. );
  140. const mockEmbedding = Array.from(
  141. { length: defaultDimensions },
  142. () => Math.random() - 0.5,
  143. );
  144. const queryVector = mockEmbedding;
  145. // 2. Search in Elasticsearch
  146. const searchResults = await this.elasticsearchService.searchSimilar(
  147. queryVector,
  148. userId,
  149. topK,
  150. tenantId, // Ensure shared visibility within tenant
  151. );
  152. // 3. Get file information from database
  153. const fileIds = [...new Set(searchResults.map((r) => r.fileId))];
  154. const files = await this.kbRepository.findByIds(fileIds);
  155. const fileMap = new Map(files.map((f) => [f.id, f]));
  156. // 4. Combine results with file info
  157. const results = searchResults.map((result) => {
  158. const file = fileMap.get(result.fileId);
  159. return {
  160. ...result,
  161. file: file
  162. ? {
  163. id: file.id,
  164. name: file.originalName,
  165. mimetype: file.mimetype,
  166. size: file.size,
  167. createdAt: file.createdAt,
  168. }
  169. : null,
  170. };
  171. });
  172. return {
  173. query,
  174. results,
  175. total: results.length,
  176. };
  177. } catch (error) {
  178. this.logger.error(
  179. `Metadata search failed for tenant ${tenantId}:`,
  180. error.stack || error.message,
  181. );
  182. throw error;
  183. }
  184. }
  185. async ragSearch(userId: string, tenantId: string, query: string, settings: any) {
  186. this.logger.log(
  187. `RAG search request: userId=${userId}, query="${query}", settings=${JSON.stringify(settings)}`,
  188. );
  189. try {
  190. const ragResults = await this.ragService.searchKnowledge(
  191. query,
  192. userId,
  193. settings.topK,
  194. settings.similarityThreshold,
  195. settings.selectedEmbeddingId,
  196. settings.enableFullTextSearch,
  197. settings.enableRerank,
  198. settings.selectedRerankId,
  199. undefined,
  200. undefined,
  201. settings.rerankSimilarityThreshold,
  202. tenantId, // Ensure shared visibility within tenant for RAG
  203. );
  204. const sources = this.ragService.extractSources(ragResults);
  205. const ragPrompt = this.ragService.buildRagPrompt(
  206. query,
  207. ragResults,
  208. settings.language || 'ja',
  209. );
  210. const result = {
  211. searchResults: ragResults,
  212. sources,
  213. ragPrompt,
  214. hasRelevantContent: ragResults.length > 0,
  215. };
  216. this.logger.log(
  217. `RAG search completed: found ${ragResults.length} results`,
  218. );
  219. return result;
  220. } catch (error) {
  221. this.logger.error(
  222. `RAG search failed for user ${userId}:`,
  223. error.stack || error.message,
  224. );
  225. // エラーをスローするのではなく空の結果を返し、システムの稼働を継続させる
  226. return {
  227. searchResults: [],
  228. sources: [],
  229. ragPrompt: query, // オリジナルのクエリを使用
  230. hasRelevantContent: false,
  231. };
  232. }
  233. }
  234. async deleteFile(fileId: string, userId: string, tenantId: string): Promise<void> {
  235. this.logger.log(`Deleting file ${fileId} for user ${userId}`);
  236. try {
  237. // 1. Get file info
  238. const file = await this.kbRepository.findOne({
  239. where: { id: fileId, tenantId }, // Filter by tenantId
  240. });
  241. if (!file) {
  242. throw new NotFoundException(this.i18nService.getMessage('fileNotFound'));
  243. }
  244. // 2. Delete file from filesystem
  245. const fs = await import('fs');
  246. try {
  247. if (fs.existsSync(file.storagePath)) {
  248. fs.unlinkSync(file.storagePath);
  249. this.logger.log(`Deleted file: ${file.storagePath}`);
  250. }
  251. } catch (error) {
  252. this.logger.warn(`Failed to delete file ${file.storagePath}:`, error);
  253. }
  254. // 3. Delete from Elasticsearch
  255. try {
  256. await this.elasticsearchService.deleteByFileId(fileId, userId, tenantId);
  257. this.logger.log(`Deleted ES documents for file ${fileId}`);
  258. } catch (error) {
  259. this.logger.warn(
  260. `Failed to delete ES documents for file ${fileId}:`,
  261. error,
  262. );
  263. }
  264. // 4. Remove from all groups (cleanup M2M relations)
  265. const fileWithGroups = await this.kbRepository.findOne({
  266. where: { id: fileId, tenantId },
  267. relations: ['groups'],
  268. });
  269. if (fileWithGroups && fileWithGroups.groups && fileWithGroups.groups.length > 0) {
  270. // Clear groups to remove entries from join table
  271. fileWithGroups.groups = [];
  272. await this.kbRepository.save(fileWithGroups);
  273. this.logger.log(`Cleared group associations for file ${fileId}`);
  274. }
  275. // 5. Delete from SQLite
  276. await this.kbRepository.delete({ id: fileId });
  277. this.logger.log(`Deleted database record for file ${fileId}`);
  278. } catch (error) {
  279. this.logger.error(`Failed to delete file ${fileId}`, error);
  280. throw error;
  281. }
  282. }
  283. async clearAll(userId: string, tenantId: string): Promise<void> {
  284. this.logger.log(`Clearing all knowledge base data for user ${userId} in tenant ${tenantId}`);
  285. try {
  286. // Get all files and delete them one by one
  287. const files = await this.kbRepository.find();
  288. for (const file of files) {
  289. await this.deleteFile(file.id, userId, tenantId);
  290. }
  291. this.logger.log(`Cleared all knowledge base data for user ${userId}`);
  292. } catch (error) {
  293. this.logger.error(
  294. `Failed to clear knowledge base for user ${userId}`,
  295. error,
  296. );
  297. throw error;
  298. }
  299. }
  300. private async processFile(kbId: string, userId: string, tenantId: string, config?: any) {
  301. this.logger.log(`Starting processing for file ${kbId}, mode: ${config?.mode || 'fast'}`);
  302. await this.updateStatus(kbId, FileStatus.INDEXING);
  303. try {
  304. const kb = await this.kbRepository.findOne({ where: { id: kbId } });
  305. if (!kb) {
  306. this.logger.error(`KB not found: ${kbId}`);
  307. return;
  308. }
  309. // メモリ監視 - 処理前チェック
  310. const memBefore = this.memoryMonitor.getMemoryUsage();
  311. this.logger.log(`メモリ状態 - 処理前: ${memBefore.heapUsed}/${memBefore.heapTotal}MB`);
  312. // モードに基づいて処理フローを選択
  313. const mode = config?.mode || 'fast';
  314. if (mode === 'precise') {
  315. // 精密モード - Vision Pipeline を使用
  316. await this.processPreciseMode(kb, userId, tenantId, config);
  317. } else {
  318. // 高速モード - Tika を使用
  319. await this.processFastMode(kb, userId, tenantId, config);
  320. }
  321. this.logger.log(`File ${kbId} processed successfully in ${mode} mode.`);
  322. } catch (error) {
  323. this.logger.error(`Failed to process file ${kbId}`, error);
  324. await this.updateStatus(kbId, FileStatus.FAILED);
  325. }
  326. }
  327. /**
  328. * 高速モード処理(既存フロー)
  329. */
  330. private async processFastMode(kb: KnowledgeBase, userId: string, tenantId: string, config?: any) {
  331. // 1. Tika を使用してテキストを抽出
  332. let text = await this.tikaService.extractText(kb.storagePath);
  333. // 画像ファイルの場合はビジョンモデルを使用
  334. if (this.visionService.isImageFile(kb.mimetype)) {
  335. const visionModelId = await this.userSettingService.getVisionModelId(userId);
  336. if (visionModelId) {
  337. const visionModel = await this.modelConfigService.findOne(
  338. visionModelId,
  339. userId,
  340. tenantId,
  341. );
  342. if (visionModel && visionModel.type === 'vision' && visionModel.isEnabled !== false) {
  343. text = await this.visionService.extractImageContent(kb.storagePath, {
  344. baseUrl: visionModel.baseUrl || '',
  345. apiKey: visionModel.apiKey || '',
  346. modelId: visionModel.modelId,
  347. });
  348. }
  349. }
  350. }
  351. if (!text || text.trim().length === 0) {
  352. this.logger.warn(this.i18nService.getMessage('noTextExtracted'));
  353. }
  354. // テキストサイズを確認
  355. const textSizeMB = Math.round(text.length / 1024 / 1024);
  356. if (textSizeMB > 50) {
  357. this.logger.warn(this.i18nService.formatMessage('extractedTextTooLarge', { size: textSizeMB }));
  358. }
  359. // テキストをデータベースに保存
  360. await this.kbRepository.update(kb.id, { content: text });
  361. await this.updateStatus(kb.id, FileStatus.EXTRACTED);
  362. // 非同期ベクトル化
  363. await this.vectorizeToElasticsearch(kb.id, userId, tenantId, text, config).catch((err) => {
  364. this.logger.error(`Error vectorizing file ${kb.id}`, err);
  365. });
  366. // 自動タイトル生成 (非同期的に実行)
  367. this.generateTitle(kb.id).catch((err) => {
  368. this.logger.error(`Error generating title for file ${kb.id}`, err);
  369. });
  370. // 非同期的に PDF 変換をトリガー(ドキュメントファイルの場合)
  371. this.ensurePDFExists(kb.id, userId, tenantId).catch((err) => {
  372. this.logger.warn(this.i18nService.formatMessage('pdfConversionFailedDetail', { id: kb.id }), err);
  373. });
  374. }
  375. /**
  376. * 精密モード処理(新規フロー)
  377. */
  378. private async processPreciseMode(kb: KnowledgeBase, userId: string, tenantId: string, config?: any) {
  379. // 精密モードがサポートされているか確認
  380. const preciseFormats = ['.pdf', '.doc', '.docx', '.ppt', '.pptx'];
  381. const ext = kb.originalName.toLowerCase().substring(kb.originalName.lastIndexOf('.'));
  382. if (!preciseFormats.includes(ext)) {
  383. this.logger.warn(
  384. this.i18nService.formatMessage('preciseModeUnsupported', { ext })
  385. );
  386. return this.processFastMode(kb, userId, tenantId, config);
  387. }
  388. // Vision モデルが設定されているか確認
  389. const visionModelId = await this.userSettingService.getVisionModelId(userId);
  390. if (!visionModelId) {
  391. this.logger.warn(
  392. this.i18nService.getMessage('visionModelNotConfiguredFallback')
  393. );
  394. return this.processFastMode(kb, userId, tenantId, config);
  395. }
  396. const visionModel = await this.modelConfigService.findOne(
  397. visionModelId,
  398. userId,
  399. tenantId,
  400. );
  401. if (!visionModel || visionModel.type !== 'vision' || visionModel.isEnabled === false) {
  402. this.logger.warn(
  403. this.i18nService.getMessage('visionModelInvalidFallback')
  404. );
  405. return this.processFastMode(kb, userId, tenantId, config);
  406. }
  407. // Vision Pipeline を呼び出し
  408. try {
  409. const result = await this.visionPipelineService.processPreciseMode(
  410. kb.storagePath,
  411. {
  412. userId,
  413. tenantId, // New
  414. modelId: visionModelId,
  415. fileId: kb.id,
  416. fileName: kb.originalName,
  417. skipQualityCheck: false,
  418. }
  419. );
  420. if (!result.success) {
  421. this.logger.error(`Vision pipeline failed, falling back to fast mode`);
  422. this.logger.warn(this.i18nService.getMessage('visionPipelineFailed'));
  423. return this.processFastMode(kb, userId, tenantId, config);
  424. }
  425. // テキスト内容をデータベースに保存
  426. const combinedText = result.results.map(r => r.text).join('\n\n');
  427. const metadata = {
  428. processedPages: result.processedPages,
  429. failedPages: result.failedPages,
  430. cost: result.cost,
  431. duration: result.duration,
  432. results: result.results.map(r => ({
  433. pageIndex: r.pageIndex,
  434. confidence: r.confidence,
  435. layout: r.layout,
  436. imageCount: r.images.length,
  437. })),
  438. };
  439. await this.kbRepository.update(kb.id, {
  440. content: combinedText,
  441. metadata: metadata as any,
  442. });
  443. await this.updateStatus(kb.id, FileStatus.EXTRACTED);
  444. this.logger.log(
  445. this.i18nService.formatMessage('preciseModeComplete', { pages: result.processedPages, cost: result.cost.toFixed(2) })
  446. );
  447. // 非同期でベクトル化し、Elasticsearch にインデックス
  448. // 各ページを独立したドキュメントとして作成し、メタデータを保持
  449. this.indexPreciseResults(kb, userId, tenantId, kb.embeddingModelId, result.results).catch((err) => {
  450. this.logger.error(`Error indexing precise results for ${kb.id}`, err);
  451. });
  452. // 非同期で PDF 変換をトリガー
  453. this.ensurePDFExists(kb.id, userId, tenantId).catch((err) => {
  454. this.logger.warn(`Initial PDF conversion failed for ${kb.id}`, err);
  455. });
  456. // 自動タイトル生成 (非同期的に実行)
  457. this.generateTitle(kb.id).catch((err) => {
  458. this.logger.error(`Error generating title for file ${kb.id}`, err);
  459. });
  460. } catch (error) {
  461. this.logger.error(`Vision pipeline error: ${error.message}, falling back to fast mode`);
  462. return this.processFastMode(kb, userId, tenantId, config);
  463. }
  464. }
  465. /**
  466. * 精密モードの結果をインデックス
  467. */
  468. private async indexPreciseResults(
  469. kb: KnowledgeBase,
  470. userId: string,
  471. tenantId: string,
  472. embeddingModelId: string,
  473. results: any[]
  474. ): Promise<void> {
  475. this.logger.log(`Indexing ${results.length} precise results for ${kb.id}`);
  476. // インデックスの存在を確認 - 実際のモデル次元数を取得
  477. const actualDimensions = await this.getActualModelDimensions(embeddingModelId, userId, tenantId);
  478. await this.elasticsearchService.createIndexIfNotExists(actualDimensions);
  479. // ベクトル化とインデックスをバッチ処理
  480. const batchSize = parseInt(process.env.CHUNK_BATCH_SIZE || '50');
  481. for (let i = 0; i < results.length; i += batchSize) {
  482. const batch = results.slice(i, i + batchSize);
  483. const texts = batch.map(r => r.text);
  484. try {
  485. // ベクトルを生成
  486. const embeddings = await this.embeddingService.getEmbeddings(
  487. texts,
  488. userId,
  489. embeddingModelId
  490. );
  491. // 各結果をインデックス
  492. for (let j = 0; j < batch.length; j++) {
  493. const result = batch[j];
  494. const embedding = embeddings[j];
  495. if (!embedding || embedding.length === 0) {
  496. this.logger.warn(this.i18nService.formatMessage('skippingEmptyVectorPage', { page: result.pageIndex }));
  497. continue;
  498. }
  499. await this.elasticsearchService.indexDocument(
  500. `${kb.id}_page_${result.pageIndex}`,
  501. result.text,
  502. embedding,
  503. {
  504. fileId: kb.id,
  505. originalName: kb.originalName,
  506. mimetype: kb.mimetype,
  507. userId: userId,
  508. tenantId: tenantId, // New
  509. pageNumber: result.pageIndex,
  510. images: result.images,
  511. layout: result.layout,
  512. confidence: result.confidence,
  513. source: 'precise',
  514. mode: 'vision',
  515. }
  516. );
  517. }
  518. this.logger.log(`バッチ ${Math.floor(i / batchSize) + 1} 完了: ${batch.length} ページ`);
  519. } catch (error) {
  520. this.logger.error(`バッチ ${Math.floor(i / batchSize) + 1} の処理に失敗しました`, error);
  521. }
  522. }
  523. await this.updateStatus(kb.id, FileStatus.VECTORIZED);
  524. this.logger.log(`精密モードのインデックス完了: ${results.length} ページ`);
  525. }
  526. /**
  527. * PDF の特定ページの画像を取得
  528. */
  529. async getPageAsImage(fileId: string, pageIndex: number, userId: string, tenantId: string): Promise<string> {
  530. const pdfPath = await this.ensurePDFExists(fileId, userId, tenantId);
  531. // 特定のページを変換
  532. const result = await this.pdf2ImageService.convertToImages(pdfPath, {
  533. density: 150,
  534. quality: 75,
  535. format: 'jpeg',
  536. });
  537. // 対応するページ番号の画像を見つける
  538. const pageImage = result.images.find(img => img.pageIndex === pageIndex + 1);
  539. if (!pageImage) {
  540. throw new NotFoundException(this.i18nService.formatMessage('pageImageNotFoundDetail', { page: pageIndex + 1 }));
  541. }
  542. return pageImage.path;
  543. }
  544. private async vectorizeToElasticsearch(
  545. kbId: string,
  546. userId: string,
  547. tenantId: string,
  548. text: string,
  549. config?: any,
  550. ) {
  551. try {
  552. const kb = await this.kbRepository.findOne({ where: { id: kbId, tenantId } });
  553. if (!kb) return;
  554. // メモリ監視 - ベクトル化前チェック
  555. const memBeforeChunk = this.memoryMonitor.getMemoryUsage();
  556. this.logger.log(
  557. `ベクトル化前メモリ: ${memBeforeChunk.heapUsed}/${memBeforeChunk.heapTotal}MB`,
  558. );
  559. this.logger.debug(`File ${kbId}: Validating chunk config...`);
  560. // 1. チャンク設定の検証と修正(モデルの制限と環境変数に基づく)
  561. const validatedConfig = await this.chunkConfigService.validateChunkConfig(
  562. kb.chunkSize,
  563. kb.chunkOverlap,
  564. kb.embeddingModelId,
  565. userId,
  566. );
  567. this.logger.debug(`File ${kbId}: Chunk config validated.`);
  568. // 設定が修正された場合、警告を記録しデータベースを更新
  569. if (validatedConfig.warnings.length > 0) {
  570. this.logger.warn(
  571. this.i18nService.formatMessage('chunkConfigCorrection', { warnings: validatedConfig.warnings.join(', ') })
  572. );
  573. // データベース内の設定を更新
  574. if (validatedConfig.chunkSize !== kb.chunkSize ||
  575. validatedConfig.chunkOverlap !== kb.chunkOverlap) {
  576. await this.kbRepository.update(kbId, {
  577. chunkSize: validatedConfig.chunkSize,
  578. chunkOverlap: validatedConfig.chunkOverlap,
  579. });
  580. }
  581. }
  582. // 設定サマリーを表示(実際に適用される上限を含む)
  583. this.logger.debug(`File ${kbId}: Getting config summary...`);
  584. const configSummary = await this.chunkConfigService.getConfigSummary(
  585. validatedConfig.chunkSize,
  586. validatedConfig.chunkOverlap,
  587. kb.embeddingModelId,
  588. userId,
  589. );
  590. this.logger.log(`チャンク設定: ${configSummary}`);
  591. this.logger.log(`設定上限: チャンク=${validatedConfig.effectiveMaxChunkSize}, 重複=${validatedConfig.effectiveMaxOverlapSize}`);
  592. // 2. 検証済みの設定を使用してチャンク分割
  593. const chunks = this.textChunkerService.chunkText(
  594. text,
  595. validatedConfig.chunkSize,
  596. validatedConfig.chunkOverlap,
  597. );
  598. this.logger.log(`ファイル ${kbId} から ${chunks.length} 個のテキストブロックを分割しました`);
  599. if (chunks.length === 0) {
  600. this.logger.warn(this.i18nService.formatMessage('noChunksGenerated', { id: kbId }));
  601. await this.updateStatus(kbId, FileStatus.VECTORIZED);
  602. return;
  603. }
  604. // 3. チャンク数が妥当か確認
  605. const estimatedChunkCount = this.chunkConfigService.estimateChunkCount(
  606. text.length,
  607. validatedConfig.chunkSize,
  608. );
  609. if (chunks.length > estimatedChunkCount * 1.2) {
  610. this.logger.warn(
  611. this.i18nService.formatMessage('chunkCountAnomaly', { actual: chunks.length, estimated: estimatedChunkCount })
  612. );
  613. }
  614. // 4. 推奨バッチサイズを取得(モデルの制限に基づく)
  615. const recommendedBatchSize = await this.chunkConfigService.getRecommendedBatchSize(
  616. kb.embeddingModelId,
  617. userId,
  618. tenantId,
  619. parseInt(process.env.CHUNK_BATCH_SIZE || '100'),
  620. );
  621. // 5. メモリ使用量を推定
  622. const avgChunkSize = chunks.reduce((sum, c) => sum + c.content.length, 0) / chunks.length;
  623. const estimatedMemory = this.memoryMonitor.estimateMemoryUsage(
  624. chunks.length,
  625. avgChunkSize,
  626. parseInt(process.env.DEFAULT_VECTOR_DIMENSIONS || '2560'),
  627. );
  628. this.logger.log(`推定メモリ使用量: ${estimatedMemory}MB (バッチサイズ: ${recommendedBatchSize})`);
  629. // 6. 実際のモデル次元数を取得し、インデックスの存在を確認
  630. const actualDimensions = await this.getActualModelDimensions(kb.embeddingModelId, userId, tenantId);
  631. await this.elasticsearchService.createIndexIfNotExists(actualDimensions);
  632. // 7. ベクトル化とインデックス作成をバッチ処理
  633. const useBatching = this.memoryMonitor.shouldUseBatching(
  634. chunks.length,
  635. avgChunkSize,
  636. actualDimensions,
  637. );
  638. if (useBatching) {
  639. try {
  640. await this.processInBatches(
  641. chunks,
  642. async (batch, batchIndex) => {
  643. // バッチサイズがモデルの制限を超えていないか検証
  644. if (batch.length > recommendedBatchSize) {
  645. this.logger.warn(
  646. this.i18nService.formatMessage('batchSizeExceeded', { index: batchIndex, actual: batch.length, limit: recommendedBatchSize })
  647. );
  648. }
  649. const chunkTexts = batch.map((chunk) => chunk.content);
  650. const embeddings = await this.embeddingService.getEmbeddings(
  651. chunkTexts,
  652. userId,
  653. kb.embeddingModelId,
  654. );
  655. // 次元の整合性を検証
  656. if (embeddings.length > 0 && embeddings[0].length !== actualDimensions) {
  657. this.logger.warn(
  658. `ベクトル次元が不一致です: 期待値 ${actualDimensions}, 実際 ${embeddings[0].length}`
  659. );
  660. }
  661. // このバッチデータを即座にインデックス
  662. for (let i = 0; i < batch.length; i++) {
  663. const chunk = batch[i];
  664. const embedding = embeddings[i];
  665. if (!embedding || embedding.length === 0) {
  666. this.logger.warn(this.i18nService.formatMessage('skippingEmptyVectorChunk', { index: chunk.index }));
  667. continue;
  668. }
  669. await this.elasticsearchService.indexDocument(
  670. `${kb.id}_chunk_${chunk.index}`,
  671. chunk.content,
  672. embedding,
  673. {
  674. fileId: kb.id,
  675. originalName: kb.originalName,
  676. mimetype: kb.mimetype,
  677. userId: userId,
  678. chunkIndex: chunk.index,
  679. startPosition: chunk.startPosition,
  680. tenantId, // Passing tenantId to ES
  681. }
  682. );
  683. }
  684. this.logger.log(`バッチ ${batchIndex} 完了: ${batch.length} チャンク`);
  685. },
  686. {
  687. batchSize: recommendedBatchSize,
  688. onBatchComplete: (batchIndex, totalBatches) => {
  689. const mem = this.memoryMonitor.getMemoryUsage();
  690. this.logger.log(
  691. `バッチ ${batchIndex}/${totalBatches} 完了, メモリ: ${mem.heapUsed}MB`,
  692. );
  693. },
  694. },
  695. );
  696. } catch (error) {
  697. // コンテキスト長エラーを検出(日本語・中国語・英語に対応)
  698. if (error.message && (error.message.includes('context length') || error.message.includes('コンテキスト長が上限を超えています') || error.message.includes('コンテキスト長が上限を超えています'))) {
  699. this.logger.warn(this.i18nService.getMessage('contextLengthErrorFallback'));
  700. // 単一テキスト処理にダウングレード
  701. for (let i = 0; i < chunks.length; i++) {
  702. const chunk = chunks[i];
  703. try {
  704. const embeddings = await this.embeddingService.getEmbeddings(
  705. [chunk.content], // 単一テキスト
  706. userId,
  707. kb.embeddingModelId,
  708. );
  709. if (!embeddings[0] || embeddings[0].length === 0) {
  710. this.logger.warn(this.i18nService.formatMessage('skippingEmptyVectorChunk', { index: chunk.index }));
  711. continue;
  712. }
  713. await this.elasticsearchService.indexDocument(
  714. `${kb.id}_chunk_${chunk.index}`,
  715. chunk.content,
  716. embeddings[0],
  717. {
  718. fileId: kb.id,
  719. originalName: kb.originalName,
  720. mimetype: kb.mimetype,
  721. userId: userId,
  722. chunkIndex: chunk.index,
  723. startPosition: chunk.startPosition,
  724. endPosition: chunk.endPosition,
  725. tenantId,
  726. }
  727. );
  728. if ((i + 1) % 10 === 0) {
  729. this.logger.log(`単一処理進捗: ${i + 1}/${chunks.length}`);
  730. }
  731. } catch (chunkError) {
  732. this.logger.error(
  733. `テキストブロック ${chunk.index} の処理に失敗しました。スキップします: ${chunkError.message}`
  734. );
  735. continue;
  736. }
  737. }
  738. this.logger.log(`単一テキスト処理完了: ${chunks.length} チャンク`);
  739. } else {
  740. // その他のエラーは直接スロー
  741. throw error;
  742. }
  743. }
  744. } else {
  745. // 小さなファイル、一括処理(ただしバッチ制限の確認が必要)
  746. const chunkTexts = chunks.map((chunk) => chunk.content);
  747. // チャンク数がモデルのバッチ制限を超える場合は、強制的にバッチ処理
  748. if (chunks.length > recommendedBatchSize) {
  749. this.logger.warn(
  750. this.i18nService.formatMessage('chunkLimitExceededForceBatch', { actual: chunks.length, limit: recommendedBatchSize })
  751. );
  752. try {
  753. await this.processInBatches(
  754. chunks,
  755. async (batch, batchIndex) => {
  756. const batchTexts = batch.map((c) => c.content);
  757. const embeddings = await this.embeddingService.getEmbeddings(
  758. batchTexts,
  759. userId,
  760. kb.embeddingModelId,
  761. );
  762. for (let i = 0; i < batch.length; i++) {
  763. const chunk = batch[i];
  764. const embedding = embeddings[i];
  765. if (!embedding || embedding.length === 0) {
  766. this.logger.warn(`空ベクトルのテキストブロック ${chunk.index} をスキップします`);
  767. continue;
  768. }
  769. await this.elasticsearchService.indexDocument(
  770. `${kb.id}_chunk_${chunk.index}`,
  771. chunk.content,
  772. embedding,
  773. {
  774. fileId: kb.id,
  775. originalName: kb.originalName,
  776. mimetype: kb.mimetype,
  777. userId: userId,
  778. chunkIndex: chunk.index,
  779. startPosition: chunk.startPosition,
  780. endPosition: chunk.endPosition,
  781. tenantId, // Passing tenantId to ES metadata
  782. }
  783. );
  784. }
  785. },
  786. );
  787. } catch (error) {
  788. // コンテキスト長エラーを検出(日本語・中国語・英語に対応)
  789. if (error.message && (error.message.includes('context length') || error.message.includes('コンテキスト長が上限を超えています') || error.message.includes('コンテキスト長が上限を超えています'))) {
  790. this.logger.warn(this.i18nService.getMessage('batchContextLengthErrorFallback'));
  791. // 単一テキスト処理にダウングレード
  792. for (let i = 0; i < chunks.length; i++) {
  793. const chunk = chunks[i];
  794. try {
  795. const embeddings = await this.embeddingService.getEmbeddings(
  796. [chunk.content], // 単一テキスト
  797. userId,
  798. kb.embeddingModelId,
  799. );
  800. if (!embeddings[0] || embeddings[0].length === 0) {
  801. this.logger.warn(this.i18nService.formatMessage('skippingEmptyVectorChunk', { index: chunk.index }));
  802. continue;
  803. }
  804. await this.elasticsearchService.indexDocument(
  805. `${kb.id}_chunk_${chunk.index}`,
  806. chunk.content,
  807. embeddings[0],
  808. {
  809. fileId: kb.id,
  810. originalName: kb.originalName,
  811. mimetype: kb.mimetype,
  812. userId: userId,
  813. tenantId, // Added tenantId
  814. chunkIndex: chunk.index,
  815. startPosition: chunk.startPosition,
  816. endPosition: chunk.endPosition,
  817. },
  818. );
  819. if ((i + 1) % 10 === 0) {
  820. this.logger.log(`単一処理進捗: ${i + 1}/${chunks.length}`);
  821. }
  822. } catch (chunkError) {
  823. this.logger.error(
  824. this.i18nService.formatMessage('chunkProcessingFailed', { index: chunk.index, message: chunkError.message })
  825. );
  826. continue;
  827. }
  828. }
  829. this.logger.log(this.i18nService.formatMessage('singleTextProcessingComplete', { count: chunks.length }));
  830. } else {
  831. // その他のエラー、直接スロー
  832. throw error;
  833. }
  834. }
  835. } else {
  836. // 十分に小さいファイルの場合は一括で処理
  837. try {
  838. const embeddings = await this.embeddingService.getEmbeddings(
  839. chunkTexts,
  840. userId,
  841. kb.embeddingModelId,
  842. );
  843. for (let i = 0; i < chunks.length; i++) {
  844. const chunk = chunks[i];
  845. const embedding = embeddings[i];
  846. if (!embedding || embedding.length === 0) {
  847. this.logger.warn(this.i18nService.formatMessage('skippingEmptyVectorChunk', { index: chunk.index }));
  848. continue;
  849. }
  850. await this.elasticsearchService.indexDocument(
  851. `${kb.id}_chunk_${chunk.index}`,
  852. chunk.content,
  853. embedding,
  854. {
  855. fileId: kb.id,
  856. originalName: kb.originalName,
  857. mimetype: kb.mimetype,
  858. userId: userId,
  859. tenantId, // Added tenantId
  860. chunkIndex: chunk.index,
  861. startPosition: chunk.startPosition,
  862. endPosition: chunk.endPosition,
  863. },
  864. );
  865. }
  866. } catch (error) {
  867. // コンテキスト長エラーを検出(日本語・中国語・英語に対応)
  868. if (error.message && (error.message.includes('context length') || error.message.includes('コンテキスト長が上限を超えています') || error.message.includes('コンテキスト長が上限を超えています'))) {
  869. this.logger.warn(this.i18nService.getMessage('batchContextLengthErrorFallback'));
  870. // 単一テキスト処理にダウングレード
  871. for (let i = 0; i < chunks.length; i++) {
  872. const chunk = chunks[i];
  873. try {
  874. const embeddings = await this.embeddingService.getEmbeddings(
  875. [chunk.content], // 単一テキスト
  876. userId,
  877. kb.embeddingModelId,
  878. );
  879. if (!embeddings[0] || embeddings[0].length === 0) {
  880. this.logger.warn(`空ベクトルのテキストブロック ${chunk.index} をスキップします`);
  881. continue;
  882. }
  883. await this.elasticsearchService.indexDocument(
  884. `${kb.id}_chunk_${chunk.index}`,
  885. chunk.content,
  886. embeddings[0],
  887. {
  888. fileId: kb.id,
  889. originalName: kb.originalName,
  890. mimetype: kb.mimetype,
  891. userId: userId,
  892. tenantId, // Added tenantId
  893. chunkIndex: chunk.index,
  894. startPosition: chunk.startPosition,
  895. endPosition: chunk.endPosition,
  896. },
  897. );
  898. if ((i + 1) % 10 === 0) {
  899. this.logger.log(`単一処理進捗: ${i + 1}/${chunks.length}`);
  900. }
  901. } catch (chunkError) {
  902. this.logger.error(
  903. `テキストブロック ${chunk.index} の処理に失敗しました。スキップします: ${chunkError.message}`
  904. );
  905. continue;
  906. }
  907. }
  908. this.logger.log(this.i18nService.formatMessage('singleTextProcessingComplete', { count: chunks.length }));
  909. } else {
  910. // その他のエラー、直接スロー
  911. throw error;
  912. }
  913. }
  914. }
  915. }
  916. await this.updateStatus(kbId, FileStatus.VECTORIZED);
  917. const memAfter = this.memoryMonitor.getMemoryUsage();
  918. this.logger.log(
  919. this.i18nService.formatMessage('fileVectorizationComplete', { id: kbId, count: chunks.length, memory: memAfter.heapUsed })
  920. );
  921. } catch (error) {
  922. this.logger.error(this.i18nService.formatMessage('fileVectorizationFailed', { id: kbId }), error);
  923. // エラー情報を metadata に保存
  924. try {
  925. const kb = await this.kbRepository.findOne({ where: { id: kbId } });
  926. if (kb) {
  927. const metadata = kb.metadata || {};
  928. metadata.lastError = error.message;
  929. metadata.failedAt = new Date().toISOString();
  930. await this.kbRepository.update(kbId, { metadata });
  931. }
  932. } catch (e) {
  933. this.logger.warn(`Failed to update metadata for failed file ${kbId}`, e);
  934. }
  935. await this.updateStatus(kbId, FileStatus.FAILED);
  936. }
  937. }
  938. /**
  939. * バッチ処理、メモリ制御付き
  940. */
  941. private async processInBatches<T>(
  942. items: T[],
  943. processor: (batch: T[], batchIndex: number) => Promise<void>,
  944. options?: {
  945. batchSize?: number;
  946. onBatchComplete?: (batchIndex: number, totalBatches: number) => void;
  947. },
  948. ): Promise<void> {
  949. const totalItems = items.length;
  950. if (totalItems === 0) return;
  951. const startTime = Date.now();
  952. this.logger.log(this.i18nService.formatMessage('batchProcessingStarted', { count: totalItems }));
  953. // Use provided batch size or fallback to env/default
  954. const initialBatchSize = options?.batchSize || parseInt(process.env.CHUNK_BATCH_SIZE || '100');
  955. const totalBatches = Math.ceil(totalItems / initialBatchSize);
  956. for (let i = 0; i < totalItems;) {
  957. // メモリを確認し待機
  958. await this.memoryMonitor.waitForMemoryAvailable();
  959. // バッチサイズを動的に調整 (initialBatchSize から開始し、必要に応じてメモリモニターが削減できるようにします)
  960. // 注意: memoryMonitor.getDynamicBatchSize はメモリ状況に基づいてより大きな値を返す可能性がありますが、
  961. // モデルの制限 (initialBatchSize) を尊重する必要があります。
  962. const currentMem = this.memoryMonitor.getMemoryUsage().heapUsed;
  963. const dynamicBatchSize = this.memoryMonitor.getDynamicBatchSize(currentMem);
  964. // Ensure we don't exceed the model's limit (initialBatchSize) even if memory allows more
  965. const batchSize = Math.min(dynamicBatchSize, initialBatchSize);
  966. // 現在のバッチを取得
  967. const batch = items.slice(i, i + batchSize);
  968. const batchIndex = Math.floor(i / batchSize) + 1;
  969. this.logger.log(
  970. this.i18nService.formatMessage('batchProcessingProgress', { index: batchIndex, total: totalBatches, count: batch.length })
  971. );
  972. // バッチを処理
  973. await processor(batch, batchIndex);
  974. // コールバック通知
  975. if (options?.onBatchComplete) {
  976. options.onBatchComplete(batchIndex, totalBatches);
  977. }
  978. // 強制GC(メモリがしきい値に近い場合)
  979. if (currentMem > 800) {
  980. this.memoryMonitor.forceGC();
  981. }
  982. // 参照をクリアしGCを助ける
  983. batch.length = 0;
  984. i += batchSize;
  985. }
  986. const duration = ((Date.now() - startTime) / 1000).toFixed(2);
  987. this.logger.log(this.i18nService.formatMessage('batchProcessingComplete', { count: totalItems, duration }));
  988. }
  989. /**
  990. * 失敗したファイルのベクトル化を再試行
  991. */
  992. async retryFailedFile(fileId: string, userId: string, tenantId: string): Promise<KnowledgeBase> {
  993. this.logger.log(`Retrying failed file ${fileId} for user ${userId} in tenant ${tenantId}`);
  994. // 1. Get file with tenant restriction
  995. const kb = await this.kbRepository.findOne({
  996. where: { id: fileId, tenantId },
  997. });
  998. if (!kb) {
  999. throw new NotFoundException('ファイルが存在しません');
  1000. }
  1001. if (kb.status !== FileStatus.FAILED) {
  1002. throw new Error(this.i18nService.formatMessage('onlyFailedFilesRetryable', { status: kb.status }));
  1003. }
  1004. if (!kb.content || kb.content.trim().length === 0) {
  1005. throw new Error(this.i18nService.getMessage('emptyFileRetryFailed'));
  1006. }
  1007. // 2. ステータスを INDEXING にリセット
  1008. await this.updateStatus(fileId, FileStatus.INDEXING);
  1009. // 3. 非同期でベクトル化をトリガー(既存ロジックを再利用)
  1010. this.vectorizeToElasticsearch(
  1011. fileId,
  1012. userId,
  1013. tenantId,
  1014. kb.content,
  1015. {
  1016. chunkSize: kb.chunkSize,
  1017. chunkOverlap: kb.chunkOverlap,
  1018. embeddingModelId: kb.embeddingModelId,
  1019. }
  1020. ).catch((err) => {
  1021. this.logger.error(`Retry vectorization failed for file ${fileId}`, err);
  1022. });
  1023. // 4. 更新後のファイルステータスを返却
  1024. const updatedKb = await this.kbRepository.findOne({ where: { id: fileId, tenantId } });
  1025. if (!updatedKb) {
  1026. throw new NotFoundException('ファイルが存在しません');
  1027. }
  1028. return updatedKb;
  1029. }
  1030. /**
  1031. * ファイルのすべてのチャンク情報を取得
  1032. */
  1033. async getFileChunks(fileId: string, userId: string, tenantId: string) {
  1034. this.logger.log(`Getting chunks for file ${fileId}, user ${userId}, tenant ${tenantId}`);
  1035. // 1. Get file with tenant check
  1036. const kb = await this.kbRepository.findOne({
  1037. where: { id: fileId, tenantId },
  1038. });
  1039. if (!kb) {
  1040. throw new NotFoundException('ファイルが存在しません');
  1041. }
  1042. // 2. Elasticsearch からすべてのチャンクを取得
  1043. const chunks = await this.elasticsearchService.getFileChunks(fileId, userId, tenantId);
  1044. // 3. チャンク情報を返却
  1045. return {
  1046. fileId: kb.id,
  1047. fileName: kb.originalName,
  1048. totalChunks: chunks.length,
  1049. chunkSize: kb.chunkSize,
  1050. chunkOverlap: kb.chunkOverlap,
  1051. chunks: chunks.map(chunk => ({
  1052. index: chunk.chunkIndex,
  1053. content: chunk.content,
  1054. contentLength: chunk.content.length,
  1055. startPosition: chunk.startPosition,
  1056. endPosition: chunk.endPosition,
  1057. })),
  1058. };
  1059. }
  1060. private async updateStatus(id: string, status: FileStatus) {
  1061. await this.kbRepository.update(id, { status });
  1062. }
  1063. // PDF プレビュー関連メソッド
  1064. async ensurePDFExists(fileId: string, userId: string, tenantId: string, force: boolean = false): Promise<string> {
  1065. const kb = await this.kbRepository.findOne({
  1066. where: { id: fileId, tenantId },
  1067. });
  1068. if (!kb) {
  1069. throw new NotFoundException(this.i18nService.getMessage('fileNotFound'));
  1070. }
  1071. // 元ファイルが PDF の場合は、元ファイルのパスを直接返す
  1072. if (kb.mimetype === 'application/pdf') {
  1073. return kb.storagePath;
  1074. }
  1075. // プレビュー変換に対応しているか確認(ドキュメント類または画像類のみ許可)
  1076. const ext = kb.originalName.toLowerCase().split('.').pop() || '';
  1077. const isConvertible = [...DOC_EXTENSIONS, ...IMAGE_EXTENSIONS].includes(ext);
  1078. if (!isConvertible) {
  1079. this.logger.log(`Skipping PDF conversion for unsupported format: .${ext} (${kb.originalName})`);
  1080. throw new Error(this.i18nService.getMessage('pdfPreviewNotSupported'));
  1081. }
  1082. // PDF フィールドパスを生成
  1083. const path = await import('path');
  1084. const fs = await import('fs');
  1085. const uploadDir = path.dirname(kb.storagePath);
  1086. const baseName = path.basename(kb.storagePath, path.extname(kb.storagePath));
  1087. const pdfPath = path.join(uploadDir, `${baseName}.pdf`);
  1088. // 強制再生成が指定され、ファイルが存在する場合は削除
  1089. if (force && fs.existsSync(pdfPath)) {
  1090. try {
  1091. fs.unlinkSync(pdfPath);
  1092. this.logger.log(`Forced regeneration: Deleted existing PDF ${pdfPath}`);
  1093. } catch (e) {
  1094. this.logger.warn(`Failed to delete existing PDF for regeneration: ${e.message}`);
  1095. }
  1096. }
  1097. // 変換済みかつ強制再生成が不要か確認
  1098. if (fs.existsSync(pdfPath) && !force) {
  1099. if (!kb.pdfPath) {
  1100. await this.kbRepository.update(kb.id, { pdfPath: pdfPath });
  1101. }
  1102. return pdfPath;
  1103. }
  1104. // PDF への変換が必要
  1105. try {
  1106. this.logger.log(`Starting PDF conversion for ${kb.originalName} at ${kb.storagePath}`);
  1107. // ファイルを変換
  1108. await this.libreOfficeService.convertToPDF(kb.storagePath);
  1109. // 変換結果を確認
  1110. if (!fs.existsSync(pdfPath)) {
  1111. throw new Error(`PDF conversion completed but file not found at ${pdfPath}`);
  1112. }
  1113. const stats = fs.statSync(pdfPath);
  1114. if (stats.size === 0) {
  1115. fs.unlinkSync(pdfPath);
  1116. throw new Error(`PDF conversion failed: output file is empty`);
  1117. }
  1118. await this.kbRepository.update(kb.id, { pdfPath: pdfPath });
  1119. this.logger.log(`PDF conversion successful: ${pdfPath}`);
  1120. return pdfPath;
  1121. } catch (error) {
  1122. this.logger.error(`PDF conversion failed for ${fileId}: ${error.message}`, error.stack);
  1123. throw new Error(this.i18nService.formatMessage('pdfConversionFailedDetail', { id: fileId }));
  1124. }
  1125. }
  1126. async getPDFStatus(fileId: string, userId: string, tenantId: string) {
  1127. const kb = await this.kbRepository.findOne({
  1128. where: { id: fileId, tenantId },
  1129. });
  1130. if (!kb) {
  1131. throw new NotFoundException(this.i18nService.getMessage('fileNotFound'));
  1132. }
  1133. // 元ファイルが PDF の場合
  1134. if (kb.mimetype === 'application/pdf') {
  1135. const token = this.generateTempToken(fileId, userId, tenantId);
  1136. return {
  1137. status: 'ready',
  1138. url: `/api/knowledge-bases/${fileId}/pdf?token=${token}`,
  1139. };
  1140. }
  1141. // PDF ファイルパスを生成
  1142. const path = await import('path');
  1143. const fs = await import('fs');
  1144. const uploadDir = path.dirname(kb.storagePath);
  1145. const baseName = path.basename(kb.storagePath, path.extname(kb.storagePath));
  1146. const pdfPath = path.join(uploadDir, `${baseName}.pdf`);
  1147. // 変換済みか確認
  1148. if (fs.existsSync(pdfPath)) {
  1149. if (!kb.pdfPath) {
  1150. kb.pdfPath = pdfPath;
  1151. await this.kbRepository.save(kb);
  1152. }
  1153. const token = this.generateTempToken(fileId, userId, tenantId);
  1154. return {
  1155. status: 'ready',
  1156. url: `/api/knowledge-bases/${fileId}/pdf?token=${token}`,
  1157. };
  1158. }
  1159. // 変換が必要
  1160. return {
  1161. status: 'pending',
  1162. };
  1163. }
  1164. private generateTempToken(fileId: string, userId: string, tenantId: string): string {
  1165. const jwt = require('jsonwebtoken');
  1166. const secret = process.env.JWT_SECRET;
  1167. if (!secret) {
  1168. throw new Error('JWT_SECRET environment variable is required but not set');
  1169. }
  1170. return jwt.sign(
  1171. { fileId, userId, tenantId, type: 'pdf-access' },
  1172. secret,
  1173. { expiresIn: '1h' }
  1174. );
  1175. }
  1176. /**
  1177. * モデルの実際の次元数を取得(キャッシュ確認とプローブロジック付き)
  1178. */
  1179. private async getActualModelDimensions(embeddingModelId: string, userId: string, tenantId: string): Promise<number> {
  1180. const defaultDimensions = parseInt(
  1181. process.env.DEFAULT_VECTOR_DIMENSIONS || '2560',
  1182. );
  1183. try {
  1184. // 1. モデル設定から優先的に取得
  1185. const modelConfig = await this.modelConfigService.findOne(
  1186. embeddingModelId,
  1187. userId,
  1188. tenantId,
  1189. );
  1190. if (modelConfig && modelConfig.dimensions) {
  1191. this.logger.log(`設定から ${modelConfig.name} の次元数を取得しました: ${modelConfig.dimensions}`);
  1192. return modelConfig.dimensions;
  1193. }
  1194. // 2. それ以外の場合はプローブにより取得
  1195. this.logger.log(`モデル次元数をプローブ中: ${embeddingModelId}`);
  1196. const probeEmbeddings = await this.embeddingService.getEmbeddings(
  1197. ['probe'],
  1198. userId,
  1199. embeddingModelId,
  1200. );
  1201. if (probeEmbeddings.length > 0) {
  1202. const actualDimensions = probeEmbeddings[0].length;
  1203. this.logger.log(`モデルの実際の次元数を検出しました: ${actualDimensions}`);
  1204. // 次回利用のためにモデル設定を更新
  1205. if (modelConfig) {
  1206. try {
  1207. await this.modelConfigService.update(userId, tenantId, modelConfig.id, {
  1208. dimensions: actualDimensions,
  1209. });
  1210. this.logger.log(`モデル ${modelConfig.name} の次元数設定を ${actualDimensions} に更新しました`);
  1211. } catch (updateErr) {
  1212. this.logger.warn(`モデル次元数設定の更新に失敗しました: ${updateErr.message}`);
  1213. }
  1214. }
  1215. return actualDimensions;
  1216. }
  1217. } catch (err) {
  1218. this.logger.warn(
  1219. `次元数の取得に失敗しました。デフォルト次元数を使用します: ${defaultDimensions}`,
  1220. err.message,
  1221. );
  1222. }
  1223. return defaultDimensions;
  1224. }
  1225. /**
  1226. * AIを使用して文書のタイトルを自動生成する
  1227. */
  1228. async generateTitle(kbId: string): Promise<string | null> {
  1229. this.logger.log(`Generating automatic title for file ${kbId}`);
  1230. try {
  1231. const kb = await this.kbRepository.findOne({ where: { id: kbId } });
  1232. if (!kb || !kb.content || kb.content.trim().length === 0) {
  1233. return null;
  1234. }
  1235. const tenantId = kb.tenantId;
  1236. // すでにタイトルがある場合はスキップ
  1237. if (kb.title) {
  1238. return kb.title;
  1239. }
  1240. // コンテンツの冒頭サンプルを取得(最大2500文字)
  1241. const contentSample = kb.content.substring(0, 2500);
  1242. // ユーザー設定から言語を取得、またはデフォルトを使用
  1243. const settings = await this.userSettingService.findOrCreate(kb.userId);
  1244. const language = settings.language || 'ja';
  1245. // プロンプトを構築
  1246. const prompt = this.i18nService.getDocumentTitlePrompt(language, contentSample);
  1247. // LLMを呼び出してタイトルを生成
  1248. let generatedTitle: string | undefined;
  1249. try {
  1250. generatedTitle = await this.chatService.generateSimpleChat(
  1251. [{ role: 'user', content: prompt }],
  1252. kb.userId,
  1253. kb.tenantId
  1254. );
  1255. } catch (err) {
  1256. this.logger.warn(`Failed to generate title for document ${kbId} due to LLM configuration issue: ${err.message}`);
  1257. return null; // Skip title generation if LLM is not configured for this tenant
  1258. }
  1259. if (generatedTitle && generatedTitle.trim().length > 0) {
  1260. // 余分な引用符や改行を除去
  1261. const cleanedTitle = generatedTitle.trim().replace(/^["']|["']$/g, '').substring(0, 100);
  1262. await this.kbRepository.update(kbId, { title: cleanedTitle });
  1263. // Elasticsearch のチャンクも更新
  1264. await this.elasticsearchService.updateTitleByFileId(kbId, cleanedTitle, tenantId).catch((err) => {
  1265. this.logger.error(`Failed to update title in Elasticsearch for ${kbId}`, err);
  1266. });
  1267. this.logger.log(`Successfully generated title for ${kbId}: ${cleanedTitle}`);
  1268. return cleanedTitle;
  1269. }
  1270. } catch (error) {
  1271. this.logger.error(`Failed to generate title for ${kbId}`, error);
  1272. }
  1273. return null;
  1274. }
  1275. }