import { Injectable, Logger, NotFoundException, Inject, forwardRef } from '@nestjs/common'; import { ConfigService } from '@nestjs/config'; import { I18nService } from '../i18n/i18n.service'; import { InjectRepository } from '@nestjs/typeorm'; import { Repository, In } from 'typeorm'; import { FileStatus, KnowledgeBase, ProcessingMode } from './knowledge-base.entity'; import { KnowledgeGroup } from '../knowledge-group/knowledge-group.entity'; import { ElasticsearchService } from '../elasticsearch/elasticsearch.service'; import { TikaService } from '../tika/tika.service'; import * as fs from 'fs'; import * as path from 'path'; import { EmbeddingService } from './embedding.service'; import { TextChunkerService } from './text-chunker.service'; import { ModelConfigService } from '../model-config/model-config.service'; import { RagService } from '../rag/rag.service'; import { VisionService } from '../vision/vision.service'; import { UserSettingService } from '../user-setting/user-setting.service'; import { MemoryMonitorService } from './memory-monitor.service'; import { ChunkConfigService } from './chunk-config.service'; import { VisionPipelineService } from '../vision-pipeline/vision-pipeline.service'; import { LibreOfficeService } from '../libreoffice/libreoffice.service'; import { Pdf2ImageService } from '../pdf2image/pdf2image.service'; import { DOC_EXTENSIONS, IMAGE_EXTENSIONS } from '../common/file-support.constants'; import { ChatService } from '../chat/chat.service'; @Injectable() export class KnowledgeBaseService { private readonly logger = new Logger(KnowledgeBaseService.name); constructor( @InjectRepository(KnowledgeBase) private kbRepository: Repository, @InjectRepository(KnowledgeGroup) private groupRepository: Repository, @Inject(forwardRef(() => ElasticsearchService)) private elasticsearchService: ElasticsearchService, private tikaService: TikaService, private embeddingService: EmbeddingService, private textChunkerService: TextChunkerService, private modelConfigService: ModelConfigService, @Inject(forwardRef(() => RagService)) private ragService: RagService, private visionService: VisionService, private userSettingService: UserSettingService, private memoryMonitor: MemoryMonitorService, private chunkConfigService: ChunkConfigService, private visionPipelineService: VisionPipelineService, private libreOfficeService: LibreOfficeService, private pdf2ImageService: Pdf2ImageService, private configService: ConfigService, private i18nService: I18nService, @Inject(forwardRef(() => ChatService)) private chatService: ChatService, ) { } async createAndIndex( fileInfo: any, userId: string, tenantId: string, config?: any, ): Promise { const mode = config?.mode || 'fast'; const processingMode = mode === 'precise' ? ProcessingMode.PRECISE : ProcessingMode.FAST; const kb = this.kbRepository.create({ originalName: fileInfo.originalname, storagePath: fileInfo.path, size: fileInfo.size, mimetype: fileInfo.mimetype, status: FileStatus.PENDING, userId: userId, tenantId: tenantId, chunkSize: config?.chunkSize || 200, chunkOverlap: config?.chunkOverlap || 40, embeddingModelId: config?.embeddingModelId || null, processingMode: processingMode, }); // 分類(グループ)の関連付け if (config?.groupIds && config.groupIds.length > 0) { const groups = await this.groupRepository.find({ where: { id: In(config.groupIds), tenantId: tenantId } }); kb.groups = groups; } const savedKb = await this.kbRepository.save(kb); this.logger.log( `Created KB record: ${savedKb.id}, mode: ${mode}, file: ${fileInfo.originalname}` ); // --------------------------------------------------------- // Move the file to the final partitioned directory // source: uploads/{tenantId}/{filename} (or wherever it was) // target: uploads/{tenantId}/{savedKb.id}/{filename} // --------------------------------------------------------- const fs = await import('fs'); const path = await import('path'); const uploadPath = process.env.UPLOAD_FILE_PATH || './uploads'; const targetDir = path.join(uploadPath, tenantId || 'default', savedKb.id); const targetPath = path.join(targetDir, fileInfo.filename); try { if (!fs.existsSync(targetDir)) { fs.mkdirSync(targetDir, { recursive: true }); } if (fs.existsSync(fileInfo.path)) { fs.renameSync(fileInfo.path, targetPath); // Update the DB record with the new path savedKb.storagePath = targetPath; await this.kbRepository.save(savedKb); this.logger.log(`Moved file to partitioned storage: ${targetPath}`); } } catch (fsError) { this.logger.error(`Failed to move file ${savedKb.id} to partitioned storage`, fsError); // We will let it continue, but the file might be stuck in the temp/root folder } // If queue processing is requested, await completion if (config?.waitForCompletion) { await this.processFile(savedKb.id, userId, tenantId, config); } else { // Otherwise trigger asynchronously (default) this.processFile(savedKb.id, userId, tenantId, config).catch((err) => { this.logger.error(`Error processing file ${savedKb.id}`, err); }); } return savedKb; } async findAll(userId: string, tenantId?: string): Promise { const where: any = {}; if (tenantId) { where.tenantId = tenantId; } else { where.userId = userId; } return this.kbRepository.find({ where, relations: ['groups'], // グループリレーションをロード order: { createdAt: 'DESC' }, }); } async searchKnowledge(userId: string, tenantId: string, query: string, topK: number = 5) { try { // 環境変数のデフォルト次元数を使用してシミュレーションベクトルを生成 const defaultDimensions = parseInt( process.env.DEFAULT_VECTOR_DIMENSIONS || '2560', ); const mockEmbedding = Array.from( { length: defaultDimensions }, () => Math.random() - 0.5, ); const queryVector = mockEmbedding; // 2. Search in Elasticsearch const searchResults = await this.elasticsearchService.searchSimilar( queryVector, userId, topK, tenantId, // Ensure shared visibility within tenant ); // 3. Get file information from database const fileIds = [...new Set(searchResults.map((r) => r.fileId))]; const files = await this.kbRepository.findByIds(fileIds); const fileMap = new Map(files.map((f) => [f.id, f])); // 4. Combine results with file info const results = searchResults.map((result) => { const file = fileMap.get(result.fileId); return { ...result, file: file ? { id: file.id, name: file.originalName, mimetype: file.mimetype, size: file.size, createdAt: file.createdAt, } : null, }; }); return { query, results, total: results.length, }; } catch (error) { this.logger.error( `Metadata search failed for tenant ${tenantId}:`, error.stack || error.message, ); throw error; } } async ragSearch(userId: string, tenantId: string, query: string, settings: any) { this.logger.log( `RAG search request: userId=${userId}, query="${query}", settings=${JSON.stringify(settings)}`, ); try { const ragResults = await this.ragService.searchKnowledge( query, userId, settings.topK, settings.similarityThreshold, settings.selectedEmbeddingId, settings.enableFullTextSearch, settings.enableRerank, settings.selectedRerankId, undefined, undefined, settings.rerankSimilarityThreshold, tenantId, // Ensure shared visibility within tenant for RAG ); const sources = this.ragService.extractSources(ragResults); const ragPrompt = this.ragService.buildRagPrompt( query, ragResults, settings.language || 'ja', ); const result = { searchResults: ragResults, sources, ragPrompt, hasRelevantContent: ragResults.length > 0, }; this.logger.log( `RAG search completed: found ${ragResults.length} results`, ); return result; } catch (error) { this.logger.error( `RAG search failed for user ${userId}:`, error.stack || error.message, ); // エラーをスローするのではなく空の結果を返し、システムの稼働を継続させる return { searchResults: [], sources: [], ragPrompt: query, // オリジナルのクエリを使用 hasRelevantContent: false, }; } } async deleteFile(fileId: string, userId: string, tenantId: string): Promise { this.logger.log(`Deleting file ${fileId} for user ${userId}`); try { // 1. Get file info const file = await this.kbRepository.findOne({ where: { id: fileId, tenantId }, // Filter by tenantId }); if (!file) { throw new NotFoundException(this.i18nService.getMessage('fileNotFound')); } // 2. Delete file from filesystem const fs = await import('fs'); try { if (fs.existsSync(file.storagePath)) { fs.unlinkSync(file.storagePath); this.logger.log(`Deleted file: ${file.storagePath}`); } } catch (error) { this.logger.warn(`Failed to delete file ${file.storagePath}:`, error); } // 3. Delete from Elasticsearch try { await this.elasticsearchService.deleteByFileId(fileId, userId, tenantId); this.logger.log(`Deleted ES documents for file ${fileId}`); } catch (error) { this.logger.warn( `Failed to delete ES documents for file ${fileId}:`, error, ); } // 4. Remove from all groups (cleanup M2M relations) const fileWithGroups = await this.kbRepository.findOne({ where: { id: fileId, tenantId }, relations: ['groups'], }); if (fileWithGroups && fileWithGroups.groups && fileWithGroups.groups.length > 0) { // Clear groups to remove entries from join table fileWithGroups.groups = []; await this.kbRepository.save(fileWithGroups); this.logger.log(`Cleared group associations for file ${fileId}`); } // 5. Delete from SQLite await this.kbRepository.delete({ id: fileId }); this.logger.log(`Deleted database record for file ${fileId}`); } catch (error) { this.logger.error(`Failed to delete file ${fileId}`, error); throw error; } } async clearAll(userId: string, tenantId: string): Promise { this.logger.log(`Clearing all knowledge base data for user ${userId} in tenant ${tenantId}`); try { // Get all files and delete them one by one const files = await this.kbRepository.find(); for (const file of files) { await this.deleteFile(file.id, userId, tenantId); } this.logger.log(`Cleared all knowledge base data for user ${userId}`); } catch (error) { this.logger.error( `Failed to clear knowledge base for user ${userId}`, error, ); throw error; } } private async processFile(kbId: string, userId: string, tenantId: string, config?: any) { this.logger.log(`Starting processing for file ${kbId}, mode: ${config?.mode || 'fast'}`); await this.updateStatus(kbId, FileStatus.INDEXING); try { const kb = await this.kbRepository.findOne({ where: { id: kbId } }); if (!kb) { this.logger.error(`KB not found: ${kbId}`); return; } // メモリ監視 - 処理前チェック const memBefore = this.memoryMonitor.getMemoryUsage(); this.logger.log(`メモリ状態 - 処理前: ${memBefore.heapUsed}/${memBefore.heapTotal}MB`); // モードに基づいて処理フローを選択 const mode = config?.mode || 'fast'; if (mode === 'precise') { // 精密モード - Vision Pipeline を使用 await this.processPreciseMode(kb, userId, tenantId, config); } else { // 高速モード - Tika を使用 await this.processFastMode(kb, userId, tenantId, config); } this.logger.log(`File ${kbId} processed successfully in ${mode} mode.`); } catch (error) { this.logger.error(`Failed to process file ${kbId}`, error); await this.updateStatus(kbId, FileStatus.FAILED); } } /** * 高速モード処理(既存フロー) */ private async processFastMode(kb: KnowledgeBase, userId: string, tenantId: string, config?: any) { // 1. Tika を使用してテキストを抽出 let text = await this.tikaService.extractText(kb.storagePath); // 画像ファイルの場合はビジョンモデルを使用 if (this.visionService.isImageFile(kb.mimetype)) { const visionModelId = await this.userSettingService.getVisionModelId(userId); if (visionModelId) { const visionModel = await this.modelConfigService.findOne( visionModelId, userId, tenantId, ); if (visionModel && visionModel.type === 'vision' && visionModel.isEnabled !== false) { text = await this.visionService.extractImageContent(kb.storagePath, { baseUrl: visionModel.baseUrl || '', apiKey: visionModel.apiKey || '', modelId: visionModel.modelId, }); } } } if (!text || text.trim().length === 0) { this.logger.warn(this.i18nService.getMessage('noTextExtracted')); } // テキストサイズを確認 const textSizeMB = Math.round(text.length / 1024 / 1024); if (textSizeMB > 50) { this.logger.warn(this.i18nService.formatMessage('extractedTextTooLarge', { size: textSizeMB })); } // テキストをデータベースに保存 await this.kbRepository.update(kb.id, { content: text }); await this.updateStatus(kb.id, FileStatus.EXTRACTED); // 非同期ベクトル化 await this.vectorizeToElasticsearch(kb.id, userId, tenantId, text, config).catch((err) => { this.logger.error(`Error vectorizing file ${kb.id}`, err); }); // 自動タイトル生成 (非同期的に実行) this.generateTitle(kb.id).catch((err) => { this.logger.error(`Error generating title for file ${kb.id}`, err); }); // 非同期的に PDF 変換をトリガー(ドキュメントファイルの場合) this.ensurePDFExists(kb.id, userId, tenantId).catch((err) => { this.logger.warn(this.i18nService.formatMessage('pdfConversionFailedDetail', { id: kb.id }), err); }); } /** * 精密モード処理(新規フロー) */ private async processPreciseMode(kb: KnowledgeBase, userId: string, tenantId: string, config?: any) { // 精密モードがサポートされているか確認 const preciseFormats = ['.pdf', '.doc', '.docx', '.ppt', '.pptx']; const ext = kb.originalName.toLowerCase().substring(kb.originalName.lastIndexOf('.')); if (!preciseFormats.includes(ext)) { this.logger.warn( this.i18nService.formatMessage('preciseModeUnsupported', { ext }) ); return this.processFastMode(kb, userId, tenantId, config); } // Vision モデルが設定されているか確認 const visionModelId = await this.userSettingService.getVisionModelId(userId); if (!visionModelId) { this.logger.warn( this.i18nService.getMessage('visionModelNotConfiguredFallback') ); return this.processFastMode(kb, userId, tenantId, config); } const visionModel = await this.modelConfigService.findOne( visionModelId, userId, tenantId, ); if (!visionModel || visionModel.type !== 'vision' || visionModel.isEnabled === false) { this.logger.warn( this.i18nService.getMessage('visionModelInvalidFallback') ); return this.processFastMode(kb, userId, tenantId, config); } // Vision Pipeline を呼び出し try { const result = await this.visionPipelineService.processPreciseMode( kb.storagePath, { userId, tenantId, // New modelId: visionModelId, fileId: kb.id, fileName: kb.originalName, skipQualityCheck: false, } ); if (!result.success) { this.logger.error(`Vision pipeline failed, falling back to fast mode`); this.logger.warn(this.i18nService.getMessage('visionPipelineFailed')); return this.processFastMode(kb, userId, tenantId, config); } // テキスト内容をデータベースに保存 const combinedText = result.results.map(r => r.text).join('\n\n'); const metadata = { processedPages: result.processedPages, failedPages: result.failedPages, cost: result.cost, duration: result.duration, results: result.results.map(r => ({ pageIndex: r.pageIndex, confidence: r.confidence, layout: r.layout, imageCount: r.images.length, })), }; await this.kbRepository.update(kb.id, { content: combinedText, metadata: metadata as any, }); await this.updateStatus(kb.id, FileStatus.EXTRACTED); this.logger.log( this.i18nService.formatMessage('preciseModeComplete', { pages: result.processedPages, cost: result.cost.toFixed(2) }) ); // 非同期でベクトル化し、Elasticsearch にインデックス // 各ページを独立したドキュメントとして作成し、メタデータを保持 this.indexPreciseResults(kb, userId, tenantId, kb.embeddingModelId, result.results).catch((err) => { this.logger.error(`Error indexing precise results for ${kb.id}`, err); }); // 非同期で PDF 変換をトリガー this.ensurePDFExists(kb.id, userId, tenantId).catch((err) => { this.logger.warn(`Initial PDF conversion failed for ${kb.id}`, err); }); // 自動タイトル生成 (非同期的に実行) this.generateTitle(kb.id).catch((err) => { this.logger.error(`Error generating title for file ${kb.id}`, err); }); } catch (error) { this.logger.error(`Vision pipeline error: ${error.message}, falling back to fast mode`); return this.processFastMode(kb, userId, tenantId, config); } } /** * 精密モードの結果をインデックス */ private async indexPreciseResults( kb: KnowledgeBase, userId: string, tenantId: string, embeddingModelId: string, results: any[] ): Promise { this.logger.log(`Indexing ${results.length} precise results for ${kb.id}`); // インデックスの存在を確認 - 実際のモデル次元数を取得 const actualDimensions = await this.getActualModelDimensions(embeddingModelId, userId, tenantId); await this.elasticsearchService.createIndexIfNotExists(actualDimensions); // ベクトル化とインデックスをバッチ処理 const batchSize = parseInt(process.env.CHUNK_BATCH_SIZE || '50'); for (let i = 0; i < results.length; i += batchSize) { const batch = results.slice(i, i + batchSize); const texts = batch.map(r => r.text); try { // ベクトルを生成 const embeddings = await this.embeddingService.getEmbeddings( texts, userId, embeddingModelId ); // 各結果をインデックス for (let j = 0; j < batch.length; j++) { const result = batch[j]; const embedding = embeddings[j]; if (!embedding || embedding.length === 0) { this.logger.warn(this.i18nService.formatMessage('skippingEmptyVectorPage', { page: result.pageIndex })); continue; } await this.elasticsearchService.indexDocument( `${kb.id}_page_${result.pageIndex}`, result.text, embedding, { fileId: kb.id, originalName: kb.originalName, mimetype: kb.mimetype, userId: userId, tenantId: tenantId, // New pageNumber: result.pageIndex, images: result.images, layout: result.layout, confidence: result.confidence, source: 'precise', mode: 'vision', } ); } this.logger.log(`バッチ ${Math.floor(i / batchSize) + 1} 完了: ${batch.length} ページ`); } catch (error) { this.logger.error(`バッチ ${Math.floor(i / batchSize) + 1} の処理に失敗しました`, error); } } await this.updateStatus(kb.id, FileStatus.VECTORIZED); this.logger.log(`精密モードのインデックス完了: ${results.length} ページ`); } /** * PDF の特定ページの画像を取得 */ async getPageAsImage(fileId: string, pageIndex: number, userId: string, tenantId: string): Promise { const pdfPath = await this.ensurePDFExists(fileId, userId, tenantId); // 特定のページを変換 const result = await this.pdf2ImageService.convertToImages(pdfPath, { density: 150, quality: 75, format: 'jpeg', }); // 対応するページ番号の画像を見つける const pageImage = result.images.find(img => img.pageIndex === pageIndex + 1); if (!pageImage) { throw new NotFoundException(this.i18nService.formatMessage('pageImageNotFoundDetail', { page: pageIndex + 1 })); } return pageImage.path; } private async vectorizeToElasticsearch( kbId: string, userId: string, tenantId: string, text: string, config?: any, ) { try { const kb = await this.kbRepository.findOne({ where: { id: kbId, tenantId } }); if (!kb) return; // メモリ監視 - ベクトル化前チェック const memBeforeChunk = this.memoryMonitor.getMemoryUsage(); this.logger.log( `ベクトル化前メモリ: ${memBeforeChunk.heapUsed}/${memBeforeChunk.heapTotal}MB`, ); this.logger.debug(`File ${kbId}: Validating chunk config...`); // 1. チャンク設定の検証と修正(モデルの制限と環境変数に基づく) const validatedConfig = await this.chunkConfigService.validateChunkConfig( kb.chunkSize, kb.chunkOverlap, kb.embeddingModelId, userId, ); this.logger.debug(`File ${kbId}: Chunk config validated.`); // 設定が修正された場合、警告を記録しデータベースを更新 if (validatedConfig.warnings.length > 0) { this.logger.warn( this.i18nService.formatMessage('chunkConfigCorrection', { warnings: validatedConfig.warnings.join(', ') }) ); // データベース内の設定を更新 if (validatedConfig.chunkSize !== kb.chunkSize || validatedConfig.chunkOverlap !== kb.chunkOverlap) { await this.kbRepository.update(kbId, { chunkSize: validatedConfig.chunkSize, chunkOverlap: validatedConfig.chunkOverlap, }); } } // 設定サマリーを表示(実際に適用される上限を含む) this.logger.debug(`File ${kbId}: Getting config summary...`); const configSummary = await this.chunkConfigService.getConfigSummary( validatedConfig.chunkSize, validatedConfig.chunkOverlap, kb.embeddingModelId, userId, ); this.logger.log(`チャンク設定: ${configSummary}`); this.logger.log(`設定上限: チャンク=${validatedConfig.effectiveMaxChunkSize}, 重複=${validatedConfig.effectiveMaxOverlapSize}`); // 2. 検証済みの設定を使用してチャンク分割 const chunks = this.textChunkerService.chunkText( text, validatedConfig.chunkSize, validatedConfig.chunkOverlap, ); this.logger.log(`ファイル ${kbId} から ${chunks.length} 個のテキストブロックを分割しました`); if (chunks.length === 0) { this.logger.warn(this.i18nService.formatMessage('noChunksGenerated', { id: kbId })); await this.updateStatus(kbId, FileStatus.VECTORIZED); return; } // 3. チャンク数が妥当か確認 const estimatedChunkCount = this.chunkConfigService.estimateChunkCount( text.length, validatedConfig.chunkSize, ); if (chunks.length > estimatedChunkCount * 1.2) { this.logger.warn( this.i18nService.formatMessage('chunkCountAnomaly', { actual: chunks.length, estimated: estimatedChunkCount }) ); } // 4. 推奨バッチサイズを取得(モデルの制限に基づく) const recommendedBatchSize = await this.chunkConfigService.getRecommendedBatchSize( kb.embeddingModelId, userId, tenantId, parseInt(process.env.CHUNK_BATCH_SIZE || '100'), ); // 5. メモリ使用量を推定 const avgChunkSize = chunks.reduce((sum, c) => sum + c.content.length, 0) / chunks.length; const estimatedMemory = this.memoryMonitor.estimateMemoryUsage( chunks.length, avgChunkSize, parseInt(process.env.DEFAULT_VECTOR_DIMENSIONS || '2560'), ); this.logger.log(`推定メモリ使用量: ${estimatedMemory}MB (バッチサイズ: ${recommendedBatchSize})`); // 6. 実際のモデル次元数を取得し、インデックスの存在を確認 const actualDimensions = await this.getActualModelDimensions(kb.embeddingModelId, userId, tenantId); await this.elasticsearchService.createIndexIfNotExists(actualDimensions); // 7. ベクトル化とインデックス作成をバッチ処理 const useBatching = this.memoryMonitor.shouldUseBatching( chunks.length, avgChunkSize, actualDimensions, ); if (useBatching) { try { await this.processInBatches( chunks, async (batch, batchIndex) => { // バッチサイズがモデルの制限を超えていないか検証 if (batch.length > recommendedBatchSize) { this.logger.warn( this.i18nService.formatMessage('batchSizeExceeded', { index: batchIndex, actual: batch.length, limit: recommendedBatchSize }) ); } const chunkTexts = batch.map((chunk) => chunk.content); const embeddings = await this.embeddingService.getEmbeddings( chunkTexts, userId, kb.embeddingModelId, ); // 次元の整合性を検証 if (embeddings.length > 0 && embeddings[0].length !== actualDimensions) { this.logger.warn( `ベクトル次元が不一致です: 期待値 ${actualDimensions}, 実際 ${embeddings[0].length}` ); } // このバッチデータを即座にインデックス for (let i = 0; i < batch.length; i++) { const chunk = batch[i]; const embedding = embeddings[i]; if (!embedding || embedding.length === 0) { this.logger.warn(this.i18nService.formatMessage('skippingEmptyVectorChunk', { index: chunk.index })); continue; } await this.elasticsearchService.indexDocument( `${kb.id}_chunk_${chunk.index}`, chunk.content, embedding, { fileId: kb.id, originalName: kb.originalName, mimetype: kb.mimetype, userId: userId, chunkIndex: chunk.index, startPosition: chunk.startPosition, tenantId, // Passing tenantId to ES } ); } this.logger.log(`バッチ ${batchIndex} 完了: ${batch.length} チャンク`); }, { batchSize: recommendedBatchSize, onBatchComplete: (batchIndex, totalBatches) => { const mem = this.memoryMonitor.getMemoryUsage(); this.logger.log( `バッチ ${batchIndex}/${totalBatches} 完了, メモリ: ${mem.heapUsed}MB`, ); }, }, ); } catch (error) { // コンテキスト長エラーを検出(日本語・中国語・英語に対応) if (error.message && (error.message.includes('context length') || error.message.includes('コンテキスト長が上限を超えています') || error.message.includes('コンテキスト長が上限を超えています'))) { this.logger.warn(this.i18nService.getMessage('contextLengthErrorFallback')); // 単一テキスト処理にダウングレード for (let i = 0; i < chunks.length; i++) { const chunk = chunks[i]; try { const embeddings = await this.embeddingService.getEmbeddings( [chunk.content], // 単一テキスト userId, kb.embeddingModelId, ); if (!embeddings[0] || embeddings[0].length === 0) { this.logger.warn(this.i18nService.formatMessage('skippingEmptyVectorChunk', { index: chunk.index })); continue; } await this.elasticsearchService.indexDocument( `${kb.id}_chunk_${chunk.index}`, chunk.content, embeddings[0], { fileId: kb.id, originalName: kb.originalName, mimetype: kb.mimetype, userId: userId, chunkIndex: chunk.index, startPosition: chunk.startPosition, endPosition: chunk.endPosition, tenantId, } ); if ((i + 1) % 10 === 0) { this.logger.log(`単一処理進捗: ${i + 1}/${chunks.length}`); } } catch (chunkError) { this.logger.error( `テキストブロック ${chunk.index} の処理に失敗しました。スキップします: ${chunkError.message}` ); continue; } } this.logger.log(`単一テキスト処理完了: ${chunks.length} チャンク`); } else { // その他のエラーは直接スロー throw error; } } } else { // 小さなファイル、一括処理(ただしバッチ制限の確認が必要) const chunkTexts = chunks.map((chunk) => chunk.content); // チャンク数がモデルのバッチ制限を超える場合は、強制的にバッチ処理 if (chunks.length > recommendedBatchSize) { this.logger.warn( this.i18nService.formatMessage('chunkLimitExceededForceBatch', { actual: chunks.length, limit: recommendedBatchSize }) ); try { await this.processInBatches( chunks, async (batch, batchIndex) => { const batchTexts = batch.map((c) => c.content); const embeddings = await this.embeddingService.getEmbeddings( batchTexts, userId, kb.embeddingModelId, ); for (let i = 0; i < batch.length; i++) { const chunk = batch[i]; const embedding = embeddings[i]; if (!embedding || embedding.length === 0) { this.logger.warn(`空ベクトルのテキストブロック ${chunk.index} をスキップします`); continue; } await this.elasticsearchService.indexDocument( `${kb.id}_chunk_${chunk.index}`, chunk.content, embedding, { fileId: kb.id, originalName: kb.originalName, mimetype: kb.mimetype, userId: userId, chunkIndex: chunk.index, startPosition: chunk.startPosition, endPosition: chunk.endPosition, tenantId, // Passing tenantId to ES metadata } ); } }, ); } catch (error) { // コンテキスト長エラーを検出(日本語・中国語・英語に対応) if (error.message && (error.message.includes('context length') || error.message.includes('コンテキスト長が上限を超えています') || error.message.includes('コンテキスト長が上限を超えています'))) { this.logger.warn(this.i18nService.getMessage('batchContextLengthErrorFallback')); // 単一テキスト処理にダウングレード for (let i = 0; i < chunks.length; i++) { const chunk = chunks[i]; try { const embeddings = await this.embeddingService.getEmbeddings( [chunk.content], // 単一テキスト userId, kb.embeddingModelId, ); if (!embeddings[0] || embeddings[0].length === 0) { this.logger.warn(this.i18nService.formatMessage('skippingEmptyVectorChunk', { index: chunk.index })); continue; } await this.elasticsearchService.indexDocument( `${kb.id}_chunk_${chunk.index}`, chunk.content, embeddings[0], { fileId: kb.id, originalName: kb.originalName, mimetype: kb.mimetype, userId: userId, tenantId, // Added tenantId chunkIndex: chunk.index, startPosition: chunk.startPosition, endPosition: chunk.endPosition, }, ); if ((i + 1) % 10 === 0) { this.logger.log(`単一処理進捗: ${i + 1}/${chunks.length}`); } } catch (chunkError) { this.logger.error( this.i18nService.formatMessage('chunkProcessingFailed', { index: chunk.index, message: chunkError.message }) ); continue; } } this.logger.log(this.i18nService.formatMessage('singleTextProcessingComplete', { count: chunks.length })); } else { // その他のエラー、直接スロー throw error; } } } else { // 十分に小さいファイルの場合は一括で処理 try { const embeddings = await this.embeddingService.getEmbeddings( chunkTexts, userId, kb.embeddingModelId, ); for (let i = 0; i < chunks.length; i++) { const chunk = chunks[i]; const embedding = embeddings[i]; if (!embedding || embedding.length === 0) { this.logger.warn(this.i18nService.formatMessage('skippingEmptyVectorChunk', { index: chunk.index })); continue; } await this.elasticsearchService.indexDocument( `${kb.id}_chunk_${chunk.index}`, chunk.content, embedding, { fileId: kb.id, originalName: kb.originalName, mimetype: kb.mimetype, userId: userId, tenantId, // Added tenantId chunkIndex: chunk.index, startPosition: chunk.startPosition, endPosition: chunk.endPosition, }, ); } } catch (error) { // コンテキスト長エラーを検出(日本語・中国語・英語に対応) if (error.message && (error.message.includes('context length') || error.message.includes('コンテキスト長が上限を超えています') || error.message.includes('コンテキスト長が上限を超えています'))) { this.logger.warn(this.i18nService.getMessage('batchContextLengthErrorFallback')); // 単一テキスト処理にダウングレード for (let i = 0; i < chunks.length; i++) { const chunk = chunks[i]; try { const embeddings = await this.embeddingService.getEmbeddings( [chunk.content], // 単一テキスト userId, kb.embeddingModelId, ); if (!embeddings[0] || embeddings[0].length === 0) { this.logger.warn(`空ベクトルのテキストブロック ${chunk.index} をスキップします`); continue; } await this.elasticsearchService.indexDocument( `${kb.id}_chunk_${chunk.index}`, chunk.content, embeddings[0], { fileId: kb.id, originalName: kb.originalName, mimetype: kb.mimetype, userId: userId, tenantId, // Added tenantId chunkIndex: chunk.index, startPosition: chunk.startPosition, endPosition: chunk.endPosition, }, ); if ((i + 1) % 10 === 0) { this.logger.log(`単一処理進捗: ${i + 1}/${chunks.length}`); } } catch (chunkError) { this.logger.error( `テキストブロック ${chunk.index} の処理に失敗しました。スキップします: ${chunkError.message}` ); continue; } } this.logger.log(this.i18nService.formatMessage('singleTextProcessingComplete', { count: chunks.length })); } else { // その他のエラー、直接スロー throw error; } } } } await this.updateStatus(kbId, FileStatus.VECTORIZED); const memAfter = this.memoryMonitor.getMemoryUsage(); this.logger.log( this.i18nService.formatMessage('fileVectorizationComplete', { id: kbId, count: chunks.length, memory: memAfter.heapUsed }) ); } catch (error) { this.logger.error(this.i18nService.formatMessage('fileVectorizationFailed', { id: kbId }), error); // エラー情報を metadata に保存 try { const kb = await this.kbRepository.findOne({ where: { id: kbId } }); if (kb) { const metadata = kb.metadata || {}; metadata.lastError = error.message; metadata.failedAt = new Date().toISOString(); await this.kbRepository.update(kbId, { metadata }); } } catch (e) { this.logger.warn(`Failed to update metadata for failed file ${kbId}`, e); } await this.updateStatus(kbId, FileStatus.FAILED); } } /** * バッチ処理、メモリ制御付き */ private async processInBatches( items: T[], processor: (batch: T[], batchIndex: number) => Promise, options?: { batchSize?: number; onBatchComplete?: (batchIndex: number, totalBatches: number) => void; }, ): Promise { const totalItems = items.length; if (totalItems === 0) return; const startTime = Date.now(); this.logger.log(this.i18nService.formatMessage('batchProcessingStarted', { count: totalItems })); // Use provided batch size or fallback to env/default const initialBatchSize = options?.batchSize || parseInt(process.env.CHUNK_BATCH_SIZE || '100'); const totalBatches = Math.ceil(totalItems / initialBatchSize); for (let i = 0; i < totalItems;) { // メモリを確認し待機 await this.memoryMonitor.waitForMemoryAvailable(); // バッチサイズを動的に調整 (initialBatchSize から開始し、必要に応じてメモリモニターが削減できるようにします) // 注意: memoryMonitor.getDynamicBatchSize はメモリ状況に基づいてより大きな値を返す可能性がありますが、 // モデルの制限 (initialBatchSize) を尊重する必要があります。 const currentMem = this.memoryMonitor.getMemoryUsage().heapUsed; const dynamicBatchSize = this.memoryMonitor.getDynamicBatchSize(currentMem); // Ensure we don't exceed the model's limit (initialBatchSize) even if memory allows more const batchSize = Math.min(dynamicBatchSize, initialBatchSize); // 現在のバッチを取得 const batch = items.slice(i, i + batchSize); const batchIndex = Math.floor(i / batchSize) + 1; this.logger.log( this.i18nService.formatMessage('batchProcessingProgress', { index: batchIndex, total: totalBatches, count: batch.length }) ); // バッチを処理 await processor(batch, batchIndex); // コールバック通知 if (options?.onBatchComplete) { options.onBatchComplete(batchIndex, totalBatches); } // 強制GC(メモリがしきい値に近い場合) if (currentMem > 800) { this.memoryMonitor.forceGC(); } // 参照をクリアしGCを助ける batch.length = 0; i += batchSize; } const duration = ((Date.now() - startTime) / 1000).toFixed(2); this.logger.log(this.i18nService.formatMessage('batchProcessingComplete', { count: totalItems, duration })); } /** * 失敗したファイルのベクトル化を再試行 */ async retryFailedFile(fileId: string, userId: string, tenantId: string): Promise { this.logger.log(`Retrying failed file ${fileId} for user ${userId} in tenant ${tenantId}`); // 1. Get file with tenant restriction const kb = await this.kbRepository.findOne({ where: { id: fileId, tenantId }, }); if (!kb) { throw new NotFoundException('ファイルが存在しません'); } if (kb.status !== FileStatus.FAILED) { throw new Error(this.i18nService.formatMessage('onlyFailedFilesRetryable', { status: kb.status })); } if (!kb.content || kb.content.trim().length === 0) { throw new Error(this.i18nService.getMessage('emptyFileRetryFailed')); } // 2. ステータスを INDEXING にリセット await this.updateStatus(fileId, FileStatus.INDEXING); // 3. 非同期でベクトル化をトリガー(既存ロジックを再利用) this.vectorizeToElasticsearch( fileId, userId, tenantId, kb.content, { chunkSize: kb.chunkSize, chunkOverlap: kb.chunkOverlap, embeddingModelId: kb.embeddingModelId, } ).catch((err) => { this.logger.error(`Retry vectorization failed for file ${fileId}`, err); }); // 4. 更新後のファイルステータスを返却 const updatedKb = await this.kbRepository.findOne({ where: { id: fileId, tenantId } }); if (!updatedKb) { throw new NotFoundException('ファイルが存在しません'); } return updatedKb; } /** * ファイルのすべてのチャンク情報を取得 */ async getFileChunks(fileId: string, userId: string, tenantId: string) { this.logger.log(`Getting chunks for file ${fileId}, user ${userId}, tenant ${tenantId}`); // 1. Get file with tenant check const kb = await this.kbRepository.findOne({ where: { id: fileId, tenantId }, }); if (!kb) { throw new NotFoundException('ファイルが存在しません'); } // 2. Elasticsearch からすべてのチャンクを取得 const chunks = await this.elasticsearchService.getFileChunks(fileId, userId, tenantId); // 3. チャンク情報を返却 return { fileId: kb.id, fileName: kb.originalName, totalChunks: chunks.length, chunkSize: kb.chunkSize, chunkOverlap: kb.chunkOverlap, chunks: chunks.map(chunk => ({ index: chunk.chunkIndex, content: chunk.content, contentLength: chunk.content.length, startPosition: chunk.startPosition, endPosition: chunk.endPosition, })), }; } private async updateStatus(id: string, status: FileStatus) { await this.kbRepository.update(id, { status }); } // PDF プレビュー関連メソッド async ensurePDFExists(fileId: string, userId: string, tenantId: string, force: boolean = false): Promise { const kb = await this.kbRepository.findOne({ where: { id: fileId, tenantId }, }); if (!kb) { throw new NotFoundException(this.i18nService.getMessage('fileNotFound')); } // 元ファイルが PDF の場合は、元ファイルのパスを直接返す if (kb.mimetype === 'application/pdf') { return kb.storagePath; } // プレビュー変換に対応しているか確認(ドキュメント類または画像類のみ許可) const ext = kb.originalName.toLowerCase().split('.').pop() || ''; const isConvertible = [...DOC_EXTENSIONS, ...IMAGE_EXTENSIONS].includes(ext); if (!isConvertible) { this.logger.log(`Skipping PDF conversion for unsupported format: .${ext} (${kb.originalName})`); throw new Error(this.i18nService.getMessage('pdfPreviewNotSupported')); } // PDF フィールドパスを生成 const path = await import('path'); const fs = await import('fs'); const uploadDir = path.dirname(kb.storagePath); const baseName = path.basename(kb.storagePath, path.extname(kb.storagePath)); const pdfPath = path.join(uploadDir, `${baseName}.pdf`); // 強制再生成が指定され、ファイルが存在する場合は削除 if (force && fs.existsSync(pdfPath)) { try { fs.unlinkSync(pdfPath); this.logger.log(`Forced regeneration: Deleted existing PDF ${pdfPath}`); } catch (e) { this.logger.warn(`Failed to delete existing PDF for regeneration: ${e.message}`); } } // 変換済みかつ強制再生成が不要か確認 if (fs.existsSync(pdfPath) && !force) { if (!kb.pdfPath) { await this.kbRepository.update(kb.id, { pdfPath: pdfPath }); } return pdfPath; } // PDF への変換が必要 try { this.logger.log(`Starting PDF conversion for ${kb.originalName} at ${kb.storagePath}`); // ファイルを変換 await this.libreOfficeService.convertToPDF(kb.storagePath); // 変換結果を確認 if (!fs.existsSync(pdfPath)) { throw new Error(`PDF conversion completed but file not found at ${pdfPath}`); } const stats = fs.statSync(pdfPath); if (stats.size === 0) { fs.unlinkSync(pdfPath); throw new Error(`PDF conversion failed: output file is empty`); } await this.kbRepository.update(kb.id, { pdfPath: pdfPath }); this.logger.log(`PDF conversion successful: ${pdfPath}`); return pdfPath; } catch (error) { this.logger.error(`PDF conversion failed for ${fileId}: ${error.message}`, error.stack); throw new Error(this.i18nService.formatMessage('pdfConversionFailedDetail', { id: fileId })); } } async getPDFStatus(fileId: string, userId: string, tenantId: string) { const kb = await this.kbRepository.findOne({ where: { id: fileId, tenantId }, }); if (!kb) { throw new NotFoundException(this.i18nService.getMessage('fileNotFound')); } // 元ファイルが PDF の場合 if (kb.mimetype === 'application/pdf') { const token = this.generateTempToken(fileId, userId, tenantId); return { status: 'ready', url: `/api/knowledge-bases/${fileId}/pdf?token=${token}`, }; } // PDF ファイルパスを生成 const path = await import('path'); const fs = await import('fs'); const uploadDir = path.dirname(kb.storagePath); const baseName = path.basename(kb.storagePath, path.extname(kb.storagePath)); const pdfPath = path.join(uploadDir, `${baseName}.pdf`); // 変換済みか確認 if (fs.existsSync(pdfPath)) { if (!kb.pdfPath) { kb.pdfPath = pdfPath; await this.kbRepository.save(kb); } const token = this.generateTempToken(fileId, userId, tenantId); return { status: 'ready', url: `/api/knowledge-bases/${fileId}/pdf?token=${token}`, }; } // 変換が必要 return { status: 'pending', }; } private generateTempToken(fileId: string, userId: string, tenantId: string): string { const jwt = require('jsonwebtoken'); const secret = process.env.JWT_SECRET; if (!secret) { throw new Error('JWT_SECRET environment variable is required but not set'); } return jwt.sign( { fileId, userId, tenantId, type: 'pdf-access' }, secret, { expiresIn: '1h' } ); } /** * モデルの実際の次元数を取得(キャッシュ確認とプローブロジック付き) */ private async getActualModelDimensions(embeddingModelId: string, userId: string, tenantId: string): Promise { const defaultDimensions = parseInt( process.env.DEFAULT_VECTOR_DIMENSIONS || '2560', ); try { // 1. モデル設定から優先的に取得 const modelConfig = await this.modelConfigService.findOne( embeddingModelId, userId, tenantId, ); if (modelConfig && modelConfig.dimensions) { this.logger.log(`設定から ${modelConfig.name} の次元数を取得しました: ${modelConfig.dimensions}`); return modelConfig.dimensions; } // 2. それ以外の場合はプローブにより取得 this.logger.log(`モデル次元数をプローブ中: ${embeddingModelId}`); const probeEmbeddings = await this.embeddingService.getEmbeddings( ['probe'], userId, embeddingModelId, ); if (probeEmbeddings.length > 0) { const actualDimensions = probeEmbeddings[0].length; this.logger.log(`モデルの実際の次元数を検出しました: ${actualDimensions}`); // 次回利用のためにモデル設定を更新 if (modelConfig) { try { await this.modelConfigService.update(userId, tenantId, modelConfig.id, { dimensions: actualDimensions, }); this.logger.log(`モデル ${modelConfig.name} の次元数設定を ${actualDimensions} に更新しました`); } catch (updateErr) { this.logger.warn(`モデル次元数設定の更新に失敗しました: ${updateErr.message}`); } } return actualDimensions; } } catch (err) { this.logger.warn( `次元数の取得に失敗しました。デフォルト次元数を使用します: ${defaultDimensions}`, err.message, ); } return defaultDimensions; } /** * AIを使用して文書のタイトルを自動生成する */ async generateTitle(kbId: string): Promise { this.logger.log(`Generating automatic title for file ${kbId}`); try { const kb = await this.kbRepository.findOne({ where: { id: kbId } }); if (!kb || !kb.content || kb.content.trim().length === 0) { return null; } const tenantId = kb.tenantId; // すでにタイトルがある場合はスキップ if (kb.title) { return kb.title; } // コンテンツの冒頭サンプルを取得(最大2500文字) const contentSample = kb.content.substring(0, 2500); // ユーザー設定から言語を取得、またはデフォルトを使用 const settings = await this.userSettingService.findOrCreate(kb.userId); const language = settings.language || 'ja'; // プロンプトを構築 const prompt = this.i18nService.getDocumentTitlePrompt(language, contentSample); // LLMを呼び出してタイトルを生成 let generatedTitle: string | undefined; try { generatedTitle = await this.chatService.generateSimpleChat( [{ role: 'user', content: prompt }], kb.userId, kb.tenantId ); } catch (err) { this.logger.warn(`Failed to generate title for document ${kbId} due to LLM configuration issue: ${err.message}`); return null; // Skip title generation if LLM is not configured for this tenant } if (generatedTitle && generatedTitle.trim().length > 0) { // 余分な引用符や改行を除去 const cleanedTitle = generatedTitle.trim().replace(/^["']|["']$/g, '').substring(0, 100); await this.kbRepository.update(kbId, { title: cleanedTitle }); // Elasticsearch のチャンクも更新 await this.elasticsearchService.updateTitleByFileId(kbId, cleanedTitle, tenantId).catch((err) => { this.logger.error(`Failed to update title in Elasticsearch for ${kbId}`, err); }); this.logger.log(`Successfully generated title for ${kbId}: ${cleanedTitle}`); return cleanedTitle; } } catch (error) { this.logger.error(`Failed to generate title for ${kbId}`, error); } return null; } }