| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467 |
- import { Injectable, Logger, NotFoundException, Inject, forwardRef } from '@nestjs/common';
- import { ConfigService } from '@nestjs/config';
- import { I18nService } from '../i18n/i18n.service';
- import { InjectRepository } from '@nestjs/typeorm';
- import { Repository, In } from 'typeorm';
- import { FileStatus, KnowledgeBase, ProcessingMode } from './knowledge-base.entity';
- import { KnowledgeGroup } from '../knowledge-group/knowledge-group.entity';
- import { ElasticsearchService } from '../elasticsearch/elasticsearch.service';
- import { TikaService } from '../tika/tika.service';
- import * as fs from 'fs';
- import * as path from 'path';
- import { EmbeddingService } from './embedding.service';
- import { TextChunkerService } from './text-chunker.service';
- import { ModelConfigService } from '../model-config/model-config.service';
- import { RagService } from '../rag/rag.service';
- import { VisionService } from '../vision/vision.service';
- import { UserSettingService } from '../user-setting/user-setting.service';
- import { MemoryMonitorService } from './memory-monitor.service';
- import { ChunkConfigService } from './chunk-config.service';
- import { VisionPipelineService } from '../vision-pipeline/vision-pipeline.service';
- import { LibreOfficeService } from '../libreoffice/libreoffice.service';
- import { Pdf2ImageService } from '../pdf2image/pdf2image.service';
- import { DOC_EXTENSIONS, IMAGE_EXTENSIONS } from '../common/file-support.constants';
- import { ChatService } from '../chat/chat.service';
- @Injectable()
- export class KnowledgeBaseService {
- private readonly logger = new Logger(KnowledgeBaseService.name);
- constructor(
- @InjectRepository(KnowledgeBase)
- private kbRepository: Repository<KnowledgeBase>,
- @InjectRepository(KnowledgeGroup)
- private groupRepository: Repository<KnowledgeGroup>,
- @Inject(forwardRef(() => ElasticsearchService))
- private elasticsearchService: ElasticsearchService,
- private tikaService: TikaService,
- private embeddingService: EmbeddingService,
- private textChunkerService: TextChunkerService,
- private modelConfigService: ModelConfigService,
- @Inject(forwardRef(() => RagService))
- private ragService: RagService,
- private visionService: VisionService,
- private userSettingService: UserSettingService,
- private memoryMonitor: MemoryMonitorService,
- private chunkConfigService: ChunkConfigService,
- private visionPipelineService: VisionPipelineService,
- private libreOfficeService: LibreOfficeService,
- private pdf2ImageService: Pdf2ImageService,
- private configService: ConfigService,
- private i18nService: I18nService,
- @Inject(forwardRef(() => ChatService))
- private chatService: ChatService,
- ) { }
- async createAndIndex(
- fileInfo: any,
- userId: string,
- tenantId: string,
- config?: any,
- ): Promise<KnowledgeBase> {
- const mode = config?.mode || 'fast';
- const processingMode = mode === 'precise' ? ProcessingMode.PRECISE : ProcessingMode.FAST;
- const kb = this.kbRepository.create({
- originalName: fileInfo.originalname,
- storagePath: fileInfo.path,
- size: fileInfo.size,
- mimetype: fileInfo.mimetype,
- status: FileStatus.PENDING,
- userId: userId,
- tenantId: tenantId,
- chunkSize: config?.chunkSize || 200,
- chunkOverlap: config?.chunkOverlap || 40,
- embeddingModelId: config?.embeddingModelId || null,
- processingMode: processingMode,
- });
- // 分類(グループ)の関連付け
- if (config?.groupIds && config.groupIds.length > 0) {
- const groups = await this.groupRepository.find({
- where: { id: In(config.groupIds), tenantId: tenantId }
- });
- kb.groups = groups;
- }
- const savedKb = await this.kbRepository.save(kb);
- this.logger.log(
- `Created KB record: ${savedKb.id}, mode: ${mode}, file: ${fileInfo.originalname}`
- );
- // ---------------------------------------------------------
- // Move the file to the final partitioned directory
- // source: uploads/{tenantId}/{filename} (or wherever it was)
- // target: uploads/{tenantId}/{savedKb.id}/{filename}
- // ---------------------------------------------------------
- const fs = await import('fs');
- const path = await import('path');
- const uploadPath = process.env.UPLOAD_FILE_PATH || './uploads';
- const targetDir = path.join(uploadPath, tenantId || 'default', savedKb.id);
- const targetPath = path.join(targetDir, fileInfo.filename);
- try {
- if (!fs.existsSync(targetDir)) {
- fs.mkdirSync(targetDir, { recursive: true });
- }
- if (fs.existsSync(fileInfo.path)) {
- fs.renameSync(fileInfo.path, targetPath);
- // Update the DB record with the new path
- savedKb.storagePath = targetPath;
- await this.kbRepository.save(savedKb);
- this.logger.log(`Moved file to partitioned storage: ${targetPath}`);
- }
- } catch (fsError) {
- this.logger.error(`Failed to move file ${savedKb.id} to partitioned storage`, fsError);
- // We will let it continue, but the file might be stuck in the temp/root folder
- }
- // If queue processing is requested, await completion
- if (config?.waitForCompletion) {
- await this.processFile(savedKb.id, userId, tenantId, config);
- } else {
- // Otherwise trigger asynchronously (default)
- this.processFile(savedKb.id, userId, tenantId, config).catch((err) => {
- this.logger.error(`Error processing file ${savedKb.id}`, err);
- });
- }
- return savedKb;
- }
- async findAll(userId: string, tenantId?: string): Promise<KnowledgeBase[]> {
- const where: any = {};
- if (tenantId) {
- where.tenantId = tenantId;
- } else {
- where.userId = userId;
- }
- return this.kbRepository.find({
- where,
- relations: ['groups'], // グループリレーションをロード
- order: { createdAt: 'DESC' },
- });
- }
- async searchKnowledge(userId: string, tenantId: string, query: string, topK: number = 5) {
- try {
- // 環境変数のデフォルト次元数を使用してシミュレーションベクトルを生成
- const defaultDimensions = parseInt(
- process.env.DEFAULT_VECTOR_DIMENSIONS || '2560',
- );
- const mockEmbedding = Array.from(
- { length: defaultDimensions },
- () => Math.random() - 0.5,
- );
- const queryVector = mockEmbedding;
- // 2. Search in Elasticsearch
- const searchResults = await this.elasticsearchService.searchSimilar(
- queryVector,
- userId,
- topK,
- tenantId, // Ensure shared visibility within tenant
- );
- // 3. Get file information from database
- const fileIds = [...new Set(searchResults.map((r) => r.fileId))];
- const files = await this.kbRepository.findByIds(fileIds);
- const fileMap = new Map(files.map((f) => [f.id, f]));
- // 4. Combine results with file info
- const results = searchResults.map((result) => {
- const file = fileMap.get(result.fileId);
- return {
- ...result,
- file: file
- ? {
- id: file.id,
- name: file.originalName,
- mimetype: file.mimetype,
- size: file.size,
- createdAt: file.createdAt,
- }
- : null,
- };
- });
- return {
- query,
- results,
- total: results.length,
- };
- } catch (error) {
- this.logger.error(
- `Metadata search failed for tenant ${tenantId}:`,
- error.stack || error.message,
- );
- throw error;
- }
- }
- async ragSearch(userId: string, tenantId: string, query: string, settings: any) {
- this.logger.log(
- `RAG search request: userId=${userId}, query="${query}", settings=${JSON.stringify(settings)}`,
- );
- try {
- const ragResults = await this.ragService.searchKnowledge(
- query,
- userId,
- settings.topK,
- settings.similarityThreshold,
- settings.selectedEmbeddingId,
- settings.enableFullTextSearch,
- settings.enableRerank,
- settings.selectedRerankId,
- undefined,
- undefined,
- settings.rerankSimilarityThreshold,
- tenantId, // Ensure shared visibility within tenant for RAG
- );
- const sources = this.ragService.extractSources(ragResults);
- const ragPrompt = this.ragService.buildRagPrompt(
- query,
- ragResults,
- settings.language || 'ja',
- );
- const result = {
- searchResults: ragResults,
- sources,
- ragPrompt,
- hasRelevantContent: ragResults.length > 0,
- };
- this.logger.log(
- `RAG search completed: found ${ragResults.length} results`,
- );
- return result;
- } catch (error) {
- this.logger.error(
- `RAG search failed for user ${userId}:`,
- error.stack || error.message,
- );
- // エラーをスローするのではなく空の結果を返し、システムの稼働を継続させる
- return {
- searchResults: [],
- sources: [],
- ragPrompt: query, // オリジナルのクエリを使用
- hasRelevantContent: false,
- };
- }
- }
- async deleteFile(fileId: string, userId: string, tenantId: string): Promise<void> {
- this.logger.log(`Deleting file ${fileId} for user ${userId}`);
- try {
- // 1. Get file info
- const file = await this.kbRepository.findOne({
- where: { id: fileId, tenantId }, // Filter by tenantId
- });
- if (!file) {
- throw new NotFoundException(this.i18nService.getMessage('fileNotFound'));
- }
- // 2. Delete file from filesystem
- const fs = await import('fs');
- try {
- if (fs.existsSync(file.storagePath)) {
- fs.unlinkSync(file.storagePath);
- this.logger.log(`Deleted file: ${file.storagePath}`);
- }
- } catch (error) {
- this.logger.warn(`Failed to delete file ${file.storagePath}:`, error);
- }
- // 3. Delete from Elasticsearch
- try {
- await this.elasticsearchService.deleteByFileId(fileId, userId, tenantId);
- this.logger.log(`Deleted ES documents for file ${fileId}`);
- } catch (error) {
- this.logger.warn(
- `Failed to delete ES documents for file ${fileId}:`,
- error,
- );
- }
- // 4. Remove from all groups (cleanup M2M relations)
- const fileWithGroups = await this.kbRepository.findOne({
- where: { id: fileId, tenantId },
- relations: ['groups'],
- });
- if (fileWithGroups && fileWithGroups.groups && fileWithGroups.groups.length > 0) {
- // Clear groups to remove entries from join table
- fileWithGroups.groups = [];
- await this.kbRepository.save(fileWithGroups);
- this.logger.log(`Cleared group associations for file ${fileId}`);
- }
- // 5. Delete from SQLite
- await this.kbRepository.delete({ id: fileId });
- this.logger.log(`Deleted database record for file ${fileId}`);
- } catch (error) {
- this.logger.error(`Failed to delete file ${fileId}`, error);
- throw error;
- }
- }
- async clearAll(userId: string, tenantId: string): Promise<void> {
- this.logger.log(`Clearing all knowledge base data for user ${userId} in tenant ${tenantId}`);
- try {
- // Get all files and delete them one by one
- const files = await this.kbRepository.find();
- for (const file of files) {
- await this.deleteFile(file.id, userId, tenantId);
- }
- this.logger.log(`Cleared all knowledge base data for user ${userId}`);
- } catch (error) {
- this.logger.error(
- `Failed to clear knowledge base for user ${userId}`,
- error,
- );
- throw error;
- }
- }
- private async processFile(kbId: string, userId: string, tenantId: string, config?: any) {
- this.logger.log(`Starting processing for file ${kbId}, mode: ${config?.mode || 'fast'}`);
- await this.updateStatus(kbId, FileStatus.INDEXING);
- try {
- const kb = await this.kbRepository.findOne({ where: { id: kbId } });
- if (!kb) {
- this.logger.error(`KB not found: ${kbId}`);
- return;
- }
- // メモリ監視 - 処理前チェック
- const memBefore = this.memoryMonitor.getMemoryUsage();
- this.logger.log(`メモリ状態 - 処理前: ${memBefore.heapUsed}/${memBefore.heapTotal}MB`);
- // モードに基づいて処理フローを選択
- const mode = config?.mode || 'fast';
- if (mode === 'precise') {
- // 精密モード - Vision Pipeline を使用
- await this.processPreciseMode(kb, userId, tenantId, config);
- } else {
- // 高速モード - Tika を使用
- await this.processFastMode(kb, userId, tenantId, config);
- }
- this.logger.log(`File ${kbId} processed successfully in ${mode} mode.`);
- } catch (error) {
- this.logger.error(`Failed to process file ${kbId}`, error);
- await this.updateStatus(kbId, FileStatus.FAILED);
- }
- }
- /**
- * 高速モード処理(既存フロー)
- */
- private async processFastMode(kb: KnowledgeBase, userId: string, tenantId: string, config?: any) {
- // 1. Tika を使用してテキストを抽出
- let text = await this.tikaService.extractText(kb.storagePath);
- // 画像ファイルの場合はビジョンモデルを使用
- if (this.visionService.isImageFile(kb.mimetype)) {
- const visionModelId = await this.userSettingService.getVisionModelId(userId);
- if (visionModelId) {
- const visionModel = await this.modelConfigService.findOne(
- visionModelId,
- userId,
- tenantId,
- );
- if (visionModel && visionModel.type === 'vision' && visionModel.isEnabled !== false) {
- text = await this.visionService.extractImageContent(kb.storagePath, {
- baseUrl: visionModel.baseUrl || '',
- apiKey: visionModel.apiKey || '',
- modelId: visionModel.modelId,
- });
- }
- }
- }
- if (!text || text.trim().length === 0) {
- this.logger.warn(this.i18nService.getMessage('noTextExtracted'));
- }
- // テキストサイズを確認
- const textSizeMB = Math.round(text.length / 1024 / 1024);
- if (textSizeMB > 50) {
- this.logger.warn(this.i18nService.formatMessage('extractedTextTooLarge', { size: textSizeMB }));
- }
- // テキストをデータベースに保存
- await this.kbRepository.update(kb.id, { content: text });
- await this.updateStatus(kb.id, FileStatus.EXTRACTED);
- // 非同期ベクトル化
- await this.vectorizeToElasticsearch(kb.id, userId, tenantId, text, config).catch((err) => {
- this.logger.error(`Error vectorizing file ${kb.id}`, err);
- });
- // 自動タイトル生成 (非同期的に実行)
- this.generateTitle(kb.id).catch((err) => {
- this.logger.error(`Error generating title for file ${kb.id}`, err);
- });
- // 非同期的に PDF 変換をトリガー(ドキュメントファイルの場合)
- this.ensurePDFExists(kb.id, userId, tenantId).catch((err) => {
- this.logger.warn(this.i18nService.formatMessage('pdfConversionFailedDetail', { id: kb.id }), err);
- });
- }
- /**
- * 精密モード処理(新規フロー)
- */
- private async processPreciseMode(kb: KnowledgeBase, userId: string, tenantId: string, config?: any) {
- // 精密モードがサポートされているか確認
- const preciseFormats = ['.pdf', '.doc', '.docx', '.ppt', '.pptx'];
- const ext = kb.originalName.toLowerCase().substring(kb.originalName.lastIndexOf('.'));
- if (!preciseFormats.includes(ext)) {
- this.logger.warn(
- this.i18nService.formatMessage('preciseModeUnsupported', { ext })
- );
- return this.processFastMode(kb, userId, tenantId, config);
- }
- // Vision モデルが設定されているか確認
- const visionModelId = await this.userSettingService.getVisionModelId(userId);
- if (!visionModelId) {
- this.logger.warn(
- this.i18nService.getMessage('visionModelNotConfiguredFallback')
- );
- return this.processFastMode(kb, userId, tenantId, config);
- }
- const visionModel = await this.modelConfigService.findOne(
- visionModelId,
- userId,
- tenantId,
- );
- if (!visionModel || visionModel.type !== 'vision' || visionModel.isEnabled === false) {
- this.logger.warn(
- this.i18nService.getMessage('visionModelInvalidFallback')
- );
- return this.processFastMode(kb, userId, tenantId, config);
- }
- // Vision Pipeline を呼び出し
- try {
- const result = await this.visionPipelineService.processPreciseMode(
- kb.storagePath,
- {
- userId,
- tenantId, // New
- modelId: visionModelId,
- fileId: kb.id,
- fileName: kb.originalName,
- skipQualityCheck: false,
- }
- );
- if (!result.success) {
- this.logger.error(`Vision pipeline failed, falling back to fast mode`);
- this.logger.warn(this.i18nService.getMessage('visionPipelineFailed'));
- return this.processFastMode(kb, userId, tenantId, config);
- }
- // テキスト内容をデータベースに保存
- const combinedText = result.results.map(r => r.text).join('\n\n');
- const metadata = {
- processedPages: result.processedPages,
- failedPages: result.failedPages,
- cost: result.cost,
- duration: result.duration,
- results: result.results.map(r => ({
- pageIndex: r.pageIndex,
- confidence: r.confidence,
- layout: r.layout,
- imageCount: r.images.length,
- })),
- };
- await this.kbRepository.update(kb.id, {
- content: combinedText,
- metadata: metadata as any,
- });
- await this.updateStatus(kb.id, FileStatus.EXTRACTED);
- this.logger.log(
- this.i18nService.formatMessage('preciseModeComplete', { pages: result.processedPages, cost: result.cost.toFixed(2) })
- );
- // 非同期でベクトル化し、Elasticsearch にインデックス
- // 各ページを独立したドキュメントとして作成し、メタデータを保持
- this.indexPreciseResults(kb, userId, tenantId, kb.embeddingModelId, result.results).catch((err) => {
- this.logger.error(`Error indexing precise results for ${kb.id}`, err);
- });
- // 非同期で PDF 変換をトリガー
- this.ensurePDFExists(kb.id, userId, tenantId).catch((err) => {
- this.logger.warn(`Initial PDF conversion failed for ${kb.id}`, err);
- });
- // 自動タイトル生成 (非同期的に実行)
- this.generateTitle(kb.id).catch((err) => {
- this.logger.error(`Error generating title for file ${kb.id}`, err);
- });
- } catch (error) {
- this.logger.error(`Vision pipeline error: ${error.message}, falling back to fast mode`);
- return this.processFastMode(kb, userId, tenantId, config);
- }
- }
- /**
- * 精密モードの結果をインデックス
- */
- private async indexPreciseResults(
- kb: KnowledgeBase,
- userId: string,
- tenantId: string,
- embeddingModelId: string,
- results: any[]
- ): Promise<void> {
- this.logger.log(`Indexing ${results.length} precise results for ${kb.id}`);
- // インデックスの存在を確認 - 実際のモデル次元数を取得
- const actualDimensions = await this.getActualModelDimensions(embeddingModelId, userId, tenantId);
- await this.elasticsearchService.createIndexIfNotExists(actualDimensions);
- // ベクトル化とインデックスをバッチ処理
- const batchSize = parseInt(process.env.CHUNK_BATCH_SIZE || '50');
- for (let i = 0; i < results.length; i += batchSize) {
- const batch = results.slice(i, i + batchSize);
- const texts = batch.map(r => r.text);
- try {
- // ベクトルを生成
- const embeddings = await this.embeddingService.getEmbeddings(
- texts,
- userId,
- embeddingModelId
- );
- // 各結果をインデックス
- for (let j = 0; j < batch.length; j++) {
- const result = batch[j];
- const embedding = embeddings[j];
- if (!embedding || embedding.length === 0) {
- this.logger.warn(this.i18nService.formatMessage('skippingEmptyVectorPage', { page: result.pageIndex }));
- continue;
- }
- await this.elasticsearchService.indexDocument(
- `${kb.id}_page_${result.pageIndex}`,
- result.text,
- embedding,
- {
- fileId: kb.id,
- originalName: kb.originalName,
- mimetype: kb.mimetype,
- userId: userId,
- tenantId: tenantId, // New
- pageNumber: result.pageIndex,
- images: result.images,
- layout: result.layout,
- confidence: result.confidence,
- source: 'precise',
- mode: 'vision',
- }
- );
- }
- this.logger.log(`バッチ ${Math.floor(i / batchSize) + 1} 完了: ${batch.length} ページ`);
- } catch (error) {
- this.logger.error(`バッチ ${Math.floor(i / batchSize) + 1} の処理に失敗しました`, error);
- }
- }
- await this.updateStatus(kb.id, FileStatus.VECTORIZED);
- this.logger.log(`精密モードのインデックス完了: ${results.length} ページ`);
- }
- /**
- * PDF の特定ページの画像を取得
- */
- async getPageAsImage(fileId: string, pageIndex: number, userId: string, tenantId: string): Promise<string> {
- const pdfPath = await this.ensurePDFExists(fileId, userId, tenantId);
- // 特定のページを変換
- const result = await this.pdf2ImageService.convertToImages(pdfPath, {
- density: 150,
- quality: 75,
- format: 'jpeg',
- });
- // 対応するページ番号の画像を見つける
- const pageImage = result.images.find(img => img.pageIndex === pageIndex + 1);
- if (!pageImage) {
- throw new NotFoundException(this.i18nService.formatMessage('pageImageNotFoundDetail', { page: pageIndex + 1 }));
- }
- return pageImage.path;
- }
- private async vectorizeToElasticsearch(
- kbId: string,
- userId: string,
- tenantId: string,
- text: string,
- config?: any,
- ) {
- try {
- const kb = await this.kbRepository.findOne({ where: { id: kbId, tenantId } });
- if (!kb) return;
- // メモリ監視 - ベクトル化前チェック
- const memBeforeChunk = this.memoryMonitor.getMemoryUsage();
- this.logger.log(
- `ベクトル化前メモリ: ${memBeforeChunk.heapUsed}/${memBeforeChunk.heapTotal}MB`,
- );
- this.logger.debug(`File ${kbId}: Validating chunk config...`);
- // 1. チャンク設定の検証と修正(モデルの制限と環境変数に基づく)
- const validatedConfig = await this.chunkConfigService.validateChunkConfig(
- kb.chunkSize,
- kb.chunkOverlap,
- kb.embeddingModelId,
- userId,
- );
- this.logger.debug(`File ${kbId}: Chunk config validated.`);
- // 設定が修正された場合、警告を記録しデータベースを更新
- if (validatedConfig.warnings.length > 0) {
- this.logger.warn(
- this.i18nService.formatMessage('chunkConfigCorrection', { warnings: validatedConfig.warnings.join(', ') })
- );
- // データベース内の設定を更新
- if (validatedConfig.chunkSize !== kb.chunkSize ||
- validatedConfig.chunkOverlap !== kb.chunkOverlap) {
- await this.kbRepository.update(kbId, {
- chunkSize: validatedConfig.chunkSize,
- chunkOverlap: validatedConfig.chunkOverlap,
- });
- }
- }
- // 設定サマリーを表示(実際に適用される上限を含む)
- this.logger.debug(`File ${kbId}: Getting config summary...`);
- const configSummary = await this.chunkConfigService.getConfigSummary(
- validatedConfig.chunkSize,
- validatedConfig.chunkOverlap,
- kb.embeddingModelId,
- userId,
- );
- this.logger.log(`チャンク設定: ${configSummary}`);
- this.logger.log(`設定上限: チャンク=${validatedConfig.effectiveMaxChunkSize}, 重複=${validatedConfig.effectiveMaxOverlapSize}`);
- // 2. 検証済みの設定を使用してチャンク分割
- const chunks = this.textChunkerService.chunkText(
- text,
- validatedConfig.chunkSize,
- validatedConfig.chunkOverlap,
- );
- this.logger.log(`ファイル ${kbId} から ${chunks.length} 個のテキストブロックを分割しました`);
- if (chunks.length === 0) {
- this.logger.warn(this.i18nService.formatMessage('noChunksGenerated', { id: kbId }));
- await this.updateStatus(kbId, FileStatus.VECTORIZED);
- return;
- }
- // 3. チャンク数が妥当か確認
- const estimatedChunkCount = this.chunkConfigService.estimateChunkCount(
- text.length,
- validatedConfig.chunkSize,
- );
- if (chunks.length > estimatedChunkCount * 1.2) {
- this.logger.warn(
- this.i18nService.formatMessage('chunkCountAnomaly', { actual: chunks.length, estimated: estimatedChunkCount })
- );
- }
- // 4. 推奨バッチサイズを取得(モデルの制限に基づく)
- const recommendedBatchSize = await this.chunkConfigService.getRecommendedBatchSize(
- kb.embeddingModelId,
- userId,
- tenantId,
- parseInt(process.env.CHUNK_BATCH_SIZE || '100'),
- );
- // 5. メモリ使用量を推定
- const avgChunkSize = chunks.reduce((sum, c) => sum + c.content.length, 0) / chunks.length;
- const estimatedMemory = this.memoryMonitor.estimateMemoryUsage(
- chunks.length,
- avgChunkSize,
- parseInt(process.env.DEFAULT_VECTOR_DIMENSIONS || '2560'),
- );
- this.logger.log(`推定メモリ使用量: ${estimatedMemory}MB (バッチサイズ: ${recommendedBatchSize})`);
- // 6. 実際のモデル次元数を取得し、インデックスの存在を確認
- const actualDimensions = await this.getActualModelDimensions(kb.embeddingModelId, userId, tenantId);
- await this.elasticsearchService.createIndexIfNotExists(actualDimensions);
- // 7. ベクトル化とインデックス作成をバッチ処理
- const useBatching = this.memoryMonitor.shouldUseBatching(
- chunks.length,
- avgChunkSize,
- actualDimensions,
- );
- if (useBatching) {
- try {
- await this.processInBatches(
- chunks,
- async (batch, batchIndex) => {
- // バッチサイズがモデルの制限を超えていないか検証
- if (batch.length > recommendedBatchSize) {
- this.logger.warn(
- this.i18nService.formatMessage('batchSizeExceeded', { index: batchIndex, actual: batch.length, limit: recommendedBatchSize })
- );
- }
- const chunkTexts = batch.map((chunk) => chunk.content);
- const embeddings = await this.embeddingService.getEmbeddings(
- chunkTexts,
- userId,
- kb.embeddingModelId,
- );
- // 次元の整合性を検証
- if (embeddings.length > 0 && embeddings[0].length !== actualDimensions) {
- this.logger.warn(
- `ベクトル次元が不一致です: 期待値 ${actualDimensions}, 実際 ${embeddings[0].length}`
- );
- }
- // このバッチデータを即座にインデックス
- for (let i = 0; i < batch.length; i++) {
- const chunk = batch[i];
- const embedding = embeddings[i];
- if (!embedding || embedding.length === 0) {
- this.logger.warn(this.i18nService.formatMessage('skippingEmptyVectorChunk', { index: chunk.index }));
- continue;
- }
- await this.elasticsearchService.indexDocument(
- `${kb.id}_chunk_${chunk.index}`,
- chunk.content,
- embedding,
- {
- fileId: kb.id,
- originalName: kb.originalName,
- mimetype: kb.mimetype,
- userId: userId,
- chunkIndex: chunk.index,
- startPosition: chunk.startPosition,
- tenantId, // Passing tenantId to ES
- }
- );
- }
- this.logger.log(`バッチ ${batchIndex} 完了: ${batch.length} チャンク`);
- },
- {
- batchSize: recommendedBatchSize,
- onBatchComplete: (batchIndex, totalBatches) => {
- const mem = this.memoryMonitor.getMemoryUsage();
- this.logger.log(
- `バッチ ${batchIndex}/${totalBatches} 完了, メモリ: ${mem.heapUsed}MB`,
- );
- },
- },
- );
- } catch (error) {
- // コンテキスト長エラーを検出(日本語・中国語・英語に対応)
- if (error.message && (error.message.includes('context length') || error.message.includes('コンテキスト長が上限を超えています') || error.message.includes('コンテキスト長が上限を超えています'))) {
- this.logger.warn(this.i18nService.getMessage('contextLengthErrorFallback'));
- // 単一テキスト処理にダウングレード
- for (let i = 0; i < chunks.length; i++) {
- const chunk = chunks[i];
- try {
- const embeddings = await this.embeddingService.getEmbeddings(
- [chunk.content], // 単一テキスト
- userId,
- kb.embeddingModelId,
- );
- if (!embeddings[0] || embeddings[0].length === 0) {
- this.logger.warn(this.i18nService.formatMessage('skippingEmptyVectorChunk', { index: chunk.index }));
- continue;
- }
- await this.elasticsearchService.indexDocument(
- `${kb.id}_chunk_${chunk.index}`,
- chunk.content,
- embeddings[0],
- {
- fileId: kb.id,
- originalName: kb.originalName,
- mimetype: kb.mimetype,
- userId: userId,
- chunkIndex: chunk.index,
- startPosition: chunk.startPosition,
- endPosition: chunk.endPosition,
- tenantId,
- }
- );
- if ((i + 1) % 10 === 0) {
- this.logger.log(`単一処理進捗: ${i + 1}/${chunks.length}`);
- }
- } catch (chunkError) {
- this.logger.error(
- `テキストブロック ${chunk.index} の処理に失敗しました。スキップします: ${chunkError.message}`
- );
- continue;
- }
- }
- this.logger.log(`単一テキスト処理完了: ${chunks.length} チャンク`);
- } else {
- // その他のエラーは直接スロー
- throw error;
- }
- }
- } else {
- // 小さなファイル、一括処理(ただしバッチ制限の確認が必要)
- const chunkTexts = chunks.map((chunk) => chunk.content);
- // チャンク数がモデルのバッチ制限を超える場合は、強制的にバッチ処理
- if (chunks.length > recommendedBatchSize) {
- this.logger.warn(
- this.i18nService.formatMessage('chunkLimitExceededForceBatch', { actual: chunks.length, limit: recommendedBatchSize })
- );
- try {
- await this.processInBatches(
- chunks,
- async (batch, batchIndex) => {
- const batchTexts = batch.map((c) => c.content);
- const embeddings = await this.embeddingService.getEmbeddings(
- batchTexts,
- userId,
- kb.embeddingModelId,
- );
- for (let i = 0; i < batch.length; i++) {
- const chunk = batch[i];
- const embedding = embeddings[i];
- if (!embedding || embedding.length === 0) {
- this.logger.warn(`空ベクトルのテキストブロック ${chunk.index} をスキップします`);
- continue;
- }
- await this.elasticsearchService.indexDocument(
- `${kb.id}_chunk_${chunk.index}`,
- chunk.content,
- embedding,
- {
- fileId: kb.id,
- originalName: kb.originalName,
- mimetype: kb.mimetype,
- userId: userId,
- chunkIndex: chunk.index,
- startPosition: chunk.startPosition,
- endPosition: chunk.endPosition,
- tenantId, // Passing tenantId to ES metadata
- }
- );
- }
- },
- );
- } catch (error) {
- // コンテキスト長エラーを検出(日本語・中国語・英語に対応)
- if (error.message && (error.message.includes('context length') || error.message.includes('コンテキスト長が上限を超えています') || error.message.includes('コンテキスト長が上限を超えています'))) {
- this.logger.warn(this.i18nService.getMessage('batchContextLengthErrorFallback'));
- // 単一テキスト処理にダウングレード
- for (let i = 0; i < chunks.length; i++) {
- const chunk = chunks[i];
- try {
- const embeddings = await this.embeddingService.getEmbeddings(
- [chunk.content], // 単一テキスト
- userId,
- kb.embeddingModelId,
- );
- if (!embeddings[0] || embeddings[0].length === 0) {
- this.logger.warn(this.i18nService.formatMessage('skippingEmptyVectorChunk', { index: chunk.index }));
- continue;
- }
- await this.elasticsearchService.indexDocument(
- `${kb.id}_chunk_${chunk.index}`,
- chunk.content,
- embeddings[0],
- {
- fileId: kb.id,
- originalName: kb.originalName,
- mimetype: kb.mimetype,
- userId: userId,
- tenantId, // Added tenantId
- chunkIndex: chunk.index,
- startPosition: chunk.startPosition,
- endPosition: chunk.endPosition,
- },
- );
- if ((i + 1) % 10 === 0) {
- this.logger.log(`単一処理進捗: ${i + 1}/${chunks.length}`);
- }
- } catch (chunkError) {
- this.logger.error(
- this.i18nService.formatMessage('chunkProcessingFailed', { index: chunk.index, message: chunkError.message })
- );
- continue;
- }
- }
- this.logger.log(this.i18nService.formatMessage('singleTextProcessingComplete', { count: chunks.length }));
- } else {
- // その他のエラー、直接スロー
- throw error;
- }
- }
- } else {
- // 十分に小さいファイルの場合は一括で処理
- try {
- const embeddings = await this.embeddingService.getEmbeddings(
- chunkTexts,
- userId,
- kb.embeddingModelId,
- );
- for (let i = 0; i < chunks.length; i++) {
- const chunk = chunks[i];
- const embedding = embeddings[i];
- if (!embedding || embedding.length === 0) {
- this.logger.warn(this.i18nService.formatMessage('skippingEmptyVectorChunk', { index: chunk.index }));
- continue;
- }
- await this.elasticsearchService.indexDocument(
- `${kb.id}_chunk_${chunk.index}`,
- chunk.content,
- embedding,
- {
- fileId: kb.id,
- originalName: kb.originalName,
- mimetype: kb.mimetype,
- userId: userId,
- tenantId, // Added tenantId
- chunkIndex: chunk.index,
- startPosition: chunk.startPosition,
- endPosition: chunk.endPosition,
- },
- );
- }
- } catch (error) {
- // コンテキスト長エラーを検出(日本語・中国語・英語に対応)
- if (error.message && (error.message.includes('context length') || error.message.includes('コンテキスト長が上限を超えています') || error.message.includes('コンテキスト長が上限を超えています'))) {
- this.logger.warn(this.i18nService.getMessage('batchContextLengthErrorFallback'));
- // 単一テキスト処理にダウングレード
- for (let i = 0; i < chunks.length; i++) {
- const chunk = chunks[i];
- try {
- const embeddings = await this.embeddingService.getEmbeddings(
- [chunk.content], // 単一テキスト
- userId,
- kb.embeddingModelId,
- );
- if (!embeddings[0] || embeddings[0].length === 0) {
- this.logger.warn(`空ベクトルのテキストブロック ${chunk.index} をスキップします`);
- continue;
- }
- await this.elasticsearchService.indexDocument(
- `${kb.id}_chunk_${chunk.index}`,
- chunk.content,
- embeddings[0],
- {
- fileId: kb.id,
- originalName: kb.originalName,
- mimetype: kb.mimetype,
- userId: userId,
- tenantId, // Added tenantId
- chunkIndex: chunk.index,
- startPosition: chunk.startPosition,
- endPosition: chunk.endPosition,
- },
- );
- if ((i + 1) % 10 === 0) {
- this.logger.log(`単一処理進捗: ${i + 1}/${chunks.length}`);
- }
- } catch (chunkError) {
- this.logger.error(
- `テキストブロック ${chunk.index} の処理に失敗しました。スキップします: ${chunkError.message}`
- );
- continue;
- }
- }
- this.logger.log(this.i18nService.formatMessage('singleTextProcessingComplete', { count: chunks.length }));
- } else {
- // その他のエラー、直接スロー
- throw error;
- }
- }
- }
- }
- await this.updateStatus(kbId, FileStatus.VECTORIZED);
- const memAfter = this.memoryMonitor.getMemoryUsage();
- this.logger.log(
- this.i18nService.formatMessage('fileVectorizationComplete', { id: kbId, count: chunks.length, memory: memAfter.heapUsed })
- );
- } catch (error) {
- this.logger.error(this.i18nService.formatMessage('fileVectorizationFailed', { id: kbId }), error);
- // エラー情報を metadata に保存
- try {
- const kb = await this.kbRepository.findOne({ where: { id: kbId } });
- if (kb) {
- const metadata = kb.metadata || {};
- metadata.lastError = error.message;
- metadata.failedAt = new Date().toISOString();
- await this.kbRepository.update(kbId, { metadata });
- }
- } catch (e) {
- this.logger.warn(`Failed to update metadata for failed file ${kbId}`, e);
- }
- await this.updateStatus(kbId, FileStatus.FAILED);
- }
- }
- /**
- * バッチ処理、メモリ制御付き
- */
- private async processInBatches<T>(
- items: T[],
- processor: (batch: T[], batchIndex: number) => Promise<void>,
- options?: {
- batchSize?: number;
- onBatchComplete?: (batchIndex: number, totalBatches: number) => void;
- },
- ): Promise<void> {
- const totalItems = items.length;
- if (totalItems === 0) return;
- const startTime = Date.now();
- this.logger.log(this.i18nService.formatMessage('batchProcessingStarted', { count: totalItems }));
- // Use provided batch size or fallback to env/default
- const initialBatchSize = options?.batchSize || parseInt(process.env.CHUNK_BATCH_SIZE || '100');
- const totalBatches = Math.ceil(totalItems / initialBatchSize);
- for (let i = 0; i < totalItems;) {
- // メモリを確認し待機
- await this.memoryMonitor.waitForMemoryAvailable();
- // バッチサイズを動的に調整 (initialBatchSize から開始し、必要に応じてメモリモニターが削減できるようにします)
- // 注意: memoryMonitor.getDynamicBatchSize はメモリ状況に基づいてより大きな値を返す可能性がありますが、
- // モデルの制限 (initialBatchSize) を尊重する必要があります。
- const currentMem = this.memoryMonitor.getMemoryUsage().heapUsed;
- const dynamicBatchSize = this.memoryMonitor.getDynamicBatchSize(currentMem);
- // Ensure we don't exceed the model's limit (initialBatchSize) even if memory allows more
- const batchSize = Math.min(dynamicBatchSize, initialBatchSize);
- // 現在のバッチを取得
- const batch = items.slice(i, i + batchSize);
- const batchIndex = Math.floor(i / batchSize) + 1;
- this.logger.log(
- this.i18nService.formatMessage('batchProcessingProgress', { index: batchIndex, total: totalBatches, count: batch.length })
- );
- // バッチを処理
- await processor(batch, batchIndex);
- // コールバック通知
- if (options?.onBatchComplete) {
- options.onBatchComplete(batchIndex, totalBatches);
- }
- // 強制GC(メモリがしきい値に近い場合)
- if (currentMem > 800) {
- this.memoryMonitor.forceGC();
- }
- // 参照をクリアしGCを助ける
- batch.length = 0;
- i += batchSize;
- }
- const duration = ((Date.now() - startTime) / 1000).toFixed(2);
- this.logger.log(this.i18nService.formatMessage('batchProcessingComplete', { count: totalItems, duration }));
- }
- /**
- * 失敗したファイルのベクトル化を再試行
- */
- async retryFailedFile(fileId: string, userId: string, tenantId: string): Promise<KnowledgeBase> {
- this.logger.log(`Retrying failed file ${fileId} for user ${userId} in tenant ${tenantId}`);
- // 1. Get file with tenant restriction
- const kb = await this.kbRepository.findOne({
- where: { id: fileId, tenantId },
- });
- if (!kb) {
- throw new NotFoundException('ファイルが存在しません');
- }
- if (kb.status !== FileStatus.FAILED) {
- throw new Error(this.i18nService.formatMessage('onlyFailedFilesRetryable', { status: kb.status }));
- }
- if (!kb.content || kb.content.trim().length === 0) {
- throw new Error(this.i18nService.getMessage('emptyFileRetryFailed'));
- }
- // 2. ステータスを INDEXING にリセット
- await this.updateStatus(fileId, FileStatus.INDEXING);
- // 3. 非同期でベクトル化をトリガー(既存ロジックを再利用)
- this.vectorizeToElasticsearch(
- fileId,
- userId,
- tenantId,
- kb.content,
- {
- chunkSize: kb.chunkSize,
- chunkOverlap: kb.chunkOverlap,
- embeddingModelId: kb.embeddingModelId,
- }
- ).catch((err) => {
- this.logger.error(`Retry vectorization failed for file ${fileId}`, err);
- });
- // 4. 更新後のファイルステータスを返却
- const updatedKb = await this.kbRepository.findOne({ where: { id: fileId, tenantId } });
- if (!updatedKb) {
- throw new NotFoundException('ファイルが存在しません');
- }
- return updatedKb;
- }
- /**
- * ファイルのすべてのチャンク情報を取得
- */
- async getFileChunks(fileId: string, userId: string, tenantId: string) {
- this.logger.log(`Getting chunks for file ${fileId}, user ${userId}, tenant ${tenantId}`);
- // 1. Get file with tenant check
- const kb = await this.kbRepository.findOne({
- where: { id: fileId, tenantId },
- });
- if (!kb) {
- throw new NotFoundException('ファイルが存在しません');
- }
- // 2. Elasticsearch からすべてのチャンクを取得
- const chunks = await this.elasticsearchService.getFileChunks(fileId, userId, tenantId);
- // 3. チャンク情報を返却
- return {
- fileId: kb.id,
- fileName: kb.originalName,
- totalChunks: chunks.length,
- chunkSize: kb.chunkSize,
- chunkOverlap: kb.chunkOverlap,
- chunks: chunks.map(chunk => ({
- index: chunk.chunkIndex,
- content: chunk.content,
- contentLength: chunk.content.length,
- startPosition: chunk.startPosition,
- endPosition: chunk.endPosition,
- })),
- };
- }
- private async updateStatus(id: string, status: FileStatus) {
- await this.kbRepository.update(id, { status });
- }
- // PDF プレビュー関連メソッド
- async ensurePDFExists(fileId: string, userId: string, tenantId: string, force: boolean = false): Promise<string> {
- const kb = await this.kbRepository.findOne({
- where: { id: fileId, tenantId },
- });
- if (!kb) {
- throw new NotFoundException(this.i18nService.getMessage('fileNotFound'));
- }
- // 元ファイルが PDF の場合は、元ファイルのパスを直接返す
- if (kb.mimetype === 'application/pdf') {
- return kb.storagePath;
- }
- // プレビュー変換に対応しているか確認(ドキュメント類または画像類のみ許可)
- const ext = kb.originalName.toLowerCase().split('.').pop() || '';
- const isConvertible = [...DOC_EXTENSIONS, ...IMAGE_EXTENSIONS].includes(ext);
- if (!isConvertible) {
- this.logger.log(`Skipping PDF conversion for unsupported format: .${ext} (${kb.originalName})`);
- throw new Error(this.i18nService.getMessage('pdfPreviewNotSupported'));
- }
- // PDF フィールドパスを生成
- const path = await import('path');
- const fs = await import('fs');
- const uploadDir = path.dirname(kb.storagePath);
- const baseName = path.basename(kb.storagePath, path.extname(kb.storagePath));
- const pdfPath = path.join(uploadDir, `${baseName}.pdf`);
- // 強制再生成が指定され、ファイルが存在する場合は削除
- if (force && fs.existsSync(pdfPath)) {
- try {
- fs.unlinkSync(pdfPath);
- this.logger.log(`Forced regeneration: Deleted existing PDF ${pdfPath}`);
- } catch (e) {
- this.logger.warn(`Failed to delete existing PDF for regeneration: ${e.message}`);
- }
- }
- // 変換済みかつ強制再生成が不要か確認
- if (fs.existsSync(pdfPath) && !force) {
- if (!kb.pdfPath) {
- await this.kbRepository.update(kb.id, { pdfPath: pdfPath });
- }
- return pdfPath;
- }
- // PDF への変換が必要
- try {
- this.logger.log(`Starting PDF conversion for ${kb.originalName} at ${kb.storagePath}`);
- // ファイルを変換
- await this.libreOfficeService.convertToPDF(kb.storagePath);
- // 変換結果を確認
- if (!fs.existsSync(pdfPath)) {
- throw new Error(`PDF conversion completed but file not found at ${pdfPath}`);
- }
- const stats = fs.statSync(pdfPath);
- if (stats.size === 0) {
- fs.unlinkSync(pdfPath);
- throw new Error(`PDF conversion failed: output file is empty`);
- }
- await this.kbRepository.update(kb.id, { pdfPath: pdfPath });
- this.logger.log(`PDF conversion successful: ${pdfPath}`);
- return pdfPath;
- } catch (error) {
- this.logger.error(`PDF conversion failed for ${fileId}: ${error.message}`, error.stack);
- throw new Error(this.i18nService.formatMessage('pdfConversionFailedDetail', { id: fileId }));
- }
- }
- async getPDFStatus(fileId: string, userId: string, tenantId: string) {
- const kb = await this.kbRepository.findOne({
- where: { id: fileId, tenantId },
- });
- if (!kb) {
- throw new NotFoundException(this.i18nService.getMessage('fileNotFound'));
- }
- // 元ファイルが PDF の場合
- if (kb.mimetype === 'application/pdf') {
- const token = this.generateTempToken(fileId, userId, tenantId);
- return {
- status: 'ready',
- url: `/api/knowledge-bases/${fileId}/pdf?token=${token}`,
- };
- }
- // PDF ファイルパスを生成
- const path = await import('path');
- const fs = await import('fs');
- const uploadDir = path.dirname(kb.storagePath);
- const baseName = path.basename(kb.storagePath, path.extname(kb.storagePath));
- const pdfPath = path.join(uploadDir, `${baseName}.pdf`);
- // 変換済みか確認
- if (fs.existsSync(pdfPath)) {
- if (!kb.pdfPath) {
- kb.pdfPath = pdfPath;
- await this.kbRepository.save(kb);
- }
- const token = this.generateTempToken(fileId, userId, tenantId);
- return {
- status: 'ready',
- url: `/api/knowledge-bases/${fileId}/pdf?token=${token}`,
- };
- }
- // 変換が必要
- return {
- status: 'pending',
- };
- }
- private generateTempToken(fileId: string, userId: string, tenantId: string): string {
- const jwt = require('jsonwebtoken');
- const secret = process.env.JWT_SECRET;
- if (!secret) {
- throw new Error('JWT_SECRET environment variable is required but not set');
- }
- return jwt.sign(
- { fileId, userId, tenantId, type: 'pdf-access' },
- secret,
- { expiresIn: '1h' }
- );
- }
- /**
- * モデルの実際の次元数を取得(キャッシュ確認とプローブロジック付き)
- */
- private async getActualModelDimensions(embeddingModelId: string, userId: string, tenantId: string): Promise<number> {
- const defaultDimensions = parseInt(
- process.env.DEFAULT_VECTOR_DIMENSIONS || '2560',
- );
- try {
- // 1. モデル設定から優先的に取得
- const modelConfig = await this.modelConfigService.findOne(
- embeddingModelId,
- userId,
- tenantId,
- );
- if (modelConfig && modelConfig.dimensions) {
- this.logger.log(`設定から ${modelConfig.name} の次元数を取得しました: ${modelConfig.dimensions}`);
- return modelConfig.dimensions;
- }
- // 2. それ以外の場合はプローブにより取得
- this.logger.log(`モデル次元数をプローブ中: ${embeddingModelId}`);
- const probeEmbeddings = await this.embeddingService.getEmbeddings(
- ['probe'],
- userId,
- embeddingModelId,
- );
- if (probeEmbeddings.length > 0) {
- const actualDimensions = probeEmbeddings[0].length;
- this.logger.log(`モデルの実際の次元数を検出しました: ${actualDimensions}`);
- // 次回利用のためにモデル設定を更新
- if (modelConfig) {
- try {
- await this.modelConfigService.update(userId, tenantId, modelConfig.id, {
- dimensions: actualDimensions,
- });
- this.logger.log(`モデル ${modelConfig.name} の次元数設定を ${actualDimensions} に更新しました`);
- } catch (updateErr) {
- this.logger.warn(`モデル次元数設定の更新に失敗しました: ${updateErr.message}`);
- }
- }
- return actualDimensions;
- }
- } catch (err) {
- this.logger.warn(
- `次元数の取得に失敗しました。デフォルト次元数を使用します: ${defaultDimensions}`,
- err.message,
- );
- }
- return defaultDimensions;
- }
- /**
- * AIを使用して文書のタイトルを自動生成する
- */
- async generateTitle(kbId: string): Promise<string | null> {
- this.logger.log(`Generating automatic title for file ${kbId}`);
- try {
- const kb = await this.kbRepository.findOne({ where: { id: kbId } });
- if (!kb || !kb.content || kb.content.trim().length === 0) {
- return null;
- }
- const tenantId = kb.tenantId;
- // すでにタイトルがある場合はスキップ
- if (kb.title) {
- return kb.title;
- }
- // コンテンツの冒頭サンプルを取得(最大2500文字)
- const contentSample = kb.content.substring(0, 2500);
- // ユーザー設定から言語を取得、またはデフォルトを使用
- const settings = await this.userSettingService.findOrCreate(kb.userId);
- const language = settings.language || 'ja';
- // プロンプトを構築
- const prompt = this.i18nService.getDocumentTitlePrompt(language, contentSample);
- // LLMを呼び出してタイトルを生成
- let generatedTitle: string | undefined;
- try {
- generatedTitle = await this.chatService.generateSimpleChat(
- [{ role: 'user', content: prompt }],
- kb.userId,
- kb.tenantId
- );
- } catch (err) {
- this.logger.warn(`Failed to generate title for document ${kbId} due to LLM configuration issue: ${err.message}`);
- return null; // Skip title generation if LLM is not configured for this tenant
- }
- if (generatedTitle && generatedTitle.trim().length > 0) {
- // 余分な引用符や改行を除去
- const cleanedTitle = generatedTitle.trim().replace(/^["']|["']$/g, '').substring(0, 100);
- await this.kbRepository.update(kbId, { title: cleanedTitle });
- // Elasticsearch のチャンクも更新
- await this.elasticsearchService.updateTitleByFileId(kbId, cleanedTitle, tenantId).catch((err) => {
- this.logger.error(`Failed to update title in Elasticsearch for ${kbId}`, err);
- });
- this.logger.log(`Successfully generated title for ${kbId}: ${cleanedTitle}`);
- return cleanedTitle;
- }
- } catch (error) {
- this.logger.error(`Failed to generate title for ${kbId}`, error);
- }
- return null;
- }
- }
|