import { Injectable, Logger, OnModuleInit } from '@nestjs/common'; import { ConfigService } from '@nestjs/config'; import { Client } from '@elastic/elasticsearch'; @Injectable() export class ElasticsearchService implements OnModuleInit { public readonly client: Client; private readonly logger = new Logger(ElasticsearchService.name); private readonly indexName: string; constructor( private configService: ConfigService, ) { const node = this.configService.get('ELASTICSEARCH_HOST'); // Changed from NODE to HOST this.indexName = this.configService.get( 'ELASTICSEARCH_INDEX', 'knowledge_base', ); if (!node) { throw new Error('ELASTICSEARCH_HOST environment variable not set.'); } this.client = new Client({ node, }); } async onModuleInit() { try { const health = await this.client.cluster.health(); this.logger.log(`Elasticsearch cluster health is: ${health.status}`); // 初期化時にはインデックスを作成せず、実際の使用時にモデルに基づいて動的に作成されるのを待つ } catch (error) { this.logger.error('Failed to connect to Elasticsearch', error); } } async createIndexIfNotExists(vectorDimensions: number) { const indexExists = await this.client.indices.exists({ index: this.indexName, }); if (!indexExists) { this.logger.log( `インデックス ${this.indexName} を作成します。ベクトル次元数: ${vectorDimensions}`, ); await this.createIndex(vectorDimensions); } else { // 既存インデックスのベクトル次元数を確認 const mapping = await this.client.indices.getMapping({ index: this.indexName, }); const vectorMapping = mapping[this.indexName]?.mappings?.properties ?.vector as any; const existingDims = vectorMapping?.dims; if (existingDims && existingDims !== vectorDimensions) { this.logger.warn( `インデックス ${this.indexName} のベクトル次元数 ${existingDims} が現在のモデル次元数 ${vectorDimensions} と一致しません。`, ); this.logger.warn(`原因: 異なる次元数の埋め込みモデルに変更された可能性があります。システムは自動的にインデックスを再作成します。`); // 既存インデックスを削除して再作成 await this.client.indices.delete({ index: this.indexName }); this.logger.log(`旧インデックスを正常に削除しました: ${this.indexName}`); await this.createIndex(vectorDimensions); this.logger.log(`インデックスを再作成しました: ${this.indexName} (次元数: ${vectorDimensions})`); } else { this.logger.log( `インデックス ${this.indexName} は既に存在します。ベクトル次元数: ${existingDims || '未知'}`, ); } } } async indexDocument( documentId: string, content: string, vector: number[], metadata: any, ) { this.logger.log( `Indexing document ${documentId}: content=${content.length} chars, vector=${vector?.length} dims`, ); if (!vector || vector.length === 0) { this.logger.error(`Invalid vector for document ${documentId}`); throw new Error('Vector is required for indexing'); } const document = { content, vector, fileId: metadata.fileId, fileName: metadata.originalName, title: metadata.title || metadata.originalName, fileMimeType: metadata.mimetype, chunkIndex: metadata.chunkIndex, startPosition: metadata.startPosition, endPosition: metadata.endPosition, userId: metadata.userId, tenantId: metadata.tenantId, createdAt: new Date(), }; const result = await this.client.index({ index: this.indexName, id: documentId, document, }); this.logger.log( `Indexed document ${documentId} with ${vector.length}D vector`, ); return result; } async deleteByFileId(fileId: string, userId: string, tenantId?: string) { const filter: any[] = [{ term: { fileId } }]; if (tenantId) { filter.push({ term: { tenantId } }); } else { filter.push({ term: { userId } }); } await this.client.deleteByQuery({ index: this.indexName, query: { bool: { filter }, }, }); } async updateTitleByFileId(fileId: string, title: string, tenantId?: string) { const filter: any[] = [{ term: { fileId } }]; if (tenantId) { filter.push({ term: { tenantId } }); } await this.client.updateByQuery({ index: this.indexName, query: { bool: { filter }, }, script: { source: 'ctx._source.title = params.title', params: { title }, }, refresh: true, // 即座に検索に反映させる }); } async deleteByUserId(userId: string) { // Note: This method should likely only be used by admin functionality // since it deletes all data for a user await this.client.deleteByQuery({ index: this.indexName, query: { term: { userId }, }, }); } async searchSimilar(queryVector: number[], userId: string, topK: number = 5, tenantId?: string) { try { this.logger.log( `Vector search: userId=${userId}, vectorDim=${queryVector?.length}, topK=${topK}`, ); if (!queryVector || queryVector.length === 0) { this.logger.warn('Empty query vector provided'); return []; } const filterClauses: any[] = []; if (tenantId) { filterClauses.push({ term: { tenantId } }); } else { filterClauses.push({ term: { userId } }); } const response = await this.client.search({ index: this.indexName, knn: { field: 'vector', query_vector: queryVector, k: topK, num_candidates: topK * 2, filter: { bool: { must: filterClauses } }, }, size: topK, _source: { excludes: ['vector'], }, }); const results = response.hits.hits.map((hit: any) => ({ id: hit._id, score: this.normalizeScore(hit._score), // スコアの正規化 content: hit._source?.content, fileId: hit._source?.fileId, fileName: hit._source?.fileName, title: hit._source?.title, chunkIndex: hit._source?.chunkIndex, startPosition: hit._source?.startPosition, endPosition: hit._source?.endPosition, })); this.logger.log( `Vector search completed: found ${results.length} results`, ); return results; } catch (error) { this.logger.error('Vector search failed:', error); return []; } } async searchFullText(query: string, userId: string, topK: number = 5, tenantId?: string) { try { this.logger.log( `Full-text search: userId=${userId}, query="${query}", topK=${topK}`, ); if (!query || query.trim().length === 0) { this.logger.warn('Empty query provided for full-text search'); return []; } const filterClauses: any[] = []; if (tenantId) { filterClauses.push({ term: { tenantId } }); } else { filterClauses.push({ term: { userId } }); } const response = await this.client.search({ index: this.indexName, query: { bool: { must: { match: { content: { query: query, fuzziness: 'AUTO', }, }, }, filter: filterClauses, }, }, size: topK, _source: { excludes: ['vector'], }, }); const results = response.hits.hits.map((hit: any) => ({ id: hit._id, score: this.normalizeScore(hit._score), // スコアの正規化 content: hit._source?.content, fileId: hit._source?.fileId, fileName: hit._source?.fileName, title: hit._source?.title, chunkIndex: hit._source?.chunkIndex, startPosition: hit._source?.startPosition, endPosition: hit._source?.endPosition, })); this.logger.log( `Full-text search completed: found ${results.length} results`, ); return results; } catch (error) { this.logger.error('Full-text search failed:', error); return []; } } async hybridSearch( queryVector: number[], query: string, userId: string, topK: number = 5, vectorWeight: number = 0.7, selectedGroups?: string[], // 後方互換性のために残す(未使用) explicitFileIds?: string[], // 明示的に指定されたファイルIDリスト tenantId?: string, ) { // selectedGroups は廃止予定。呼び出し側で fileIds に変換して explicitFileIds を使用してください const fileIds = explicitFileIds; if (fileIds && fileIds.length === 0) { this.logger.log('検索対象ファイルが0件のため、検索をスキップします'); return []; } if (fileIds) { this.logger.log(`最終検索対象ファイル範囲: ${fileIds.length} 個のファイル`); } // ハイブリッド検索:ベクトル検索 + 全文検索 const [vectorResults, textResults] = await Promise.all([ this.searchSimilarWithFileFilter(queryVector, userId, topK, fileIds, tenantId), this.searchFullTextWithFileFilter(query, userId, topK, fileIds, tenantId), ]); // 結果をマージして重複を排除 const combinedResults = new Map(); // 向量搜索結果を追加 vectorResults.forEach((result) => { combinedResults.set(result.id, { ...result, vectorScore: result.score, textScore: 0, combinedScore: result.score * vectorWeight, }); }); // 全文検索結果を追加 textResults.forEach((result) => { if (combinedResults.has(result.id)) { const existing = combinedResults.get(result.id); existing.textScore = result.score; existing.combinedScore = existing.vectorScore * vectorWeight + result.score * (1 - vectorWeight); } else { combinedResults.set(result.id, { ...result, vectorScore: 0, textScore: result.score, combinedScore: result.score * (1 - vectorWeight), }); } }); // 正規化のためにすべての組み合わせスコアを取得 const allScores = Array.from(combinedResults.values()).map(r => r.combinedScore); const maxScore = Math.max(...allScores, 1); // ゼロ除算を避けるため最小1 const minScore = Math.min(...allScores); // 総合スコアでソートして上位 topK の結果を返す return Array.from(combinedResults.values()) .sort((a, b) => b.combinedScore - a.combinedScore) .slice(0, topK) .map((result) => { // combinedScoreは既に0-1の範囲にあるため、追加の正規化は不要 // 0-1の範囲にスコアを保つことで、実際の類似度を正確に反映 let finalScore = result.combinedScore; // スコアが0-1の範囲内に収まるようにクリップ finalScore = Math.max(0, Math.min(1.0, finalScore)); return { id: result.id, score: finalScore, content: result.content, fileId: result.fileId, fileName: result.fileName, title: result.title, chunkIndex: result.chunkIndex, startPosition: result.startPosition, endPosition: result.endPosition, }; }); } private async createIndex(vectorDimensions: number) { const mappings: any = { properties: { // チャンク内容 content: { type: 'text', analyzer: 'standard', }, // ベクトルデータ vector: { type: 'dense_vector', dims: vectorDimensions, index: true, similarity: 'cosine', }, // ファイル関連情報 fileId: { type: 'keyword' }, fileName: { type: 'keyword' }, title: { type: 'text' }, fileMimeType: { type: 'keyword' }, // チャンク情報 chunkIndex: { type: 'integer' }, startPosition: { type: 'integer' }, endPosition: { type: 'integer' }, // ユーザー情報 userId: { type: 'keyword' }, // テナント情報(マルチテナント分離用) tenantId: { type: 'keyword' }, // タイムスタンプ createdAt: { type: 'date' }, }, }; await this.client.indices.create({ index: this.indexName, mappings, }); this.logger.log( `インデックス ${this.indexName} を正常に作成しました。ベクトル次元数: ${vectorDimensions}`, ); } /** * Elasticsearch スコアを 0-1 の範囲に正規化する * Elasticsearch のスコアは 1.0 を超える可能性があるため、正規化が必要 * ただし、kNN検索の類似度スコアは既に0-1の範囲にある(cosine similarity)ので、 * 特別な正規化は不要。必要に応じて最小値保護のみ行う。 */ private normalizeScore(rawScore: number): number { if (!rawScore || rawScore <= 0) return 0; // 最小値は0 // kNN検索の場合は既に0-1の範囲にあるので、1を超えないようにクリップ // cosine similarityの最大値は1なので、1以上になった場合は1とする return Math.min(1.0, rawScore); } // ファイルフィルタ付きのベクトル検索 private async searchSimilarWithFileFilter( queryVector: number[], userId: string, topK: number = 5, fileIds?: string[], tenantId?: string, ) { try { this.logger.log( `Vector search with filter: userId=${userId}, tenantId=${tenantId}, vectorDim=${queryVector?.length}, topK=${topK}, fileIds=${fileIds?.length || 'all'}`, ); if (!queryVector || queryVector.length === 0) { this.logger.warn('Empty query vector provided'); return []; } if (fileIds && fileIds.length === 0) { this.logger.log('Filter resulted in 0 files, returning empty results for vector search'); return []; } const filterClauses: any[] = []; if (fileIds && fileIds.length > 0) { filterClauses.push({ terms: { fileId: fileIds } }); } // Tenant isolation: when tenantId is provided, enforce it if (tenantId) { filterClauses.push({ term: { tenantId } }); } else { // Legacy: fall back to userId-based filter filterClauses.push({ term: { userId } }); } const filter = filterClauses.length > 0 ? { bool: { must: filterClauses } } : undefined; const queryBody: any = { index: this.indexName, knn: { field: 'vector', query_vector: queryVector, k: topK, num_candidates: topK * 2, }, size: topK, _source: { excludes: ['vector'], }, }; if (filter && Object.keys(filter).length > 0) { queryBody.knn.filter = filter; } const response = await this.client.search(queryBody); const results = response.hits.hits.map((hit: any) => ({ id: hit._id, score: this.normalizeScore(hit._score), content: hit._source?.content, fileId: hit._source?.fileId, fileName: hit._source?.fileName, title: hit._source?.title, chunkIndex: hit._source?.chunkIndex, startPosition: hit._source?.startPosition, endPosition: hit._source?.endPosition, })); this.logger.log( `Vector search completed: found ${results.length} results`, ); return results; } catch (error) { this.logger.error('Vector search failed:', error); return []; } } // ファイルフィルタ付きの全文検索 private async searchFullTextWithFileFilter( query: string, userId: string, topK: number = 5, fileIds?: string[], tenantId?: string, ) { try { this.logger.log( `Full-text search with filter: userId=${userId}, query="${query}", topK=${topK}, fileIds=${fileIds?.length || 'all'}`, ); if (!query || query.trim().length === 0) { this.logger.warn('Empty query provided for full-text search'); return []; } if (fileIds && fileIds.length === 0) { this.logger.log('Filter resulted in 0 files, returning empty results for full-text search'); return []; } const mustClause: any[] = [ { match: { content: { query: query, fuzziness: 'AUTO', }, }, }, ]; const filter: any[] = []; if (fileIds && fileIds.length > 0) { filter.push({ terms: { fileId: fileIds } }); } if (tenantId) { filter.push({ term: { tenantId } }); } else { filter.push({ term: { userId } }); } const queryBody: any = { index: this.indexName, query: { bool: { must: mustClause, filter: filter, }, }, size: topK, _source: { excludes: ['vector'], }, }; const response = await this.client.search(queryBody); const results = response.hits.hits.map((hit: any) => ({ id: hit._id, score: this.normalizeScore(hit._score), content: hit._source?.content, fileId: hit._source?.fileId, fileName: hit._source?.fileName, title: hit._source?.title, chunkIndex: hit._source?.chunkIndex, startPosition: hit._source?.startPosition, endPosition: hit._source?.endPosition, })); this.logger.log( `Full-text search completed: found ${results.length} results`, ); return results; } catch (error) { this.logger.error('Full-text search failed:', error); return []; } } /** * 指定されたファイルのすべてのチャンクを取得 */ async getFileChunks(fileId: string, userId: string, tenantId?: string) { try { this.logger.log(`Getting chunks for file ${fileId}`); const filter: any[] = [{ term: { fileId } }]; if (tenantId) { filter.push({ term: { tenantId } }); } else { filter.push({ term: { userId } }); } const response = await this.client.search({ index: this.indexName, query: { bool: { filter }, }, sort: [{ chunkIndex: 'asc' }], size: 10000, // 単一ファイルが 10000 チャンクを超えないと想定 _source: { excludes: ['vector'], // 転送量を減らすため、ベクトルデータは返さない }, }); const chunks = response.hits.hits.map((hit: any) => ({ id: hit._id, chunkIndex: hit._source.chunkIndex, content: hit._source.content, startPosition: hit._source.startPosition, endPosition: hit._source.endPosition, fileName: hit._source.fileName, })); this.logger.log(`Found ${chunks.length} chunks for file ${fileId}`); return chunks; } catch (error) { this.logger.error(`Failed to get chunks for file ${fileId}`, error); return []; } } }