| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633 |
- import { Injectable, Logger, OnModuleInit } from '@nestjs/common';
- import { ConfigService } from '@nestjs/config';
- import { Client } from '@elastic/elasticsearch';
- @Injectable()
- export class ElasticsearchService implements OnModuleInit {
- public readonly client: Client;
- private readonly logger = new Logger(ElasticsearchService.name);
- private readonly indexName: string;
- constructor(
- private configService: ConfigService,
- ) {
- const node = this.configService.get<string>('ELASTICSEARCH_HOST'); // Changed from NODE to HOST
- this.indexName = this.configService.get<string>(
- 'ELASTICSEARCH_INDEX',
- 'knowledge_base',
- );
- if (!node) {
- throw new Error('ELASTICSEARCH_HOST environment variable not set.');
- }
- this.client = new Client({
- node,
- });
- }
- async onModuleInit() {
- try {
- const health = await this.client.cluster.health();
- this.logger.log(`Elasticsearch cluster health is: ${health.status}`);
- // 初期化時にはインデックスを作成せず、実際の使用時にモデルに基づいて動的に作成されるのを待つ
- } catch (error) {
- this.logger.error('Failed to connect to Elasticsearch', error);
- }
- }
- async createIndexIfNotExists(vectorDimensions: number) {
- const indexExists = await this.client.indices.exists({
- index: this.indexName,
- });
- if (!indexExists) {
- this.logger.log(
- `インデックス ${this.indexName} を作成します。ベクトル次元数: ${vectorDimensions}`,
- );
- await this.createIndex(vectorDimensions);
- } else {
- // 既存インデックスのベクトル次元数を確認
- const mapping = await this.client.indices.getMapping({
- index: this.indexName,
- });
- const vectorMapping = mapping[this.indexName]?.mappings?.properties
- ?.vector as any;
- const existingDims = vectorMapping?.dims;
- if (existingDims && existingDims !== vectorDimensions) {
- this.logger.warn(
- `インデックス ${this.indexName} のベクトル次元数 ${existingDims} が現在のモデル次元数 ${vectorDimensions} と一致しません。`,
- );
- this.logger.warn(`原因: 異なる次元数の埋め込みモデルに変更された可能性があります。システムは自動的にインデックスを再作成します。`);
- // 既存インデックスを削除して再作成
- await this.client.indices.delete({ index: this.indexName });
- this.logger.log(`旧インデックスを正常に削除しました: ${this.indexName}`);
- await this.createIndex(vectorDimensions);
- this.logger.log(`インデックスを再作成しました: ${this.indexName} (次元数: ${vectorDimensions})`);
- } else {
- this.logger.log(
- `インデックス ${this.indexName} は既に存在します。ベクトル次元数: ${existingDims || '未知'}`,
- );
- }
- }
- }
- async indexDocument(
- documentId: string,
- content: string,
- vector: number[],
- metadata: any,
- ) {
- this.logger.log(
- `Indexing document ${documentId}: content=${content.length} chars, vector=${vector?.length} dims`,
- );
- if (!vector || vector.length === 0) {
- this.logger.error(`Invalid vector for document ${documentId}`);
- throw new Error('Vector is required for indexing');
- }
- const document = {
- content,
- vector,
- fileId: metadata.fileId,
- fileName: metadata.originalName,
- title: metadata.title || metadata.originalName,
- fileMimeType: metadata.mimetype,
- chunkIndex: metadata.chunkIndex,
- startPosition: metadata.startPosition,
- endPosition: metadata.endPosition,
- userId: metadata.userId,
- tenantId: metadata.tenantId,
- createdAt: new Date(),
- };
- const result = await this.client.index({
- index: this.indexName,
- id: documentId,
- document,
- });
- this.logger.log(
- `Indexed document ${documentId} with ${vector.length}D vector`,
- );
- return result;
- }
- async deleteByFileId(fileId: string, userId: string, tenantId?: string) {
- const filter: any[] = [{ term: { fileId } }];
- if (tenantId) {
- filter.push({ term: { tenantId } });
- } else {
- filter.push({ term: { userId } });
- }
- await this.client.deleteByQuery({
- index: this.indexName,
- query: {
- bool: { filter },
- },
- });
- }
- async updateTitleByFileId(fileId: string, title: string, tenantId?: string) {
- const filter: any[] = [{ term: { fileId } }];
- if (tenantId) {
- filter.push({ term: { tenantId } });
- }
- await this.client.updateByQuery({
- index: this.indexName,
- query: {
- bool: { filter },
- },
- script: {
- source: 'ctx._source.title = params.title',
- params: { title },
- },
- refresh: true, // 即座に検索に反映させる
- });
- }
- async deleteByUserId(userId: string) {
- // Note: This method should likely only be used by admin functionality
- // since it deletes all data for a user
- await this.client.deleteByQuery({
- index: this.indexName,
- query: {
- term: { userId },
- },
- });
- }
- async searchSimilar(queryVector: number[], userId: string, topK: number = 5, tenantId?: string) {
- try {
- this.logger.log(
- `Vector search: userId=${userId}, vectorDim=${queryVector?.length}, topK=${topK}`,
- );
- if (!queryVector || queryVector.length === 0) {
- this.logger.warn('Empty query vector provided');
- return [];
- }
- const filterClauses: any[] = [];
- if (tenantId) {
- filterClauses.push({ term: { tenantId } });
- } else {
- filterClauses.push({ term: { userId } });
- }
- const response = await this.client.search({
- index: this.indexName,
- knn: {
- field: 'vector',
- query_vector: queryVector,
- k: topK,
- num_candidates: topK * 2,
- filter: { bool: { must: filterClauses } },
- },
- size: topK,
- _source: {
- excludes: ['vector'],
- },
- });
- const results = response.hits.hits.map((hit: any) => ({
- id: hit._id,
- score: this.normalizeScore(hit._score), // スコアの正規化
- content: hit._source?.content,
- fileId: hit._source?.fileId,
- fileName: hit._source?.fileName,
- title: hit._source?.title,
- chunkIndex: hit._source?.chunkIndex,
- startPosition: hit._source?.startPosition,
- endPosition: hit._source?.endPosition,
- }));
- this.logger.log(
- `Vector search completed: found ${results.length} results`,
- );
- return results;
- } catch (error) {
- this.logger.error('Vector search failed:', error);
- return [];
- }
- }
- async searchFullText(query: string, userId: string, topK: number = 5, tenantId?: string) {
- try {
- this.logger.log(
- `Full-text search: userId=${userId}, query="${query}", topK=${topK}`,
- );
- if (!query || query.trim().length === 0) {
- this.logger.warn('Empty query provided for full-text search');
- return [];
- }
- const filterClauses: any[] = [];
- if (tenantId) {
- filterClauses.push({ term: { tenantId } });
- } else {
- filterClauses.push({ term: { userId } });
- }
- const response = await this.client.search({
- index: this.indexName,
- query: {
- bool: {
- must: {
- match: {
- content: {
- query: query,
- fuzziness: 'AUTO',
- },
- },
- },
- filter: filterClauses,
- },
- },
- size: topK,
- _source: {
- excludes: ['vector'],
- },
- });
- const results = response.hits.hits.map((hit: any) => ({
- id: hit._id,
- score: this.normalizeScore(hit._score), // スコアの正規化
- content: hit._source?.content,
- fileId: hit._source?.fileId,
- fileName: hit._source?.fileName,
- title: hit._source?.title,
- chunkIndex: hit._source?.chunkIndex,
- startPosition: hit._source?.startPosition,
- endPosition: hit._source?.endPosition,
- }));
- this.logger.log(
- `Full-text search completed: found ${results.length} results`,
- );
- return results;
- } catch (error) {
- this.logger.error('Full-text search failed:', error);
- return [];
- }
- }
- async hybridSearch(
- queryVector: number[],
- query: string,
- userId: string,
- topK: number = 5,
- vectorWeight: number = 0.7,
- selectedGroups?: string[], // 後方互換性のために残す(未使用)
- explicitFileIds?: string[], // 明示的に指定されたファイルIDリスト
- tenantId?: string,
- ) {
- // selectedGroups は廃止予定。呼び出し側で fileIds に変換して explicitFileIds を使用してください
- const fileIds = explicitFileIds;
- if (fileIds && fileIds.length === 0) {
- this.logger.log('検索対象ファイルが0件のため、検索をスキップします');
- return [];
- }
- if (fileIds) {
- this.logger.log(`最終検索対象ファイル範囲: ${fileIds.length} 個のファイル`);
- }
- // ハイブリッド検索:ベクトル検索 + 全文検索
- const [vectorResults, textResults] = await Promise.all([
- this.searchSimilarWithFileFilter(queryVector, userId, topK, fileIds, tenantId),
- this.searchFullTextWithFileFilter(query, userId, topK, fileIds, tenantId),
- ]);
- // 結果をマージして重複を排除
- const combinedResults = new Map();
- // 向量搜索結果を追加
- vectorResults.forEach((result) => {
- combinedResults.set(result.id, {
- ...result,
- vectorScore: result.score,
- textScore: 0,
- combinedScore: result.score * vectorWeight,
- });
- });
- // 全文検索結果を追加
- textResults.forEach((result) => {
- if (combinedResults.has(result.id)) {
- const existing = combinedResults.get(result.id);
- existing.textScore = result.score;
- existing.combinedScore =
- existing.vectorScore * vectorWeight +
- result.score * (1 - vectorWeight);
- } else {
- combinedResults.set(result.id, {
- ...result,
- vectorScore: 0,
- textScore: result.score,
- combinedScore: result.score * (1 - vectorWeight),
- });
- }
- });
- // 正規化のためにすべての組み合わせスコアを取得
- const allScores = Array.from(combinedResults.values()).map(r => r.combinedScore);
- const maxScore = Math.max(...allScores, 1); // ゼロ除算を避けるため最小1
- const minScore = Math.min(...allScores);
- // 総合スコアでソートして上位 topK の結果を返す
- return Array.from(combinedResults.values())
- .sort((a, b) => b.combinedScore - a.combinedScore)
- .slice(0, topK)
- .map((result) => {
- // combinedScoreは既に0-1の範囲にあるため、追加の正規化は不要
- // 0-1の範囲にスコアを保つことで、実際の類似度を正確に反映
- let finalScore = result.combinedScore;
- // スコアが0-1の範囲内に収まるようにクリップ
- finalScore = Math.max(0, Math.min(1.0, finalScore));
- return {
- id: result.id,
- score: finalScore,
- content: result.content,
- fileId: result.fileId,
- fileName: result.fileName,
- title: result.title,
- chunkIndex: result.chunkIndex,
- startPosition: result.startPosition,
- endPosition: result.endPosition,
- };
- });
- }
- private async createIndex(vectorDimensions: number) {
- const mappings: any = {
- properties: {
- // チャンク内容
- content: {
- type: 'text',
- analyzer: 'standard',
- },
- // ベクトルデータ
- vector: {
- type: 'dense_vector',
- dims: vectorDimensions,
- index: true,
- similarity: 'cosine',
- },
- // ファイル関連情報
- fileId: { type: 'keyword' },
- fileName: { type: 'keyword' },
- title: { type: 'text' },
- fileMimeType: { type: 'keyword' },
- // チャンク情報
- chunkIndex: { type: 'integer' },
- startPosition: { type: 'integer' },
- endPosition: { type: 'integer' },
- // ユーザー情報
- userId: { type: 'keyword' },
- // テナント情報(マルチテナント分離用)
- tenantId: { type: 'keyword' },
- // タイムスタンプ
- createdAt: { type: 'date' },
- },
- };
- await this.client.indices.create({
- index: this.indexName,
- mappings,
- });
- this.logger.log(
- `インデックス ${this.indexName} を正常に作成しました。ベクトル次元数: ${vectorDimensions}`,
- );
- }
- /**
- * Elasticsearch スコアを 0-1 の範囲に正規化する
- * Elasticsearch のスコアは 1.0 を超える可能性があるため、正規化が必要
- * ただし、kNN検索の類似度スコアは既に0-1の範囲にある(cosine similarity)ので、
- * 特別な正規化は不要。必要に応じて最小値保護のみ行う。
- */
- private normalizeScore(rawScore: number): number {
- if (!rawScore || rawScore <= 0) return 0; // 最小値は0
- // kNN検索の場合は既に0-1の範囲にあるので、1を超えないようにクリップ
- // cosine similarityの最大値は1なので、1以上になった場合は1とする
- return Math.min(1.0, rawScore);
- }
- // ファイルフィルタ付きのベクトル検索
- private async searchSimilarWithFileFilter(
- queryVector: number[],
- userId: string,
- topK: number = 5,
- fileIds?: string[],
- tenantId?: string,
- ) {
- try {
- this.logger.log(
- `Vector search with filter: userId=${userId}, tenantId=${tenantId}, vectorDim=${queryVector?.length}, topK=${topK}, fileIds=${fileIds?.length || 'all'}`,
- );
- if (!queryVector || queryVector.length === 0) {
- this.logger.warn('Empty query vector provided');
- return [];
- }
- if (fileIds && fileIds.length === 0) {
- this.logger.log('Filter resulted in 0 files, returning empty results for vector search');
- return [];
- }
- const filterClauses: any[] = [];
- if (fileIds && fileIds.length > 0) {
- filterClauses.push({ terms: { fileId: fileIds } });
- }
- // Tenant isolation: when tenantId is provided, enforce it
- if (tenantId) {
- filterClauses.push({ term: { tenantId } });
- } else {
- // Legacy: fall back to userId-based filter
- filterClauses.push({ term: { userId } });
- }
- const filter = filterClauses.length > 0
- ? { bool: { must: filterClauses } }
- : undefined;
- const queryBody: any = {
- index: this.indexName,
- knn: {
- field: 'vector',
- query_vector: queryVector,
- k: topK,
- num_candidates: topK * 2,
- },
- size: topK,
- _source: {
- excludes: ['vector'],
- },
- };
- if (filter && Object.keys(filter).length > 0) {
- queryBody.knn.filter = filter;
- }
- const response = await this.client.search(queryBody);
- const results = response.hits.hits.map((hit: any) => ({
- id: hit._id,
- score: this.normalizeScore(hit._score),
- content: hit._source?.content,
- fileId: hit._source?.fileId,
- fileName: hit._source?.fileName,
- title: hit._source?.title,
- chunkIndex: hit._source?.chunkIndex,
- startPosition: hit._source?.startPosition,
- endPosition: hit._source?.endPosition,
- }));
- this.logger.log(
- `Vector search completed: found ${results.length} results`,
- );
- return results;
- } catch (error) {
- this.logger.error('Vector search failed:', error);
- return [];
- }
- }
- // ファイルフィルタ付きの全文検索
- private async searchFullTextWithFileFilter(
- query: string,
- userId: string,
- topK: number = 5,
- fileIds?: string[],
- tenantId?: string,
- ) {
- try {
- this.logger.log(
- `Full-text search with filter: userId=${userId}, query="${query}", topK=${topK}, fileIds=${fileIds?.length || 'all'}`,
- );
- if (!query || query.trim().length === 0) {
- this.logger.warn('Empty query provided for full-text search');
- return [];
- }
- if (fileIds && fileIds.length === 0) {
- this.logger.log('Filter resulted in 0 files, returning empty results for full-text search');
- return [];
- }
- const mustClause: any[] = [
- {
- match: {
- content: {
- query: query,
- fuzziness: 'AUTO',
- },
- },
- },
- ];
- const filter: any[] = [];
- if (fileIds && fileIds.length > 0) {
- filter.push({ terms: { fileId: fileIds } });
- }
- if (tenantId) {
- filter.push({ term: { tenantId } });
- } else {
- filter.push({ term: { userId } });
- }
- const queryBody: any = {
- index: this.indexName,
- query: {
- bool: {
- must: mustClause,
- filter: filter,
- },
- },
- size: topK,
- _source: {
- excludes: ['vector'],
- },
- };
- const response = await this.client.search(queryBody);
- const results = response.hits.hits.map((hit: any) => ({
- id: hit._id,
- score: this.normalizeScore(hit._score),
- content: hit._source?.content,
- fileId: hit._source?.fileId,
- fileName: hit._source?.fileName,
- title: hit._source?.title,
- chunkIndex: hit._source?.chunkIndex,
- startPosition: hit._source?.startPosition,
- endPosition: hit._source?.endPosition,
- }));
- this.logger.log(
- `Full-text search completed: found ${results.length} results`,
- );
- return results;
- } catch (error) {
- this.logger.error('Full-text search failed:', error);
- return [];
- }
- }
- /**
- * 指定されたファイルのすべてのチャンクを取得
- */
- async getFileChunks(fileId: string) {
- try {
- this.logger.log(`Getting chunks for file ${fileId}`);
- const response = await this.client.search({
- index: this.indexName,
- query: {
- term: { fileId },
- },
- sort: [{ chunkIndex: 'asc' }],
- size: 10000, // 単一ファイルが 10000 チャンクを超えないと想定
- _source: {
- excludes: ['vector'], // 転送量を減らすため、ベクトルデータは返さない
- },
- });
- const chunks = response.hits.hits.map((hit: any) => ({
- id: hit._id,
- chunkIndex: hit._source.chunkIndex,
- content: hit._source.content,
- startPosition: hit._source.startPosition,
- endPosition: hit._source.endPosition,
- fileName: hit._source.fileName,
- }));
- this.logger.log(`Found ${chunks.length} chunks for file ${fileId}`);
- return chunks;
- } catch (error) {
- this.logger.error(`Failed to get chunks for file ${fileId}`, error);
- return [];
- }
- }
- }
|