elasticsearch.service.ts 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640
  1. import { Injectable, Logger, OnModuleInit } from '@nestjs/common';
  2. import { ConfigService } from '@nestjs/config';
  3. import { Client } from '@elastic/elasticsearch';
  4. @Injectable()
  5. export class ElasticsearchService implements OnModuleInit {
  6. public readonly client: Client;
  7. private readonly logger = new Logger(ElasticsearchService.name);
  8. private readonly indexName: string;
  9. constructor(
  10. private configService: ConfigService,
  11. ) {
  12. const node = this.configService.get<string>('ELASTICSEARCH_HOST'); // Changed from NODE to HOST
  13. this.indexName = this.configService.get<string>(
  14. 'ELASTICSEARCH_INDEX',
  15. 'knowledge_base',
  16. );
  17. if (!node) {
  18. throw new Error('ELASTICSEARCH_HOST environment variable not set.');
  19. }
  20. this.client = new Client({
  21. node,
  22. });
  23. }
  24. async onModuleInit() {
  25. try {
  26. const health = await this.client.cluster.health();
  27. this.logger.log(`Elasticsearch cluster health is: ${health.status}`);
  28. // 初期化時にはインデックスを作成せず、実際の使用時にモデルに基づいて動的に作成されるのを待つ
  29. } catch (error) {
  30. this.logger.error('Failed to connect to Elasticsearch', error);
  31. }
  32. }
  33. async createIndexIfNotExists(vectorDimensions: number) {
  34. const indexExists = await this.client.indices.exists({
  35. index: this.indexName,
  36. });
  37. if (!indexExists) {
  38. this.logger.log(
  39. `インデックス ${this.indexName} を作成します。ベクトル次元数: ${vectorDimensions}`,
  40. );
  41. await this.createIndex(vectorDimensions);
  42. } else {
  43. // 既存インデックスのベクトル次元数を確認
  44. const mapping = await this.client.indices.getMapping({
  45. index: this.indexName,
  46. });
  47. const vectorMapping = mapping[this.indexName]?.mappings?.properties
  48. ?.vector as any;
  49. const existingDims = vectorMapping?.dims;
  50. if (existingDims && existingDims !== vectorDimensions) {
  51. this.logger.warn(
  52. `インデックス ${this.indexName} のベクトル次元数 ${existingDims} が現在のモデル次元数 ${vectorDimensions} と一致しません。`,
  53. );
  54. this.logger.warn(`原因: 異なる次元数の埋め込みモデルに変更された可能性があります。システムは自動的にインデックスを再作成します。`);
  55. // 既存インデックスを削除して再作成
  56. await this.client.indices.delete({ index: this.indexName });
  57. this.logger.log(`旧インデックスを正常に削除しました: ${this.indexName}`);
  58. await this.createIndex(vectorDimensions);
  59. this.logger.log(`インデックスを再作成しました: ${this.indexName} (次元数: ${vectorDimensions})`);
  60. } else {
  61. this.logger.log(
  62. `インデックス ${this.indexName} は既に存在します。ベクトル次元数: ${existingDims || '未知'}`,
  63. );
  64. }
  65. }
  66. }
  67. async indexDocument(
  68. documentId: string,
  69. content: string,
  70. vector: number[],
  71. metadata: any,
  72. ) {
  73. this.logger.log(
  74. `Indexing document ${documentId}: content=${content.length} chars, vector=${vector?.length} dims`,
  75. );
  76. if (!vector || vector.length === 0) {
  77. this.logger.error(`Invalid vector for document ${documentId}`);
  78. throw new Error('Vector is required for indexing');
  79. }
  80. const document = {
  81. content,
  82. vector,
  83. fileId: metadata.fileId,
  84. fileName: metadata.originalName,
  85. title: metadata.title || metadata.originalName,
  86. fileMimeType: metadata.mimetype,
  87. chunkIndex: metadata.chunkIndex,
  88. startPosition: metadata.startPosition,
  89. endPosition: metadata.endPosition,
  90. userId: metadata.userId,
  91. tenantId: metadata.tenantId,
  92. createdAt: new Date(),
  93. };
  94. const result = await this.client.index({
  95. index: this.indexName,
  96. id: documentId,
  97. document,
  98. });
  99. this.logger.log(
  100. `Indexed document ${documentId} with ${vector.length}D vector`,
  101. );
  102. return result;
  103. }
  104. async deleteByFileId(fileId: string, userId: string, tenantId?: string) {
  105. const filter: any[] = [{ term: { fileId } }];
  106. if (tenantId) {
  107. filter.push({ term: { tenantId } });
  108. } else {
  109. filter.push({ term: { userId } });
  110. }
  111. await this.client.deleteByQuery({
  112. index: this.indexName,
  113. query: {
  114. bool: { filter },
  115. },
  116. });
  117. }
  118. async updateTitleByFileId(fileId: string, title: string, tenantId?: string) {
  119. const filter: any[] = [{ term: { fileId } }];
  120. if (tenantId) {
  121. filter.push({ term: { tenantId } });
  122. }
  123. await this.client.updateByQuery({
  124. index: this.indexName,
  125. query: {
  126. bool: { filter },
  127. },
  128. script: {
  129. source: 'ctx._source.title = params.title',
  130. params: { title },
  131. },
  132. refresh: true, // 即座に検索に反映させる
  133. });
  134. }
  135. async deleteByUserId(userId: string) {
  136. // Note: This method should likely only be used by admin functionality
  137. // since it deletes all data for a user
  138. await this.client.deleteByQuery({
  139. index: this.indexName,
  140. query: {
  141. term: { userId },
  142. },
  143. });
  144. }
  145. async searchSimilar(queryVector: number[], userId: string, topK: number = 5, tenantId?: string) {
  146. try {
  147. this.logger.log(
  148. `Vector search: userId=${userId}, vectorDim=${queryVector?.length}, topK=${topK}`,
  149. );
  150. if (!queryVector || queryVector.length === 0) {
  151. this.logger.warn('Empty query vector provided');
  152. return [];
  153. }
  154. const filterClauses: any[] = [];
  155. if (tenantId) {
  156. filterClauses.push({ term: { tenantId } });
  157. } else {
  158. filterClauses.push({ term: { userId } });
  159. }
  160. const response = await this.client.search({
  161. index: this.indexName,
  162. knn: {
  163. field: 'vector',
  164. query_vector: queryVector,
  165. k: topK,
  166. num_candidates: topK * 2,
  167. filter: { bool: { must: filterClauses } },
  168. },
  169. size: topK,
  170. _source: {
  171. excludes: ['vector'],
  172. },
  173. });
  174. const results = response.hits.hits.map((hit: any) => ({
  175. id: hit._id,
  176. score: this.normalizeScore(hit._score), // スコアの正規化
  177. content: hit._source?.content,
  178. fileId: hit._source?.fileId,
  179. fileName: hit._source?.fileName,
  180. title: hit._source?.title,
  181. chunkIndex: hit._source?.chunkIndex,
  182. startPosition: hit._source?.startPosition,
  183. endPosition: hit._source?.endPosition,
  184. }));
  185. this.logger.log(
  186. `Vector search completed: found ${results.length} results`,
  187. );
  188. return results;
  189. } catch (error) {
  190. this.logger.error('Vector search failed:', error);
  191. return [];
  192. }
  193. }
  194. async searchFullText(query: string, userId: string, topK: number = 5, tenantId?: string) {
  195. try {
  196. this.logger.log(
  197. `Full-text search: userId=${userId}, query="${query}", topK=${topK}`,
  198. );
  199. if (!query || query.trim().length === 0) {
  200. this.logger.warn('Empty query provided for full-text search');
  201. return [];
  202. }
  203. const filterClauses: any[] = [];
  204. if (tenantId) {
  205. filterClauses.push({ term: { tenantId } });
  206. } else {
  207. filterClauses.push({ term: { userId } });
  208. }
  209. const response = await this.client.search({
  210. index: this.indexName,
  211. query: {
  212. bool: {
  213. must: {
  214. match: {
  215. content: {
  216. query: query,
  217. fuzziness: 'AUTO',
  218. },
  219. },
  220. },
  221. filter: filterClauses,
  222. },
  223. },
  224. size: topK,
  225. _source: {
  226. excludes: ['vector'],
  227. },
  228. });
  229. const results = response.hits.hits.map((hit: any) => ({
  230. id: hit._id,
  231. score: this.normalizeScore(hit._score), // スコアの正規化
  232. content: hit._source?.content,
  233. fileId: hit._source?.fileId,
  234. fileName: hit._source?.fileName,
  235. title: hit._source?.title,
  236. chunkIndex: hit._source?.chunkIndex,
  237. startPosition: hit._source?.startPosition,
  238. endPosition: hit._source?.endPosition,
  239. }));
  240. this.logger.log(
  241. `Full-text search completed: found ${results.length} results`,
  242. );
  243. return results;
  244. } catch (error) {
  245. this.logger.error('Full-text search failed:', error);
  246. return [];
  247. }
  248. }
  249. async hybridSearch(
  250. queryVector: number[],
  251. query: string,
  252. userId: string,
  253. topK: number = 5,
  254. vectorWeight: number = 0.7,
  255. selectedGroups?: string[], // 後方互換性のために残す(未使用)
  256. explicitFileIds?: string[], // 明示的に指定されたファイルIDリスト
  257. tenantId?: string,
  258. ) {
  259. // selectedGroups は廃止予定。呼び出し側で fileIds に変換して explicitFileIds を使用してください
  260. const fileIds = explicitFileIds;
  261. if (fileIds && fileIds.length === 0) {
  262. this.logger.log('検索対象ファイルが0件のため、検索をスキップします');
  263. return [];
  264. }
  265. if (fileIds) {
  266. this.logger.log(`最終検索対象ファイル範囲: ${fileIds.length} 個のファイル`);
  267. }
  268. // ハイブリッド検索:ベクトル検索 + 全文検索
  269. const [vectorResults, textResults] = await Promise.all([
  270. this.searchSimilarWithFileFilter(queryVector, userId, topK, fileIds, tenantId),
  271. this.searchFullTextWithFileFilter(query, userId, topK, fileIds, tenantId),
  272. ]);
  273. // 結果をマージして重複を排除
  274. const combinedResults = new Map();
  275. // 向量搜索結果を追加
  276. vectorResults.forEach((result) => {
  277. combinedResults.set(result.id, {
  278. ...result,
  279. vectorScore: result.score,
  280. textScore: 0,
  281. combinedScore: result.score * vectorWeight,
  282. });
  283. });
  284. // 全文検索結果を追加
  285. textResults.forEach((result) => {
  286. if (combinedResults.has(result.id)) {
  287. const existing = combinedResults.get(result.id);
  288. existing.textScore = result.score;
  289. existing.combinedScore =
  290. existing.vectorScore * vectorWeight +
  291. result.score * (1 - vectorWeight);
  292. } else {
  293. combinedResults.set(result.id, {
  294. ...result,
  295. vectorScore: 0,
  296. textScore: result.score,
  297. combinedScore: result.score * (1 - vectorWeight),
  298. });
  299. }
  300. });
  301. // 正規化のためにすべての組み合わせスコアを取得
  302. const allScores = Array.from(combinedResults.values()).map(r => r.combinedScore);
  303. const maxScore = Math.max(...allScores, 1); // ゼロ除算を避けるため最小1
  304. const minScore = Math.min(...allScores);
  305. // 総合スコアでソートして上位 topK の結果を返す
  306. return Array.from(combinedResults.values())
  307. .sort((a, b) => b.combinedScore - a.combinedScore)
  308. .slice(0, topK)
  309. .map((result) => {
  310. // combinedScoreは既に0-1の範囲にあるため、追加の正規化は不要
  311. // 0-1の範囲にスコアを保つことで、実際の類似度を正確に反映
  312. let finalScore = result.combinedScore;
  313. // スコアが0-1の範囲内に収まるようにクリップ
  314. finalScore = Math.max(0, Math.min(1.0, finalScore));
  315. return {
  316. id: result.id,
  317. score: finalScore,
  318. content: result.content,
  319. fileId: result.fileId,
  320. fileName: result.fileName,
  321. title: result.title,
  322. chunkIndex: result.chunkIndex,
  323. startPosition: result.startPosition,
  324. endPosition: result.endPosition,
  325. };
  326. });
  327. }
  328. private async createIndex(vectorDimensions: number) {
  329. const mappings: any = {
  330. properties: {
  331. // チャンク内容
  332. content: {
  333. type: 'text',
  334. analyzer: 'standard',
  335. },
  336. // ベクトルデータ
  337. vector: {
  338. type: 'dense_vector',
  339. dims: vectorDimensions,
  340. index: true,
  341. similarity: 'cosine',
  342. },
  343. // ファイル関連情報
  344. fileId: { type: 'keyword' },
  345. fileName: { type: 'keyword' },
  346. title: { type: 'text' },
  347. fileMimeType: { type: 'keyword' },
  348. // チャンク情報
  349. chunkIndex: { type: 'integer' },
  350. startPosition: { type: 'integer' },
  351. endPosition: { type: 'integer' },
  352. // ユーザー情報
  353. userId: { type: 'keyword' },
  354. // テナント情報(マルチテナント分離用)
  355. tenantId: { type: 'keyword' },
  356. // タイムスタンプ
  357. createdAt: { type: 'date' },
  358. },
  359. };
  360. await this.client.indices.create({
  361. index: this.indexName,
  362. mappings,
  363. });
  364. this.logger.log(
  365. `インデックス ${this.indexName} を正常に作成しました。ベクトル次元数: ${vectorDimensions}`,
  366. );
  367. }
  368. /**
  369. * Elasticsearch スコアを 0-1 の範囲に正規化する
  370. * Elasticsearch のスコアは 1.0 を超える可能性があるため、正規化が必要
  371. * ただし、kNN検索の類似度スコアは既に0-1の範囲にある(cosine similarity)ので、
  372. * 特別な正規化は不要。必要に応じて最小値保護のみ行う。
  373. */
  374. private normalizeScore(rawScore: number): number {
  375. if (!rawScore || rawScore <= 0) return 0; // 最小値は0
  376. // kNN検索の場合は既に0-1の範囲にあるので、1を超えないようにクリップ
  377. // cosine similarityの最大値は1なので、1以上になった場合は1とする
  378. return Math.min(1.0, rawScore);
  379. }
  380. // ファイルフィルタ付きのベクトル検索
  381. private async searchSimilarWithFileFilter(
  382. queryVector: number[],
  383. userId: string,
  384. topK: number = 5,
  385. fileIds?: string[],
  386. tenantId?: string,
  387. ) {
  388. try {
  389. this.logger.log(
  390. `Vector search with filter: userId=${userId}, tenantId=${tenantId}, vectorDim=${queryVector?.length}, topK=${topK}, fileIds=${fileIds?.length || 'all'}`,
  391. );
  392. if (!queryVector || queryVector.length === 0) {
  393. this.logger.warn('Empty query vector provided');
  394. return [];
  395. }
  396. if (fileIds && fileIds.length === 0) {
  397. this.logger.log('Filter resulted in 0 files, returning empty results for vector search');
  398. return [];
  399. }
  400. const filterClauses: any[] = [];
  401. if (fileIds && fileIds.length > 0) {
  402. filterClauses.push({ terms: { fileId: fileIds } });
  403. }
  404. // Tenant isolation: when tenantId is provided, enforce it
  405. if (tenantId) {
  406. filterClauses.push({ term: { tenantId } });
  407. } else {
  408. // Legacy: fall back to userId-based filter
  409. filterClauses.push({ term: { userId } });
  410. }
  411. const filter = filterClauses.length > 0
  412. ? { bool: { must: filterClauses } }
  413. : undefined;
  414. const queryBody: any = {
  415. index: this.indexName,
  416. knn: {
  417. field: 'vector',
  418. query_vector: queryVector,
  419. k: topK,
  420. num_candidates: topK * 2,
  421. },
  422. size: topK,
  423. _source: {
  424. excludes: ['vector'],
  425. },
  426. };
  427. if (filter && Object.keys(filter).length > 0) {
  428. queryBody.knn.filter = filter;
  429. }
  430. const response = await this.client.search(queryBody);
  431. const results = response.hits.hits.map((hit: any) => ({
  432. id: hit._id,
  433. score: this.normalizeScore(hit._score),
  434. content: hit._source?.content,
  435. fileId: hit._source?.fileId,
  436. fileName: hit._source?.fileName,
  437. title: hit._source?.title,
  438. chunkIndex: hit._source?.chunkIndex,
  439. startPosition: hit._source?.startPosition,
  440. endPosition: hit._source?.endPosition,
  441. }));
  442. this.logger.log(
  443. `Vector search completed: found ${results.length} results`,
  444. );
  445. return results;
  446. } catch (error) {
  447. this.logger.error('Vector search failed:', error);
  448. return [];
  449. }
  450. }
  451. // ファイルフィルタ付きの全文検索
  452. private async searchFullTextWithFileFilter(
  453. query: string,
  454. userId: string,
  455. topK: number = 5,
  456. fileIds?: string[],
  457. tenantId?: string,
  458. ) {
  459. try {
  460. this.logger.log(
  461. `Full-text search with filter: userId=${userId}, query="${query}", topK=${topK}, fileIds=${fileIds?.length || 'all'}`,
  462. );
  463. if (!query || query.trim().length === 0) {
  464. this.logger.warn('Empty query provided for full-text search');
  465. return [];
  466. }
  467. if (fileIds && fileIds.length === 0) {
  468. this.logger.log('Filter resulted in 0 files, returning empty results for full-text search');
  469. return [];
  470. }
  471. const mustClause: any[] = [
  472. {
  473. match: {
  474. content: {
  475. query: query,
  476. fuzziness: 'AUTO',
  477. },
  478. },
  479. },
  480. ];
  481. const filter: any[] = [];
  482. if (fileIds && fileIds.length > 0) {
  483. filter.push({ terms: { fileId: fileIds } });
  484. }
  485. if (tenantId) {
  486. filter.push({ term: { tenantId } });
  487. } else {
  488. filter.push({ term: { userId } });
  489. }
  490. const queryBody: any = {
  491. index: this.indexName,
  492. query: {
  493. bool: {
  494. must: mustClause,
  495. filter: filter,
  496. },
  497. },
  498. size: topK,
  499. _source: {
  500. excludes: ['vector'],
  501. },
  502. };
  503. const response = await this.client.search(queryBody);
  504. const results = response.hits.hits.map((hit: any) => ({
  505. id: hit._id,
  506. score: this.normalizeScore(hit._score),
  507. content: hit._source?.content,
  508. fileId: hit._source?.fileId,
  509. fileName: hit._source?.fileName,
  510. title: hit._source?.title,
  511. chunkIndex: hit._source?.chunkIndex,
  512. startPosition: hit._source?.startPosition,
  513. endPosition: hit._source?.endPosition,
  514. }));
  515. this.logger.log(
  516. `Full-text search completed: found ${results.length} results`,
  517. );
  518. return results;
  519. } catch (error) {
  520. this.logger.error('Full-text search failed:', error);
  521. return [];
  522. }
  523. }
  524. /**
  525. * 指定されたファイルのすべてのチャンクを取得
  526. */
  527. async getFileChunks(fileId: string, userId: string, tenantId?: string) {
  528. try {
  529. this.logger.log(`Getting chunks for file ${fileId}`);
  530. const filter: any[] = [{ term: { fileId } }];
  531. if (tenantId) {
  532. filter.push({ term: { tenantId } });
  533. } else {
  534. filter.push({ term: { userId } });
  535. }
  536. const response = await this.client.search({
  537. index: this.indexName,
  538. query: {
  539. bool: { filter },
  540. },
  541. sort: [{ chunkIndex: 'asc' }],
  542. size: 10000, // 単一ファイルが 10000 チャンクを超えないと想定
  543. _source: {
  544. excludes: ['vector'], // 転送量を減らすため、ベクトルデータは返さない
  545. },
  546. });
  547. const chunks = response.hits.hits.map((hit: any) => ({
  548. id: hit._id,
  549. chunkIndex: hit._source.chunkIndex,
  550. content: hit._source.content,
  551. startPosition: hit._source.startPosition,
  552. endPosition: hit._source.endPosition,
  553. fileName: hit._source.fileName,
  554. }));
  555. this.logger.log(`Found ${chunks.length} chunks for file ${fileId}`);
  556. return chunks;
  557. } catch (error) {
  558. this.logger.error(`Failed to get chunks for file ${fileId}`, error);
  559. return [];
  560. }
  561. }
  562. }