jina-ai · shazhou2015 · Mar 26, 2025 · Mar 28, 2025 · Apr 14, 2025 · Apr 14, 2025
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -28,6 +28,7 @@
   "dependencies": {
     "@ai-sdk/google": "^1.0.0",
     "@ai-sdk/openai": "^1.1.9",
+    "@napi-rs/canvas": "^0.1.68",
     "@types/jsdom": "^21.1.7",
     "ai": "^4.1.26",
     "axios": "^1.7.9",

diff --git a/src/agent.ts b/src/agent.ts
@@ -16,7 +16,9 @@ import {
   KnowledgeItem,
   EvaluationType,
   BoostedSearchSnippet,
-  SearchSnippet, EvaluationResponse, Reference, SERPQuery, RepeatEvaluationType, UnNormalizedSearchSnippet, WebContent
+  SearchSnippet, EvaluationResponse, Reference, SERPQuery, RepeatEvaluationType, UnNormalizedSearchSnippet, WebContent,
+  ImageObject,
+  ImageReference
 } from "./types";
 import {TrackerContext} from "./types";
 import {search} from "./tools/jina-search";
@@ -42,7 +44,7 @@ import {MAX_QUERIES_PER_STEP, MAX_REFLECT_PER_STEP, MAX_URLS_PER_STEP, Schemas}
 import {formatDateBasedOnType, formatDateRange} from "./utils/date-tools";
 import {repairUnknownChars} from "./tools/broken-ch-fixer";
 import {reviseAnswer} from "./tools/md-fixer";
-import {buildReferences} from "./tools/build-ref";
+import {buildImageReferences, buildReferences} from "./tools/build-ref";
 
 async function sleep(ms: number) {
   const seconds = Math.ceil(ms / 1000);
@@ -394,7 +396,7 @@ export async function getResponse(question?: string,
                                   onlyHostnames: string[] = [],
                                   maxRef: number = 10,
                                   minRelScore: number = 0.75
-): Promise<{ result: StepAction; context: TrackerContext; visitedURLs: string[], readURLs: string[], allURLs: string[] }> {
+): Promise<{ result: StepAction; context: TrackerContext; visitedURLs: string[], readURLs: string[], allURLs: string[], testImages: string[], imageReferences: ImageReference[] }> {
 
   let step = 0;
   let totalStep = 0;
@@ -445,6 +447,7 @@ export async function getResponse(question?: string,
   const allWebContents: Record<string, WebContent> = {};
   const visitedURLs: string[] = [];
   const badURLs: string[] = [];
+  const imageObjects: ImageObject[] = [];
   const evaluationMetrics: Record<string, RepeatEvaluationType[]> = {};
   // reserve the 10% final budget for the beast mode
   const regularBudget = tokenBudget * 0.85;
@@ -849,6 +852,7 @@ You decided to think out of the box or cut from a completely different angle.
           allURLs,
           visitedURLs,
           badURLs,
+          imageObjects,
           SchemaGen,
           currentQuestion,
           allWebContents
@@ -1010,7 +1014,12 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
       );
   }
 
-  console.log(thisStep)
+  console.log(thisStep);
+  let imageReferences: any;
+  if(imageObjects.length) {
+    imageReferences = await buildImageReferences(answerStep.answer, imageObjects, context, SchemaGen);
+    console.log('**Image references**:', imageReferences);
+  }
 
   // max return 300 urls
   const returnedURLs = weightedURLs.slice(0, numReturnedURLs).map(r => r.url);
@@ -1019,7 +1028,9 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
     context,
     visitedURLs: returnedURLs,
     readURLs: visitedURLs.filter(url => !badURLs.includes(url)),
-    allURLs: weightedURLs.map(r => r.url)
+    allURLs: weightedURLs.map(r => r.url),
+    testImages: imageObjects.map(i => i.url),
+    imageReferences: imageReferences,
   };
 }
 

diff --git a/src/app.ts b/src/app.ts
@@ -7,7 +7,7 @@ import {
   ChatCompletionResponse,
   ChatCompletionChunk,
   AnswerAction,
-  Model, StepAction, VisitAction
+  Model, StepAction, VisitAction,
 } from './types';
 import {TokenTracker} from "./utils/token-tracker";
 import {ActionTracker} from "./utils/action-tracker";
@@ -501,7 +501,7 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
       // Add content to queue for both thinking steps and final answer
       if (step.action === 'visit') {
         // emit every url in the visit action in url field
-        ((step as VisitAction).URLTargets as string[]).forEach((url) => {
+        ((step as VisitAction).URLTargets as string[])?.forEach((url) => {
           const chunk: ChatCompletionChunk = {
             id: requestId,
             object: 'chat.completion.chunk',
@@ -547,7 +547,9 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
       result: finalStep,
       visitedURLs,
       readURLs,
-      allURLs
+      allURLs,
+      testImages,
+      imageReferences,
     } = await getResponse(undefined,
       tokenBudget,
       maxBadAttempts,
@@ -632,7 +634,9 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
         usage,
         visitedURLs,
         readURLs,
-        numURLs: allURLs.length
+        numURLs: allURLs.length,
+        testImages,
+        imageReferences
       };
       res.write(`data: ${JSON.stringify(finalChunk)}\n\n`);
       res.end();
@@ -658,7 +662,9 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
         usage,
         visitedURLs,
         readURLs,
-        numURLs: allURLs.length
+        numURLs: allURLs.length,
+        testImages,
+        imageReferences: imageReferences,
       };
 
       // Log final response (excluding full content for brevity)
@@ -669,7 +675,8 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
         usage: response.usage,
         visitedURLs: response.visitedURLs,
         readURLs: response.readURLs,
-        numURLs: allURLs.length
+        numURLs: allURLs.length,
+        testImages,
       });
 
       res.json(response);

diff --git a/src/tools/build-ref.ts b/src/tools/build-ref.ts
@@ -1,5 +1,5 @@
 import {segmentText} from './segment';
-import {Reference, TrackerContext, WebContent} from "../types";
+import {ImageObject, ImageReference, Reference, TrackerContext, WebContent} from "../types";
 import {Schemas} from "../utils/schemas";
 import {cosineSimilarity, jaccardRank} from "./cosine";
 import {getEmbeddings} from "./embeddings";
@@ -41,7 +41,7 @@ export async function buildReferences(
       };
 
       // Track valid web chunks (above minimum length)
-      if (chunk.length >= minChunkLength) {
+      if (chunk?.length >= minChunkLength) {
         validWebChunkIndices.add(chunkIndex);
       }
 
@@ -363,4 +363,184 @@ function buildFinalResult(
     answer: modifiedAnswer,
     references
   };
+}
+
+/**
+ * Matches segments of the answer to images by cosine similarity of their embeddings.
+ * Each image and each answer chunk is used in at most one returned reference.
+ * @param answer - Answer text; segmented via `segmentText`.
+ * @param imageObjects - Candidate images; `url`, `alt` and `embedding[0]` are read.
+ * @param context - Passed to `segmentText`; also supplies `tokenTracker` and `actionTracker`.
+ * @param schema - Only `languageCode` is read here.
+ * @param minChunkLength - Blank chunks and chunks shorter than this are skipped.
+ * @param maxRef - Maximum number of references returned.
+ * @param minRelScore - Matches with a lower similarity score are dropped.
+ * @returns The selected references; `[]` when there are no images, no valid chunks, or the embedding/matching step throws.
+ */
+export async function buildImageReferences(
+  answer: string,
+  imageObjects: ImageObject[],
+  context: TrackerContext,
+  schema: Schemas,
+  minChunkLength: number = 80,
+  maxRef: number = 10,
+  minRelScore: number = 0.35
+): Promise<Array<ImageReference>> {
+  console.log(`[buildImageReferences] Starting with maxRef=${maxRef}, minChunkLength=${minChunkLength}, minRelScore=${minRelScore}`);
+  console.log(`[buildImageReferences] Answer length: ${answer.length} chars, Image sources: ${imageObjects.length}`);
+
+  // Step 1: Chunk the answer
+  console.log(`[buildImageReferences] Step 1: Chunking answer text`);
+  const {chunks: answerChunks, chunk_positions: answerChunkPositions} = await segmentText(answer, context);
+  console.log(`[buildImageReferences] Answer segmented into ${answerChunks.length} chunks`);
+
+  // Step 2: Prepare image content
+  console.log(`[buildImageReferences] Step 2: Preparing image content`);
+  const allImageEmbeddings: number[][] = imageObjects.map(img => img.embedding[0]); // Extract embedding
+  const validImageIndices = new Set<number>(imageObjects.map((_, index) => index));
+
+  console.log(`[buildImageReferences] Collected ${allImageEmbeddings.length} image embeddings`);
+
+  if (allImageEmbeddings.length === 0) {
+      console.log(`[buildImageReferences] No image data available, returning empty array`);
+      return [];
+  }
+
+  // Step 3: Filter answer chunks by minimum length
+  console.log(`[buildImageReferences] Step 3: Filtering answer chunks by minimum length`);
+  const validAnswerChunks: string[] = [];
+  const validAnswerChunkIndices: number[] = [];
+  const validAnswerChunkPositions: [number, number][] = [];
+
+  context.actionTracker.trackThink('cross_reference', schema.languageCode);
+
+  for (let i = 0; i < answerChunks.length; i++) {
+      const answerChunk = answerChunks[i];
+      const answerChunkPosition = answerChunkPositions[i];
+
+      if (!answerChunk.trim() || answerChunk.length < minChunkLength) continue;
+
+      validAnswerChunks.push(answerChunk);
+      validAnswerChunkIndices.push(i);
+      validAnswerChunkPositions.push(answerChunkPosition);
+  }
+
+  console.log(`[buildImageReferences] Found ${validAnswerChunks.length}/${answerChunks.length} valid answer chunks above minimum length`);
+
+  if (validAnswerChunks.length === 0) {
+      console.log(`[buildImageReferences] No valid answer chunks, returning empty array`);
+      return [];
+  }
+
+  // Step 4: Get embeddings for answer chunks
+  console.log(`[buildImageReferences] Step 4: Getting embeddings for answer chunks`);
+  const answerEmbeddings: number[][] = [];
+
+  try {
+      // NOTE(review): presumably matches the model/dimensions used for the image embeddings — confirm
+      const embeddingsResult = await getEmbeddings(validAnswerChunks, context.tokenTracker, {
+          dimensions: 512,
+          model: 'jina-clip-v2',
+      });
+      answerEmbeddings.push(...embeddingsResult.embeddings);
+
+      console.log(`[buildImageReferences] Got embeddings for ${answerEmbeddings.length} answer chunks`);
+
+      // Step 5: Compute pairwise cosine similarity
+      console.log(`[buildImageReferences] Step 5: Computing pairwise cosine similarity between answer and image embeddings`);
+      const allMatches = [];
+
+      for (let i = 0; i < validAnswerChunks.length; i++) {
+          const answerChunkIndex = validAnswerChunkIndices[i];
+          const answerChunk = validAnswerChunks[i];
+          // Index the filtered positions: i counts valid chunks, not raw answer chunks
+          const answerChunkPosition = validAnswerChunkPositions[i];
+          const answerEmbedding = answerEmbeddings[i];
+
+          const matchesForChunk = [];
+
+          for (const imageIndex of validImageIndices) {
+              const imageEmbedding = allImageEmbeddings[imageIndex];
+
+              if (imageEmbedding) {
+                  const score = cosineSimilarity(answerEmbedding, imageEmbedding);
+
+                  matchesForChunk.push({
+                      imageIndex,
+                      relevanceScore: score
+                  });
+              }
+          }
+
+          matchesForChunk.sort((a, b) => b.relevanceScore - a.relevanceScore);
+
+          for (const match of matchesForChunk) {
+              allMatches.push({
+                  imageIndex: match.imageIndex,
+                  answerChunkIndex: answerChunkIndex,
+                  relevanceScore: match.relevanceScore,
+                  answerChunk: answerChunk,
+                  answerChunkPosition: answerChunkPosition
+              });
+          }
+
+          console.log(`[buildImageReferences] Processed answer chunk ${i + 1}/${validAnswerChunks.length}, top score: ${matchesForChunk[0]?.relevanceScore.toFixed(4)}`);
+      }
+
+      // Log statistics about relevance scores
+      if (allMatches.length > 0) {
+          const relevanceScores = allMatches.map(match => match.relevanceScore);
+          const minRelevance = Math.min(...relevanceScores);
+          const maxRelevance = Math.max(...relevanceScores);
+          const sumRelevance = relevanceScores.reduce((sum, score) => sum + score, 0);
+          const meanRelevance = sumRelevance / relevanceScores.length;
+
+          console.log('Reference relevance statistics:', {
+              min: minRelevance.toFixed(4),
+              max: maxRelevance.toFixed(4),
+              mean: meanRelevance.toFixed(4),
+              count: relevanceScores.length
+          });
+      }
+
+      // Step 6: Sort all matches by relevance
+      allMatches.sort((a, b) => b.relevanceScore - a.relevanceScore);
+      console.log(`[buildImageReferences] Step 6: Sorted ${allMatches.length} potential matches by relevance score`);
+
+      // Step 7: Filter matches
+      console.log(`[buildImageReferences] Step 7: Filtering matches to ensure uniqueness and threshold (min: ${minRelScore})`);
+      const usedImages = new Set();
+      const usedAnswerChunks = new Set();
+      const filteredMatches = [];
+
+      for (const match of allMatches) {
+          if (match.relevanceScore < minRelScore) continue;
+
+          if (!usedImages.has(match.imageIndex) && !usedAnswerChunks.has(match.answerChunkIndex)) {
+              filteredMatches.push(match);
+              usedImages.add(match.imageIndex);
+              usedAnswerChunks.add(match.answerChunkIndex);
+
+              if (filteredMatches.length >= maxRef) break;
+          }
+      }
+
+      console.log(`[buildImageReferences] Selected ${filteredMatches.length}/${allMatches.length} references after filtering`);
+
+      const references: ImageReference[] = filteredMatches.map((match) => {
+          const source = imageObjects[match.imageIndex];
+          return {
+              url: source.url,
+              alt: source.alt,
+              relevanceScore: match.relevanceScore,
+              answerChunk: match.answerChunk,
+              answerChunkPosition: match.answerChunkPosition
+          };
+      });
+
+      return references;
+  } catch (error) {
+      console.error('Embedding failed', error);
+      return [];
+  }
 }