Skip to content

feat: gather images to response #98

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
577 changes: 574 additions & 3 deletions package-lock.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
"dependencies": {
"@ai-sdk/google": "^1.0.0",
"@ai-sdk/openai": "^1.1.9",
"@napi-rs/canvas": "^0.1.68",
"@types/jsdom": "^21.1.7",
"ai": "^4.1.26",
"axios": "^1.7.9",
Expand Down
21 changes: 16 additions & 5 deletions src/agent.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@ import {
KnowledgeItem,
EvaluationType,
BoostedSearchSnippet,
SearchSnippet, EvaluationResponse, Reference, SERPQuery, RepeatEvaluationType, UnNormalizedSearchSnippet, WebContent
SearchSnippet, EvaluationResponse, Reference, SERPQuery, RepeatEvaluationType, UnNormalizedSearchSnippet, WebContent,
ImageObject,
ImageReference
} from "./types";
import {TrackerContext} from "./types";
import {search} from "./tools/jina-search";
Expand All @@ -42,7 +44,7 @@ import {MAX_QUERIES_PER_STEP, MAX_REFLECT_PER_STEP, MAX_URLS_PER_STEP, Schemas}
import {formatDateBasedOnType, formatDateRange} from "./utils/date-tools";
import {repairUnknownChars} from "./tools/broken-ch-fixer";
import {reviseAnswer} from "./tools/md-fixer";
import {buildReferences} from "./tools/build-ref";
import {buildImageReferences, buildReferences} from "./tools/build-ref";

async function sleep(ms: number) {
const seconds = Math.ceil(ms / 1000);
Expand Down Expand Up @@ -394,7 +396,7 @@ export async function getResponse(question?: string,
onlyHostnames: string[] = [],
maxRef: number = 10,
minRelScore: number = 0.75
): Promise<{ result: StepAction; context: TrackerContext; visitedURLs: string[], readURLs: string[], allURLs: string[] }> {
): Promise<{ result: StepAction; context: TrackerContext; visitedURLs: string[], readURLs: string[], allURLs: string[], testImages: string[], imageReferences: ImageReference[] }> {

let step = 0;
let totalStep = 0;
Expand Down Expand Up @@ -445,6 +447,7 @@ export async function getResponse(question?: string,
const allWebContents: Record<string, WebContent> = {};
const visitedURLs: string[] = [];
const badURLs: string[] = [];
const imageObjects: ImageObject[] = [];
const evaluationMetrics: Record<string, RepeatEvaluationType[]> = {};
// reserve the final 15% of the budget for beast mode
const regularBudget = tokenBudget * 0.85;
Expand Down Expand Up @@ -849,6 +852,7 @@ You decided to think out of the box or cut from a completely different angle.
allURLs,
visitedURLs,
badURLs,
imageObjects,
SchemaGen,
currentQuestion,
allWebContents
Expand Down Expand Up @@ -1010,7 +1014,12 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
);
}

console.log(thisStep)
console.log(thisStep);
let imageReferences: any;
if(imageObjects.length) {
imageReferences = await buildImageReferences(answerStep.answer, imageObjects, context, SchemaGen);
console.log('**Image references**:', imageReferences);
}

// max return 300 urls
const returnedURLs = weightedURLs.slice(0, numReturnedURLs).map(r => r.url);
Expand All @@ -1019,7 +1028,9 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
context,
visitedURLs: returnedURLs,
readURLs: visitedURLs.filter(url => !badURLs.includes(url)),
allURLs: weightedURLs.map(r => r.url)
allURLs: weightedURLs.map(r => r.url),
testImages: imageObjects.map(i => i.url),
imageReferences: imageReferences,
};
}

Expand Down
19 changes: 13 additions & 6 deletions src/app.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import {
ChatCompletionResponse,
ChatCompletionChunk,
AnswerAction,
Model, StepAction, VisitAction
Model, StepAction, VisitAction,
} from './types';
import {TokenTracker} from "./utils/token-tracker";
import {ActionTracker} from "./utils/action-tracker";
Expand Down Expand Up @@ -501,7 +501,7 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
// Add content to queue for both thinking steps and final answer
if (step.action === 'visit') {
// emit every url in the visit action in url field
((step as VisitAction).URLTargets as string[]).forEach((url) => {
((step as VisitAction).URLTargets as string[])?.forEach((url) => {
const chunk: ChatCompletionChunk = {
id: requestId,
object: 'chat.completion.chunk',
Expand Down Expand Up @@ -547,7 +547,9 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
result: finalStep,
visitedURLs,
readURLs,
allURLs
allURLs,
testImages,
imageReferences,
} = await getResponse(undefined,
tokenBudget,
maxBadAttempts,
Expand Down Expand Up @@ -632,7 +634,9 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
usage,
visitedURLs,
readURLs,
numURLs: allURLs.length
numURLs: allURLs.length,
testImages,
imageReferences
};
res.write(`data: ${JSON.stringify(finalChunk)}\n\n`);
res.end();
Expand All @@ -658,7 +662,9 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
usage,
visitedURLs,
readURLs,
numURLs: allURLs.length
numURLs: allURLs.length,
testImages,
imageReferences: imageReferences,
};

// Log final response (excluding full content for brevity)
Expand All @@ -669,7 +675,8 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
usage: response.usage,
visitedURLs: response.visitedURLs,
readURLs: response.readURLs,
numURLs: allURLs.length
numURLs: allURLs.length,
testImages,
});

res.json(response);
Expand Down
184 changes: 182 additions & 2 deletions src/tools/build-ref.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import {segmentText} from './segment';
import {Reference, TrackerContext, WebContent} from "../types";
import {ImageObject, ImageReference, Reference, TrackerContext, WebContent} from "../types";
import {Schemas} from "../utils/schemas";
import {cosineSimilarity, jaccardRank} from "./cosine";
import {getEmbeddings} from "./embeddings";
Expand Down Expand Up @@ -41,7 +41,7 @@ export async function buildReferences(
};

// Track valid web chunks (above minimum length)
if (chunk.length >= minChunkLength) {
if (chunk?.length >= minChunkLength) {
validWebChunkIndices.add(chunkIndex);
}

Expand Down Expand Up @@ -363,4 +363,184 @@ function buildFinalResult(
answer: modifiedAnswer,
references
};
}

/**
 * Pairs images gathered during research with the answer passages they are
 * most similar to, using cosine similarity between embeddings.
 *
 * Pipeline:
 *  1. Segment `answer` into chunks via `segmentText`.
 *  2. Take the first embedding vector of every image (`embedding[0]`);
 *     images without one are ignored.
 *  3. Drop answer chunks that are blank or shorter than `minChunkLength`.
 *  4. Embed the remaining chunks with `jina-clip-v2` (512 dimensions).
 *     NOTE(review): presumably the image embeddings were produced with the
 *     same model/dimensions — confirm at the place they are computed.
 *  5. Score every (chunk, image) pair.
 *  6. Greedily pick the highest-scoring pairs so that each image and each
 *     answer chunk is used at most once, skipping scores below `minRelScore`
 *     and stopping at `maxRef` picks.
 *
 * @param answer         Final answer text to illustrate.
 * @param imageObjects   Candidate images with precomputed embeddings.
 * @param context        Tracker context; its action tracker receives a
 *                       `cross_reference` thinking step and its token tracker
 *                       is passed to the embedding call.
 * @param schema         Supplies the language code for the thinking step.
 * @param minChunkLength Minimum length (characters) for an answer chunk to be considered.
 * @param maxRef         Maximum number of image references to return.
 * @param minRelScore    Minimum cosine similarity for a pair to be accepted.
 * @returns The selected image references, best match first. Returns an empty
 *          array when there is nothing to match or when embedding/scoring
 *          throws (the error is logged, not rethrown).
 */
export async function buildImageReferences(
  answer: string,
  imageObjects: ImageObject[],
  context: TrackerContext,
  schema: Schemas,
  minChunkLength: number = 80,
  maxRef: number = 10,
  minRelScore: number = 0.35
): Promise<Array<ImageReference>> {
  type ImageMatch = {
    imageIndex: number;
    answerChunkIndex: number;
    relevanceScore: number;
    answerChunk: string;
    answerChunkPosition: [number, number];
  };

  console.log(`[buildImageReferences] Starting with maxRef=${maxRef}, minChunkLength=${minChunkLength}, minRelScore=${minRelScore}`);
  console.log(`[buildImageReferences] Answer length: ${answer.length} chars, Image sources: ${imageObjects.length}`);

  // Step 1: Chunk the answer
  console.log(`[buildImageReferences] Step 1: Chunking answer text`);
  const {chunks: answerChunks, chunk_positions: answerChunkPositions} = await segmentText(answer, context);
  console.log(`[buildImageReferences] Answer segmented into ${answerChunks.length} chunks`);

  // Step 2: Prepare image content
  console.log(`[buildImageReferences] Step 2: Preparing image content`);
  // Optional chaining so an image that arrived without an embedding is skipped
  // instead of throwing before the try/catch below is even entered.
  const imageEmbeddings: Array<number[] | undefined> = imageObjects.map(img => img.embedding?.[0]);
  const usableImageIndices: number[] = [];
  imageEmbeddings.forEach((embedding, index) => {
    if (embedding) usableImageIndices.push(index);
  });

  console.log(`[buildImageReferences] Collected ${usableImageIndices.length} image embeddings`);

  if (usableImageIndices.length === 0) {
    console.log(`[buildImageReferences] No image data available, returning empty array`);
    return [];
  }

  // Step 3: Filter answer chunks by minimum length
  console.log(`[buildImageReferences] Step 3: Filtering answer chunks by minimum length`);
  const validAnswerChunks: string[] = [];
  const validAnswerChunkIndices: number[] = [];
  const validAnswerChunkPositions: [number, number][] = [];

  context.actionTracker.trackThink('cross_reference', schema.languageCode);

  for (let i = 0; i < answerChunks.length; i++) {
    const answerChunk = answerChunks[i];

    if (!answerChunk.trim() || answerChunk.length < minChunkLength) continue;

    validAnswerChunks.push(answerChunk);
    validAnswerChunkIndices.push(i);
    validAnswerChunkPositions.push(answerChunkPositions[i]);
  }

  console.log(`[buildImageReferences] Found ${validAnswerChunks.length}/${answerChunks.length} valid answer chunks above minimum length`);

  if (validAnswerChunks.length === 0) {
    console.log(`[buildImageReferences] No valid answer chunks, returning empty array`);
    return [];
  }

  // Step 4: Get embeddings for answer chunks
  console.log(`[buildImageReferences] Step 4: Getting embeddings for answer chunks`);

  try {
    const embeddingsResult = await getEmbeddings(validAnswerChunks, context.tokenTracker, {
      dimensions: 512,
      model: 'jina-clip-v2',
    });
    const answerEmbeddings: number[][] = [...embeddingsResult.embeddings];

    console.log(`[buildImageReferences] Got embeddings for ${answerEmbeddings.length} answer chunks`);

    // Step 5: Compute pairwise cosine similarity
    console.log(`[buildImageReferences] Step 5: Computing pairwise cosine similarity between answer and image embeddings`);
    const allMatches: ImageMatch[] = [];

    for (let i = 0; i < validAnswerChunks.length; i++) {
      const answerEmbedding = answerEmbeddings[i];
      // Defensive: skip a chunk if the embedding service returned fewer vectors than requested.
      if (!answerEmbedding) continue;

      // `i` indexes the *filtered* arrays, so the position must come from
      // validAnswerChunkPositions; reading the unfiltered list here would pair
      // the chunk text with another chunk's offsets once anything was filtered out.
      const answerChunkIndex = validAnswerChunkIndices[i];
      const answerChunk = validAnswerChunks[i];
      const answerChunkPosition = validAnswerChunkPositions[i];

      let topScore: number | undefined;

      for (const imageIndex of usableImageIndices) {
        const imageEmbedding = imageEmbeddings[imageIndex];
        if (!imageEmbedding) continue;

        const relevanceScore = cosineSimilarity(answerEmbedding, imageEmbedding);
        if (topScore === undefined || relevanceScore > topScore) topScore = relevanceScore;

        allMatches.push({
          imageIndex,
          answerChunkIndex,
          relevanceScore,
          answerChunk,
          answerChunkPosition
        });
      }

      console.log(`[buildImageReferences] Processed answer chunk ${i + 1}/${validAnswerChunks.length}, top score: ${topScore?.toFixed(4)}`);
    }

    // Log statistics about relevance scores.
    // Computed in one pass rather than Math.min(...scores)/Math.max(...scores):
    // the match list has chunks x images entries and spreading a very large
    // array into a call can exceed the engine's argument limit.
    if (allMatches.length > 0) {
      let minRelevance = Infinity;
      let maxRelevance = -Infinity;
      let sumRelevance = 0;
      for (const {relevanceScore} of allMatches) {
        if (relevanceScore < minRelevance) minRelevance = relevanceScore;
        if (relevanceScore > maxRelevance) maxRelevance = relevanceScore;
        sumRelevance += relevanceScore;
      }

      console.log('Reference relevance statistics:', {
        min: minRelevance.toFixed(4),
        max: maxRelevance.toFixed(4),
        mean: (sumRelevance / allMatches.length).toFixed(4),
        count: allMatches.length
      });
    }

    // Step 6: Sort all matches by relevance (best first)
    allMatches.sort((a, b) => b.relevanceScore - a.relevanceScore);
    console.log(`[buildImageReferences] Step 6: Sorted ${allMatches.length} potential matches by relevance score`);

    // Step 7: Filter matches
    console.log(`[buildImageReferences] Step 7: Filtering matches to ensure uniqueness and threshold (min: ${minRelScore})`);
    const usedImages = new Set<number>();
    const usedAnswerChunks = new Set<number>();
    const filteredMatches: ImageMatch[] = [];

    for (const match of allMatches) {
      if (match.relevanceScore < minRelScore) continue;
      // Each image and each answer chunk may appear in at most one reference.
      if (usedImages.has(match.imageIndex) || usedAnswerChunks.has(match.answerChunkIndex)) continue;

      filteredMatches.push(match);
      usedImages.add(match.imageIndex);
      usedAnswerChunks.add(match.answerChunkIndex);

      if (filteredMatches.length >= maxRef) break;
    }

    console.log(`[buildImageReferences] Selected ${filteredMatches.length}/${allMatches.length} references after filtering`);

    const references: ImageReference[] = filteredMatches.map((match) => {
      const source = imageObjects[match.imageIndex];
      return {
        url: source.url,
        alt: source.alt,
        relevanceScore: match.relevanceScore,
        answerChunk: match.answerChunk,
        answerChunkPosition: match.answerChunkPosition
      };
    });

    return references;

  } catch (error) {
    // Best-effort feature: a failure here must not break the main answer flow.
    console.error('Embedding failed', error);
    return [];
  }
}
Loading
Loading