Files
2026-03-03 23:49:13 +01:00

106 lines
3.8 KiB
JavaScript

import { formatHtmlForPrompt } from './htmlrewrite.js';
import { createSmartDiff } from './diff-utils.js';
// Store last HTML snapshots per locator/page for diffing.
// Outer level: WeakMap keyed by the page object. Inner level: a Map from a
// snapshot key (see getSnapshotKey) to the last cleaned HTML string returned
// for that key.
const lastHtmlSnapshots = new WeakMap();
/**
 * Duck-type check for a page-like object.
 *
 * An object counts as a page when it exposes both `content` and `goto` as
 * functions. Always returns a strict boolean, including for null, undefined
 * and other falsy inputs.
 *
 * @param {*} obj - Value to inspect.
 * @returns {boolean} True when `obj` has callable `content` and `goto` members.
 */
function isPage(obj) {
    // `!= null` guards property access on null/undefined and keeps the result a boolean.
    return obj != null && typeof obj.content === 'function' && typeof obj.goto === 'function';
}
/**
 * Duck-type check for a regular-expression-like value.
 *
 * @param {*} value - Value to inspect.
 * @returns {boolean} True when `value` is a non-null object whose `test` and
 *   `exec` members are both functions.
 */
function isRegExp(value) {
    if (value === null || typeof value !== 'object') {
        return false;
    }
    // `every` stops at the first missing method, checking `test` before `exec`.
    return ['test', 'exec'].every((method) => typeof value[method] === 'function');
}
/**
 * Build the key under which a target's HTML snapshot is stored.
 *
 * @param {object} locator - A page-like object or an object exposing `selector()`.
 * @returns {string} `'page'` for page-like objects, otherwise `locator:` followed
 *   by the value returned from `locator.selector()`.
 */
function getSnapshotKey(locator) {
    return isPage(locator) ? 'page' : `locator:${locator.selector()}`;
}
/**
 * Decide whether a single line matches the search term.
 *
 * @param {string} line - One line of cleaned HTML.
 * @param {string|RegExp} search - Substring or regular-expression-like object.
 * @returns {boolean} True when the line matches.
 */
function lineMatchesSearch(line, search) {
    if (!isRegExp(search)) {
        return line.includes(search);
    }
    // RegExps with the `g` or `y` flag remember `lastIndex` between `test()`
    // calls. Without a reset, a match on one line makes the next line start
    // matching mid-string and real matches get skipped.
    if (search.global || search.sticky) {
        search.lastIndex = 0;
    }
    return search.test(line);
}
/**
 * Filter HTML down to matching lines plus surrounding context.
 *
 * At most 10 matching lines are collected (scanning stops there). Each match
 * contributes itself and up to 5 lines above and below; non-contiguous
 * sections are separated by a `---` line.
 *
 * @param {string} htmlStr - Cleaned HTML to search, split on `\n`.
 * @param {string|RegExp} search - Substring or regular-expression-like object.
 * @returns {string} The selected lines joined by `\n`, or `'No matches found'`.
 */
function searchHtmlLines(htmlStr, search) {
    const MAX_MATCHES = 10;
    const CONTEXT_LINES = 5;
    const lines = htmlStr.split('\n');
    const matchIndices = [];
    for (let i = 0; i < lines.length && matchIndices.length < MAX_MATCHES; i++) {
        if (lineMatchesSearch(lines[i], search)) {
            matchIndices.push(i);
        }
    }
    if (matchIndices.length === 0) {
        return 'No matches found';
    }
    // A Set de-duplicates lines shared by overlapping context windows.
    const includedLines = new Set();
    for (const idx of matchIndices) {
        const start = Math.max(0, idx - CONTEXT_LINES);
        const end = Math.min(lines.length - 1, idx + CONTEXT_LINES);
        for (let i = start; i <= end; i++) {
            includedLines.add(i);
        }
    }
    // Emit lines in document order, marking every gap between sections.
    const sortedIndices = [...includedLines].sort((a, b) => a - b);
    const result = [];
    sortedIndices.forEach((lineIdx, position) => {
        if (position > 0 && sortedIndices[position - 1] !== lineIdx - 1) {
            result.push('---');
        }
        result.push(lines[lineIdx]);
    });
    return result.join('\n');
}
/**
 * Return the cleaned HTML of a page or locator.
 *
 * The cleaned HTML is always stored as the latest snapshot for the target.
 * The return value is then, in order of precedence:
 *   1. a diff against the previous snapshot, when `showDiffSinceLastCall` is
 *      true and a previous snapshot exists (the search is not applied then);
 *   2. the matching lines with context, when `search` is truthy;
 *   3. the full cleaned HTML.
 *
 * @param {object} options
 * @param {object} options.locator - A page-like object (callable `content` and
 *   `goto`) or an object exposing `page()`, `innerHTML()` and `selector()`.
 * @param {string|RegExp} [options.search] - Substring or regular expression
 *   used to filter lines.
 * @param {boolean} [options.showDiffSinceLastCall] - Defaults to `true` when
 *   no search is given and `false` otherwise.
 * @param {boolean} [options.includeStyles=false] - Passed to
 *   `formatHtmlForPrompt` as `keepStyles`.
 * @param {number} [options.maxAttrLen=200] - Passed to `formatHtmlForPrompt`.
 * @param {number} [options.maxContentLen=500] - Passed to `formatHtmlForPrompt`.
 * @returns {Promise<string>} Diff text, search result, a status message, or the
 *   full cleaned HTML.
 */
export async function getCleanHTML(options) {
    const { locator, search, showDiffSinceLastCall = !search, includeStyles = false, maxAttrLen = 200, maxContentLen = 500, } = options;
    // Get raw HTML: whole document for a page, inner HTML for a locator.
    let rawHtml;
    let page;
    if (isPage(locator)) {
        page = locator;
        rawHtml = await locator.content();
    }
    else {
        page = locator.page();
        rawHtml = await locator.innerHTML();
    }
    // Clean the HTML using formatHtmlForPrompt
    const cleanedHtml = await formatHtmlForPrompt({
        html: rawHtml,
        keepStyles: includeStyles,
        maxAttrLen,
        maxContentLen,
    });
    // Sanitize to remove unpaired surrogates that break JSON encoding.
    // Falls back to the unchanged string where toWellFormed is unavailable.
    const htmlStr = cleanedHtml.toWellFormed?.() ?? cleanedHtml;
    // Store snapshot and handle diffing
    let pageSnapshots = lastHtmlSnapshots.get(page);
    if (!pageSnapshots) {
        pageSnapshots = new Map();
        lastHtmlSnapshots.set(page, pageSnapshots);
    }
    const snapshotKey = getSnapshotKey(locator);
    const previousSnapshot = pageSnapshots.get(snapshotKey);
    pageSnapshots.set(snapshotKey, htmlStr);
    // Diff defaults off when search is provided; when explicitly enabled
    // together with search, the diff is returned and the search is skipped.
    if (showDiffSinceLastCall && previousSnapshot) {
        const diffResult = createSmartDiff({
            oldContent: previousSnapshot,
            newContent: htmlStr,
            label: 'html',
        });
        if (diffResult.type === 'no-change') {
            return 'No changes since last call. Use showDiffSinceLastCall: false to see full content.';
        }
        return diffResult.content;
    }
    if (search) {
        return searchHtmlLines(htmlStr, search);
    }
    return htmlStr;
}
//# sourceMappingURL=clean-html.js.map