import posthtml from 'posthtml';
import beautify from 'posthtml-beautify';

/**
 * Reduce an HTML string to a compact form suitable for inclusion in a prompt.
 *
 * The HTML is run through a chain of posthtml plugins that, in order:
 *  1. drop comments, non-content tags (script, style, svg, head, ...) and every
 *     attribute that is not on the keep-list or prefixed with `data-`, while
 *     truncating long text and attribute values;
 *  2. drop subtrees marked `aria-hidden="true"`;
 *  3. drop `<img>` elements whose `alt` is empty or missing;
 *  4. drop subtrees that contain neither text nor a useful actionable element;
 *  5. drop elements that have neither attributes nor non-whitespace content;
 *  6. replace attribute-less wrappers around a single element with that element;
 *  7. pretty-print the result with posthtml-beautify.
 *
 * @param {object} options
 * @param {string} options.html - HTML source to simplify.
 * @param {boolean} [options.keepStyles=false] - Also keep `style` and `class` attributes.
 * @param {number} [options.maxAttrLen=200] - Maximum kept length of an attribute value.
 * @param {number} [options.maxContentLen=500] - Maximum kept length of a text node.
 * @returns {Promise<string>} The simplified, beautified HTML.
 */
export async function formatHtmlForPrompt({
  html,
  keepStyles = false,
  maxAttrLen = 200,
  maxContentLen = 500,
}) {
  const tagsToRemove = new Set(['hint', 'style', 'link', 'script', 'meta', 'noscript', 'svg', 'head']);

  const attributesToKeep = new Set([
    // Standard descriptive attributes
    'label', 'title', 'alt', 'href', 'name', 'value', 'checked', 'placeholder', 'type', 'role', 'target',
    // Descriptive aria attributes (text content)
    'aria-label', 'aria-placeholder', 'aria-valuetext', 'aria-roledescription',
    // Useful aria state attributes
    'aria-hidden', 'aria-expanded', 'aria-checked', 'aria-selected', 'aria-disabled', 'aria-pressed',
    'aria-required', 'aria-current',
    // Test IDs (data-testid, data-test, data-cy, data-qa are covered by the data-* prefix)
    'testid', 'test-id', 'tid', 'qa', 'qa-id', 'e2e', 'e2e-id', 'automation-id', 'automationid',
    'selenium', 'pw', 'vimium-label',
    // Conditionally added below: 'style', 'class'
  ]);
  if (keepStyles) {
    attributesToKeep.add('style');
    attributesToKeep.add('class');
  }

  // Cut `str` to `maxLen` characters and append a note with the number of dropped characters.
  const truncate = (str, maxLen) => {
    if (str.length <= maxLen) return str;
    const remaining = str.length - maxLen;
    return `${str.slice(0, maxLen)}...${remaining} more characters`;
  };

  // True for tree nodes that are element objects (as opposed to text, null, numbers, ...).
  const isElement = (node) => typeof node === 'object' && node !== null && Boolean(node.tag);

  // Comments reach the plugins as string nodes, so they are recognised by their delimiters.
  const isCommentNode = (node) => {
    if (typeof node !== 'string') return false;
    const trimmed = node.trim();
    return trimmed.startsWith('<!--') && trimmed.endsWith('-->');
  };

  // Run `pass` over the tree repeatedly until its JSON serialisation stops changing.
  const applyUntilStable = (tree, pass) => {
    let result = pass(tree);
    let previousJson = '';
    let currentJson = JSON.stringify(result);
    while (previousJson !== currentJson) {
      previousJson = currentJson;
      result = pass(result);
      currentJson = JSON.stringify(result);
    }
    return result;
  };

  // Build a plugin that removes every element (and its subtree) for which `shouldRemove` is true.
  const createPrunePlugin = (shouldRemove) => () => (tree) => {
    const processNode = (node) => {
      if (!isElement(node)) return node;
      if (shouldRemove(node)) return null;
      if (Array.isArray(node.content)) {
        node.content = node.content.map(processNode).filter((item) => item !== null);
      }
      return node;
    };
    return tree.map(processNode).filter((item) => item !== null);
  };

  // Plugin to remove comments and unwanted tags, filter attributes and truncate long values
  const removeTagsAndAttrsPlugin = () => (tree) => {
    const processNode = (node) => {
      if (typeof node === 'string') {
        // Drop comments BEFORE truncating: a truncated comment loses its closing
        // delimiter and would no longer be recognised as a comment.
        if (isCommentNode(node)) return null;
        // Whitespace-only text is passed through untouched
        if (node.trim().length === 0) return node;
        return truncate(node, maxContentLen);
      }
      if (typeof node !== 'object' || node === null) return node;

      // Remove unwanted tags
      if (node.tag && tagsToRemove.has(node.tag.toLowerCase())) {
        return null;
      }

      // Keep only allow-listed and data-* attributes, truncating string values
      if (node.attrs) {
        const keptAttrs = {};
        for (const [attr, value] of Object.entries(node.attrs)) {
          if (attr.startsWith('data-') || attributesToKeep.has(attr)) {
            keptAttrs[attr] = typeof value === 'string' ? truncate(value, maxAttrLen) : value;
          }
        }
        node.attrs = keptAttrs;
      }

      // Process content recursively
      if (Array.isArray(node.content)) {
        node.content = node.content.map(processNode).filter((item) => item !== null);
      }
      return node;
    };

    return tree.map(processNode).filter((item) => item !== null);
  };

  // Plugin to remove aria-hidden="true" subtrees entirely
  // These are hidden from assistive tech and usually decorative
  const removeAriaHiddenPlugin = createPrunePlugin(
    (node) => node.attrs?.['aria-hidden'] === 'true',
  );

  // Plugin to remove images with empty or missing alt text (purely decorative)
  // Runs before decorative subtree pruning so containers become empty
  const removeEmptyAltImagesPlugin = createPrunePlugin((node) => {
    if (node.tag.toLowerCase() !== 'img') return false;
    const alt = node.attrs?.alt;
    return alt === '' || alt === undefined;
  });

  // Plugin to remove decorative subtrees that have no useful content for agents
  // A subtree is decorative if it has:
  // - No text content (leaf text nodes)
  // - No actionable elements with meaningful attributes
  const removeDecorativeSubtreesPlugin = () => {
    const actionableTags = new Set(['button', 'a', 'input', 'select', 'textarea']);
    const meaningfulAttrs = ['aria-label', 'title', 'alt', 'value', 'placeholder', 'href', 'name'];
    // Form elements are always actionable, keep unconditionally
    const formTags = new Set(['input', 'select', 'textarea']);
    // Structural elements that are kept even when they end up without useful content
    const semanticTags = new Set([
      'html', 'body', 'main', 'header', 'footer', 'nav', 'section', 'article', 'aside',
    ]);

    const isNonBlankString = (value) => typeof value === 'string' && value.trim().length > 0;

    // Check if a subtree has any useful content
    const hasUsefulContent = (node) => {
      if (typeof node === 'string') return node.trim().length > 0;
      if (!isElement(node)) return false;

      const tag = node.tag.toLowerCase();

      // Form elements are always useful for agents to interact with
      if (formTags.has(tag)) return true;

      // Images with non-empty alt text are useful (descriptive content)
      if (tag === 'img' && isNonBlankString(node.attrs?.alt)) return true;

      // Actionable elements count when at least one meaningful attribute is non-blank
      if (
        actionableTags.has(tag)
        && node.attrs
        && meaningfulAttrs.some((attr) => isNonBlankString(node.attrs[attr]))
      ) {
        return true;
      }

      // Otherwise the subtree is useful only if one of its children is
      return Array.isArray(node.content) && node.content.some(hasUsefulContent);
    };

    return (tree) => {
      const processNode = (node) => {
        if (!isElement(node)) return node;

        // First process children
        if (Array.isArray(node.content)) {
          node.content = node.content.map(processNode).filter((item) => item !== null);
        }

        // Semantic elements (body, main, etc.) are never pruned here
        if (semanticTags.has(node.tag.toLowerCase())) return node;

        // If no useful content is left in this subtree, remove it
        return hasUsefulContent(node) ? node : null;
      };
      return tree.map(processNode).filter((item) => item !== null);
    };
  };

  // Plugin to unwrap unnecessary nested wrapper elements
  // e.g., <div><div>text</div></div> -> <div>text</div>
  // NOTE(review): any attribute-less parent with a single element child is replaced,
  // including e.g. a <button> or <a> wrapping one <span> — confirm this is intended.
  const unwrapNestedWrappersPlugin = () => (tree) => {
    const isWhitespaceOnly = (node) => typeof node === 'string' && node.trim().length === 0;
    const hasNoAttrs = (node) => !node.attrs || Object.keys(node.attrs).length === 0;

    const unwrapNode = (node) => {
      if (!isElement(node)) return node;

      // First, recursively process children
      if (Array.isArray(node.content)) {
        node.content = node.content.map(unwrapNode);
      }

      // A node is an unnecessary wrapper when it has no attributes and exactly
      // one non-whitespace child, and that child is an element (not text)
      if (hasNoAttrs(node) && Array.isArray(node.content)) {
        const significantChildren = node.content.filter((child) => !isWhitespaceOnly(child));
        if (significantChildren.length === 1 && isElement(significantChildren[0])) {
          // Replace this node with its child
          return significantChildren[0];
        }
      }
      return node;
    };

    // Apply multiple passes until stable (handles deeply nested wrappers)
    return applyUntilStable(tree, (nodes) => nodes.map(unwrapNode));
  };

  // Plugin to remove empty elements (no attrs, no content)
  // Runs repeatedly until no more empty elements exist
  // NOTE(review): this also drops attribute-less form controls (e.g. a bare <input>)
  // that the decorative-subtree pass deliberately keeps — confirm this is intended.
  const removeEmptyElementsPlugin = () => (tree) => {
    const isEmptyElement = (node) => {
      if (!isElement(node)) return false;
      const hasAttrs = node.attrs && Object.keys(node.attrs).length > 0;
      // Whitespace-only text does not count as content; any non-string child does
      const hasContent = node.content
        && node.content.some((child) => (typeof child === 'string' ? child.trim().length > 0 : true));
      return !hasAttrs && !hasContent;
    };

    const removeEmpty = (content) => {
      if (!content || !Array.isArray(content)) return content;
      return content
        .map((node) => {
          if (typeof node !== 'object' || node === null) return node;
          // Clean children first so a parent emptied by this pass is caught as well
          if (node.content) {
            node.content = removeEmpty(node.content);
          }
          return node;
        })
        .filter((node) => !isEmptyElement(node));
    };

    // Apply multiple passes until stable
    return applyUntilStable(tree, removeEmpty);
  };

  // Process HTML
  const processor = posthtml()
    .use(removeTagsAndAttrsPlugin())
    .use(removeAriaHiddenPlugin())
    .use(removeEmptyAltImagesPlugin())
    .use(removeDecorativeSubtreesPlugin())
    .use(removeEmptyElementsPlugin())
    .use(unwrapNestedWrappersPlugin())
    .use(beautify({
      rules: {
        indent: 1, // 1-space indent
        blankLines: false, // no extra blank lines
        maxlen: 100000, // effectively never wrap by content length
      },
      jsBeautifyOptions: {
        wrap_line_length: 0, // disable js-beautify wrapping
        preserve_newlines: false, // reduce stray newlines
      },
    }));

  // Process with await
  const result = await processor.process(html);
  return result.html;
}
//# sourceMappingURL=htmlrewrite.js.map