In particular, it tries to maintain the following features:
*
* - Links are maintained, with the 'href' copied over
*
* - Information in the <head> is lost
*
*
* @param string $html the input HTML
* @return string the HTML converted, as best as possible, to text
* @throws sfException if the HTML could not be loaded
*/
public static function convert($html)
{
  // normalise all newline styles to "\n" before parsing
  $html = self::fixNewLines($html);

  $doc = new DOMDocument();
  // NOTE(review): prepending an empty string only coerces $html to a string.
  // Together with the processing-instruction clean-up below this looks like
  // the remnant of a prefix hack (e.g. an encoding declaration) - confirm.
  $html = ''.$html;
  if (!$doc->loadHTML($html))
  {
    throw new sfException("Could not load HTML - badly formed?");
  }

  // Strip top-level processing instructions (the "hack" nodes).
  // $doc->childNodes is a live list, so removing a node while iterating it
  // directly can cause later nodes to be missed; iterate a snapshot instead.
  foreach (iterator_to_array($doc->childNodes) as $item)
  {
    if ($item->nodeType === XML_PI_NODE)
    {
      $doc->removeChild($item);
    }
  }

  $output = self::iterateOverNode($doc);

  // remove leading and trailing spaces on each line
  $output = preg_replace("/[ \t]*\n[ \t]*/im", "\n", $output);

  // remove leading and trailing whitespace of the whole text
  return trim($output);
}
/**
 * Normalise line endings to "\n".
 *
 * Windows ("\r\n") sequences are converted first, then any remaining lone
 * carriage returns ("\r"), so each original line break yields exactly one "\n".
 *
 * @param string $text text with any mix of \r, \r\n and \n line endings
 * @return string the text using only \n as line ending
 */
protected static function fixNewLines($text)
{
  // The search entries are applied in order: "\r\n" must be handled before
  // "\r", otherwise a Windows line ending would become two newlines.
  return str_replace(array("\r\n", "\r"), "\n", $text);
}
/**
 * Get the tag name of the closest following sibling that is an element.
 *
 * Siblings that are not DOMElement instances (text, comments, ...) are skipped.
 *
 * @param DOMNode $node the node whose following siblings are inspected
 * @return string|null the lower-cased element name, or null if no element follows
 */
protected static function nextChildName($node)
{
  for ($sibling = $node->nextSibling; $sibling != null; $sibling = $sibling->nextSibling)
  {
    if ($sibling instanceof DOMElement)
    {
      return strtolower($sibling->nodeName);
    }
  }

  // ran off the end of the sibling chain without meeting an element
  return null;
}
/**
 * Check whether the parent of an element has the given node name.
 *
 * @param DOMElement $node the element whose parent is inspected
 * @param string $name the node name to compare against (compared as-is)
 * @return bool|null true/false for the comparison, or null if the element has no parent
 */
protected static function getHasParentNodeName(DomElement $node, $name)
{
  $parent = $node->parentNode;
  if (!$parent)
  {
    // detached element: there is nothing to compare
    return null;
  }

  return $parent->nodeName == $name;
}
/**
 * Check whether an element has a direct child with the given node name.
 *
 * Only immediate children are inspected, not deeper descendants.
 *
 * @param DOMElement $node the element whose children are inspected
 * @param string $name the node name to look for (compared as-is)
 * @return bool true if at least one direct child has that node name
 */
protected static function getHasChildNodeName(DomElement $node, $name)
{
  $children = $node->childNodes;
  if (!$children)
  {
    return false;
  }

  for ($i = 0; $i < $children->length; $i++)
  {
    if ($children->item($i)->nodeName == $name)
    {
      return true;
    }
  }

  return false;
}
/**
 * Get the tag name of the closest preceding sibling that is an element.
 *
 * Siblings that are not DOMElement instances (text, comments, ...) are skipped.
 *
 * @param DOMNode $node the node whose preceding siblings are inspected
 * @return string|null the lower-cased element name, or null if no element precedes
 */
protected static function prevChildName($node)
{
  for ($sibling = $node->previousSibling; $sibling != null; $sibling = $sibling->previousSibling)
  {
    if ($sibling instanceof DOMElement)
    {
      return strtolower($sibling->nodeName);
    }
  }

  // reached the start of the sibling chain without meeting an element
  return null;
}
/**
 * Recursively render a DOM node and all of its descendants as plain text.
 *
 * - text nodes have every run of whitespace collapsed into a single space
 * - doctype nodes and the style/head/title/meta/script tags produce no output
 * - <hr> becomes a dashed line
 * - headings, <p>, <div> and <li> are surrounded by newlines (<li> is prefixed with " - ")
 * - <a href="..."> becomes "[text](href)", or just the text when it equals the href;
 *   an anchor without href but with a name becomes "[text]"
 * - any other tag simply contributes the text of its children
 *
 * @param DOMNode $node the node to render
 * @return string the text representation of the node
 */
protected static function iterateOverNode($node)
{
  if ($node instanceof DOMText)
  {
    return preg_replace("/\\s+/im", " ", $node->wholeText);
  }
  if ($node instanceof DOMDocumentType)
  {
    // the doctype carries no text
    return "";
  }

  // the element following this node decides some of the trailing whitespace
  $nextName = self::nextChildName($node);
  $name = strtolower($node->nodeName);

  // start whitespace
  switch ($name)
  {
    case "hr":
      return "------\n";

    case "style":
    case "head":
    case "title":
    case "meta":
    case "script":
      // ignore these tags, including everything inside them
      return "";

    case "h1":
    case "h2":
    case "h3":
    case "h4":
    case "h5":
    case "h6":
    case "p":
    case "div":
      // start on a new line
      $output = "\n";
      break;

    case "li":
      $output = "\n - ";
      break;

    default:
      // print out contents of unknown tags
      $output = "";
      break;
  }

  // Render the children. The child list is checked first because it may be
  // null for node types that cannot hold children (e.g. comments on older
  // PHP versions); dereferencing it unguarded raises a notice there.
  $children = $node->childNodes;
  if ($children)
  {
    for ($i = 0; $i < $children->length; $i++)
    {
      $output .= self::iterateOverNode($children->item($i));
    }
  }

  // end whitespace (ignored tags never get here, they returned above)
  switch ($name)
  {
    case "h1":
    case "h2":
    case "h3":
    case "h4":
    case "h5":
    case "h6":
    case "li":
      $output .= "\n";
      break;

    case "p":
    case "br":
      // add one line, unless a div follows (it brings its own leading newline)
      if ($nextName !== "div")
      {
        $output .= "\n";
      }
      break;

    case "div":
      // add one line only if there is a next element and it isn't a div
      if ($nextName !== null && $nextName !== "div")
      {
        $output .= "\n";
      }
      break;

    case "a":
      // links are returned in [text](link) format
      $href = $node->getAttribute("href");
      if ($href === "")
      {
        // it doesn't link anywhere; only mark it up if it is a named anchor
        if ($node->getAttribute("name") !== "")
        {
          $output = "[$output]";
        }
      }
      elseif ($href !== $output)
      {
        // Strict comparison: a loose == treats numeric-looking strings such
        // as "1e3" and "1000" as equal and would drop the link target.
        $output = "[$output]($href)";
      }
      // else: the link text already is the address, so keep it as-is

      // a heading directly after the link needs additional whitespace
      if (in_array($nextName, array("h1", "h2", "h3", "h4", "h5", "h6"), true))
      {
        $output .= "\n";
      }
      break;

    default:
      // no trailing whitespace for other tags
      break;
  }

  return $output;
}
}