b2b.redline.com.pl/modules/newsletterpro/libraries/htmltotext/NewsletterProHtml2Text.php

<?php

class NewsletterProHtml2Text
{
    public static function defaultOptions()
    {
        return array(
            'ignore_errors' => false,
            'drop_links'    => false,
        );
    }

    /**
     * Tries to convert the given HTML into a plain text format - best suited for
     * e-mail display, etc.
     *
     * <p>In particular, it tries to maintain the following features:
     * <ul>
     *   <li>Links are maintained, with the 'href' copied over
     *   <li>Information in the &lt;head&gt; is lost
     * </ul>
     *
     * @param string $html the input HTML
     * @param boolean $ignore_error Ignore xml parsing errors
     * @return string the HTML converted, as best as possible, to text
     * @throws NewsletterProHtml2TextException if the HTML could not be loaded as a {@link \DOMDocument}
     */
    public static function convert($html, $options = array())
    {
        if ($options === false || $options === true) {
            // Using old style (< 1.0) of passing in options
            $options = array('ignore_errors' => $options);
        }

        $options = array_merge(static::defaultOptions(), $options);

        // check all options are valid
        foreach ($options as $key => $value) {
            if (!in_array($key, array_keys(static::defaultOptions()))) {
                throw new \InvalidArgumentException("Unknown html2text option '$key'");
            }
        }

        $is_office_document = static::isOfficeDocument($html);

        if ($is_office_document) {
            // remove office namespace
            $html = str_replace(array("<o:p>", "</o:p>"), "", $html);
        }

        $html = static::fixNewlines($html);
        if (mb_detect_encoding($html, "UTF-8", true)) {
            $html = mb_convert_encoding($html, "HTML-ENTITIES", "UTF-8");
        }

        $doc = static::getDocument($html, $options['ignore_errors']);

        $output = static::iterateOverNode($doc, null, false, $is_office_document, $options);

        // process output for whitespace/newlines
        $output = static::processWhitespaceNewlines($output);

        return $output;
    }

    /**
     * Unify newlines; in particular, \r\n becomes \n, and
     * then \r becomes \n. This means that all newlines (Unix, Windows, Mac)
     * all become \ns.
     *
     * @param string $text text with any number of \r, \r\n and \n combinations
     * @return string the fixed text
     */
    public static function fixNewlines($text)
    {
        // replace \r\n to \n
        $text = str_replace("\r\n", "\n", $text);
        // remove \rs
        $text = str_replace("\r", "\n", $text);

        return $text;
    }

    public static function nbspCodes()
    {
        return array(
            "\xc2\xa0",
            "\u00a0",
        );
    }

    public static function zwnjCodes()
    {
        return array(
            "\xe2\x80\x8c",
            "\u200c",
        );
    }

    /**
     * Remove leading or trailing spaces and excess empty lines from provided multiline text
     *
     * @param string $text multiline text any number of leading or trailing spaces or excess lines
     * @return string the fixed text
     */
    public static function processWhitespaceNewlines($text)
    {

        // remove excess spaces around tabs
        $text = preg_replace("/ *\t */im", "\t", $text);

        // remove leading whitespace
        $text = ltrim($text);

        // remove leading spaces on each line
        $text = preg_replace("/\n[ \t]*/im", "\n", $text);

        // convert non-breaking spaces to regular spaces to prevent output issues,
        // do it here so they do NOT get removed with other leading spaces, as they
        // are sometimes used for indentation
        $text = static::renderText($text);

        // remove trailing whitespace
        $text = rtrim($text);

        // remove trailing spaces on each line
        $text = preg_replace("/[ \t]*\n/im", "\n", $text);

        // unarmor pre blocks
        $text = static::fixNewLines($text);

        // remove unnecessary empty lines
        $text = preg_replace("/\n\n\n*/im", "\n\n", $text);

        return $text;
    }

    /**
     * Parse HTML into a DOMDocument
     *
     * @param string $html the input HTML
     * @param boolean $ignore_error Ignore xml parsing errors
     * @return \DOMDocument the parsed document tree
     */
    public static function getDocument($html, $ignore_error = false)
    {
        $doc = new \DOMDocument();

        $html = trim($html);

        if (!$html) {
            // DOMDocument doesn't support empty value and throws an error
            // Return empty document instead
            return $doc;
        }

        if ($html[0] !== '<') {
            // If HTML does not begin with a tag, we put a body tag around it.
            // If we do not do this, PHP will insert a paragraph tag around
            // the first block of text for some reason which can mess up
            // the newlines. See pre.html test for an example.
            $html = '<body>' . $html . '</body>';
        }

        if ($ignore_error) {
            $doc->strictErrorChecking = false;
            $doc->recover = true;
            $doc->xmlStandalone = true;
            $old_internal_errors = libxml_use_internal_errors(true);
            $load_result = $doc->loadHTML($html, LIBXML_NOWARNING | LIBXML_NOERROR | LIBXML_NONET | LIBXML_PARSEHUGE);
            libxml_use_internal_errors($old_internal_errors);
        } else {
            $load_result = $doc->loadHTML($html);
        }

        if (!$load_result) {
            throw new NewsletterProHtml2TextException("Could not load HTML - badly formed?", $html);
        }

        return $doc;
    }

    /**
     * Can we guess that this HTML is generated by Microsoft Office?
     */
    public static function isOfficeDocument($html)
    {
        return strpos($html, "urn:schemas-microsoft-com:office") !== false;
    }

    /**
     * Replace any special characters with simple text versions, to prevent output issues:
     * - Convert non-breaking spaces to regular spaces; and
     * - Convert zero-width non-joiners to '' (nothing).
     *
     * This is to match our goal of rendering documents as they would be rendered
     * by a browser.
     */
    public static function renderText($text)
    {
        $text = str_replace(static::nbspCodes(), " ", $text);
        $text = str_replace(static::zwnjCodes(), "", $text);
        return $text;
    }

    public static function isWhitespace($text)
    {
        return strlen(trim(static::renderText($text), "\n\r\t ")) === 0;
    }

    public static function nextChildName($node)
    {
        // get the next child
        $nextNode = $node->nextSibling;
        while ($nextNode != null) {
            if ($nextNode instanceof \DOMText) {
                if (!static::isWhitespace($nextNode->wholeText)) {
                    break;
                }
            }

            if ($nextNode instanceof \DOMElement) {
                break;
            }

            $nextNode = $nextNode->nextSibling;
        }

        $nextName = null;
        if (($nextNode instanceof \DOMElement || $nextNode instanceof \DOMText) && $nextNode != null) {
            $nextName = strtolower($nextNode->nodeName);
        }

        return $nextName;
    }

    public static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_office_document = false, $options)
    {
        if ($node instanceof \DOMText) {
            // Replace whitespace characters with a space (equivilant to \s)
            if ($in_pre) {
                $text = "\n" . trim(static::renderText($node->wholeText), "\n\r\t ") . "\n";

                // Remove trailing whitespace only
                $text = preg_replace("/[ \t]*\n/im", "\n", $text);

                // armor newlines with \r.
                return str_replace("\n", "\r", $text);
            } else {
                $text = static::renderText($node->wholeText);
                $text = preg_replace("/[\\t\\n\\f\\r ]+/im", " ", $text);

                if (!static::isWhitespace($text) && ($prevName == 'p' || $prevName == 'div')) {
                    return "\n" . $text;
                }
                return $text;
            }
        }

        if ($node instanceof \DOMDocumentType || $node instanceof \DOMProcessingInstruction) {
            // ignore
            return "";
        }

        $name = strtolower($node->nodeName);
        $nextName = static::nextChildName($node);

        // start whitespace
        switch ($name) {
            case "hr":
                $prefix = '';
                if ($prevName != null) {
                    $prefix = "\n";
                }
                return $prefix . "---------------------------------------------------------------\n";

            case "style":
            case "head":
            case "title":
            case "meta":
            case "script":
                // ignore these tags
                return "";

            case "h1":
            case "h2":
            case "h3":
            case "h4":
            case "h5":
            case "h6":
            case "ol":
            case "ul":
            case "pre":
                // add two newlines
                $output = "\n\n";
                break;

            case "td":
            case "th":
                // add tab char to separate table fields
               $output = "\t";
               break;

            case "p":
                // Microsoft exchange emails often include HTML which, when passed through
                // html2text, results in lots of double line returns everywhere.
                //
                // To fix this, for any p element with a className of `MsoNormal` (the standard
                // classname in any Microsoft export or outlook for a paragraph that behaves
                // like a line return) we skip the first line returns and set the name to br.
                if ($is_office_document && $node->getAttribute('class') == 'MsoNormal') {
                    $output = "";
                    $name = 'br';
                    break;
                }

                // add two lines
                $output = "\n\n";
                break;

            case "tr":
                // add one line
                $output = "\n";
                break;

            case "div":
                $output = "";
                if ($prevName !== null) {
                    // add one line
                    $output .= "\n";
                }
                break;

            case "li":
                $output = "- ";
                break;

            default:
                // print out contents of unknown tags
                $output = "";
                break;
        }

        // debug
        //$output .= "[$name,$nextName]";

        if (isset($node->childNodes)) {
            $n = $node->childNodes->item(0);
            $previousSiblingNames = array();
            $previousSiblingName = null;

            $parts = array();
            $trailing_whitespace = 0;

            while ($n != null) {
                $text = static::iterateOverNode($n, $previousSiblingName, $in_pre || $name == 'pre', $is_office_document, $options);

                // Pass current node name to next child, as previousSibling does not appear to get populated
                if ($n instanceof \DOMDocumentType
                    || $n instanceof \DOMProcessingInstruction
                    || ($n instanceof \DOMText && static::isWhitespace($text))) {
                    // Keep current previousSiblingName, these are invisible
                    $trailing_whitespace++;
                } else {
                    $previousSiblingName = strtolower($n->nodeName);
                    $previousSiblingNames[] = $previousSiblingName;
                    $trailing_whitespace = 0;
                }

                $node->removeChild($n);
                $n = $node->childNodes->item(0);

                $parts[] = $text;
            }

            // Remove trailing whitespace, important for the br check below
            while ($trailing_whitespace-- > 0) {
                array_pop($parts);
            }

            // suppress last br tag inside a node list if follows text
            $last_name = array_pop($previousSiblingNames);
            if ($last_name === 'br') {
                $last_name = array_pop($previousSiblingNames);
                if ($last_name === '#text') {
                    array_pop($parts);
                }
            }

            $output .= implode('', $parts);
        }

        // end whitespace
        switch ($name) {
            case "h1":
            case "h2":
            case "h3":
            case "h4":
            case "h5":
            case "h6":
            case "pre":
            case "p":
                // add two lines
                $output .= "\n\n";
                break;

            case "br":
                // add one line
                $output .= "\n";
                break;

            case "div":
                break;

            case "a":
                // links are returned in [text](link) format
                $href = $node->getAttribute("href");

                $output = trim($output);

                // remove double [[ ]] s from linking images
                if (substr($output, 0, 1) == "[" && substr($output, -1) == "]") {
                    $output = substr($output, 1, strlen($output) - 2);

                    // for linking images, the title of the <a> overrides the title of the <img>
                    if ($node->getAttribute("title")) {
                        $output = $node->getAttribute("title");
                    }
                }

                // if there is no link text, but a title attr
                if (!$output && $node->getAttribute("title")) {
                    $output = $node->getAttribute("title");
                }

                if ($href == null) {
                    // it doesn't link anywhere
                    if ($node->getAttribute("name") != null) {
                        if ($options['drop_links']) {
                            $output = "$output";
                        } else {
                            $output = "[$output]";
                        }
                    }
                } else {
                    if ($href == $output || $href == "mailto:$output" || $href == "http://$output" || $href == "https://$output") {
                        // link to the same address: just use link
                        $output = "$output";
                    } else {
                        // replace it
                        if ($output) {
                            if ($options['drop_links']) {
                                $output = "$output";
                            } else {
                                $output = "[$output]($href)";
                            }
                        } else {
                            // empty string
                            $output = "$href";
                        }
                    }
                }

                // does the next node require additional whitespace?
                switch ($nextName) {
                    case "h1": case "h2": case "h3": case "h4": case "h5": case "h6":
                        $output .= "\n";
                        break;
                }
                break;

            case "img":
                if ($node->getAttribute("title")) {
                    $output = "[" . $node->getAttribute("title") . "]";
                } elseif ($node->getAttribute("alt")) {
                    $output = "[" . $node->getAttribute("alt") . "]";
                } else {
                    $output = "";
                }
                break;

            case "li":
                $output .= "\n";
                break;

            case "blockquote":
                // process quoted text for whitespace/newlines
                $output = static::processWhitespaceNewlines($output);

                // add leading newline
                $output = "\n" . $output;

                // prepend '> ' at the beginning of all lines
                $output = preg_replace("/\n/im", "\n> ", $output);

                // replace leading '> >' with '>>'
                $output = preg_replace("/\n> >/im", "\n>>", $output);

                // add another leading newline and trailing newlines
                $output = "\n" . $output . "\n\n";
                break;
            default:
                // do nothing
        }

        return $output;
    }
}