download all files

This commit is contained in:
Roman Pyrih
2025-06-24 14:14:35 +02:00
parent ebed09c00b
commit 4c71b5d9c2
72007 changed files with 10407727 additions and 40029 deletions

View File

@@ -0,0 +1,508 @@
<?php
class NewsletterProHtml2Text
{
/**
 * Build the option set used when the caller does not override anything.
 *
 * @return array option name => default value
 */
public static function defaultOptions()
{
    $defaults = array();
    // when true, HTML parsing problems are tolerated instead of reported
    $defaults['ignore_errors'] = false;
    // when true, links are rendered as their text only
    $defaults['drop_links'] = false;
    return $defaults;
}
/**
 * Tries to convert the given HTML into a plain text format - best suited for
 * e-mail display, etc.
 *
 * <p>In particular, it tries to maintain the following features:
 * <ul>
 *   <li>Links are maintained, with the 'href' copied over
 *   <li>Information in the &lt;head&gt; is lost
 * </ul>
 *
 * @param string $html the input HTML
 * @param array|boolean $options option overrides keyed by option name; a bare
 *                               boolean is accepted as the legacy way of
 *                               passing the 'ignore_errors' flag
 * @return string the HTML converted, as best as possible, to text
 * @throws \InvalidArgumentException if $options contains an unknown option name
 * @throws NewsletterProHtml2TextException if the HTML could not be loaded as a {@link \DOMDocument}
 */
public static function convert($html, $options = array())
{
    if ($options === false || $options === true) {
        // Using old style (< 1.0) of passing in options
        $options = array('ignore_errors' => $options);
    }

    $defaults = static::defaultOptions();
    $options = array_merge($defaults, $options);

    // check all options are valid; array_key_exists is strict, whereas a loose
    // in_array() would accept the integer key 0 on PHP < 8
    foreach ($options as $key => $value) {
        if (!array_key_exists($key, $defaults)) {
            throw new \InvalidArgumentException("Unknown html2text option '$key'");
        }
    }

    $is_office_document = static::isOfficeDocument($html);
    if ($is_office_document) {
        // remove office namespace
        $html = str_replace(array("<o:p>", "</o:p>"), "", $html);
    }

    $html = static::fixNewlines($html);

    if (mb_detect_encoding($html, "UTF-8", true)) {
        // Turn every non-ASCII character into a numeric entity so the HTML
        // parser decodes it independently of its own charset guess. This
        // replaces mb_convert_encoding(..., "HTML-ENTITIES", ...), which is
        // deprecated since PHP 8.2.
        $html = mb_encode_numericentity($html, array(0x80, 0x10FFFF, 0, 0x1FFFFF), "UTF-8");
    }

    $doc = static::getDocument($html, $options['ignore_errors']);
    $output = static::iterateOverNode($doc, null, false, $is_office_document, $options);

    // process output for whitespace/newlines
    return static::processWhitespaceNewlines($output);
}
/**
 * Normalise line endings to "\n".
 *
 * Windows pairs ("\r\n") are collapsed first, then any remaining lone
 * carriage return is turned into a newline as well.
 *
 * @param string $text text with any number of \r, \r\n and \n combinations
 * @return string the same text using only \n as line terminator
 */
public static function fixNewlines($text)
{
    // str_replace applies the searches in array order, so "\r\n" is handled
    // before the lone "\r" and never yields two newlines.
    return str_replace(array("\r\n", "\r"), "\n", $text);
}
/**
 * List the character sequences that stand for a non-breaking space.
 *
 * @return array the sequences, as strings
 */
public static function nbspCodes()
{
    $codes = array();
    // UTF-8 byte pair
    $codes[] = "\xc2\xa0";
    // PHP does not interpret "\u" without braces, so this is the literal
    // six-character text backslash-u-0-0-a-0
    $codes[] = "\u00a0";
    return $codes;
}
/**
 * List the character sequences that stand for a zero-width non-joiner.
 *
 * @return array the sequences, as strings
 */
public static function zwnjCodes()
{
    $codes = array();
    // UTF-8 byte triple
    $codes[] = "\xe2\x80\x8c";
    // PHP does not interpret "\u" without braces, so this is the literal
    // six-character text backslash-u-2-0-0-c
    $codes[] = "\u200c";
    return $codes;
}
/**
 * Remove leading or trailing spaces and excess empty lines from provided multiline text
 *
 * The steps run in a fixed order: spaces around tabs are dropped, leading
 * whitespace is stripped (whole text, then per line), special characters
 * are rendered, trailing whitespace is stripped (whole text, then per
 * line), carriage returns become newlines, and runs of empty lines are
 * collapsed into a single empty line.
 *
 * @param string $text multiline text any number of leading or trailing spaces or excess lines
 * @return string the fixed text
 */
public static function processWhitespaceNewlines($text)
{
    // remove excess spaces around tabs
    $text = preg_replace("/ *\t */", "\t", $text);

    // remove leading whitespace
    $text = ltrim($text);

    // remove leading spaces on each line
    $text = preg_replace("/\n[ \t]*/", "\n", $text);

    // convert non-breaking spaces to regular spaces to prevent output issues,
    // do it here so they do NOT get removed with other leading spaces, as they
    // are sometimes used for indentation
    $text = static::renderText($text);

    // remove trailing whitespace
    $text = rtrim($text);

    // remove trailing spaces on each line
    $text = preg_replace("/[ \t]*\n/", "\n", $text);

    // unarmor pre blocks: any remaining \r is turned back into \n
    $text = static::fixNewlines($text);

    // remove unnecessary empty lines
    $text = preg_replace("/\n\n\n*/", "\n\n", $text);

    return $text;
}
/**
 * Parse HTML into a DOMDocument
 *
 * Input that is empty after trimming yields an empty document. Input that
 * does not start with a tag is wrapped in a body element before parsing.
 *
 * @param string $html the input HTML
 * @param boolean $ignore_error Ignore xml parsing errors
 * @return \DOMDocument the parsed document tree
 * @throws NewsletterProHtml2TextException if the HTML could not be loaded
 */
public static function getDocument($html, $ignore_error = false)
{
    $doc = new \DOMDocument();

    $html = trim((string) $html);

    // Compare against the empty string explicitly: a plain truthiness test
    // would also discard the valid input "0".
    if ($html === '') {
        // DOMDocument doesn't support empty value and throws an error
        // Return empty document instead
        return $doc;
    }

    if ($html[0] !== '<') {
        // If HTML does not begin with a tag, we put a body tag around it.
        // If we do not do this, PHP will insert a paragraph tag around
        // the first block of text for some reason which can mess up
        // the newlines. See pre.html test for an example.
        $html = '<body>' . $html . '</body>';
    }

    if ($ignore_error) {
        $doc->strictErrorChecking = false;
        $doc->recover = true;
        $doc->xmlStandalone = true;
        // collect libxml errors internally while loading, then restore the
        // caller's previous setting
        $old_internal_errors = libxml_use_internal_errors(true);
        $load_result = $doc->loadHTML($html, LIBXML_NOWARNING | LIBXML_NOERROR | LIBXML_NONET | LIBXML_PARSEHUGE);
        libxml_use_internal_errors($old_internal_errors);
    } else {
        $load_result = $doc->loadHTML($html);
    }

    if (!$load_result) {
        throw new NewsletterProHtml2TextException("Could not load HTML - badly formed?", $html);
    }

    return $doc;
}
/**
 * Heuristic check for HTML produced by Microsoft Office: looks for the
 * Office XML namespace URN anywhere in the markup.
 *
 * @param string $html the input HTML
 * @return boolean true if the namespace marker occurs in $html
 */
public static function isOfficeDocument($html)
{
    $marker = "urn:schemas-microsoft-com:office";
    $position = strpos($html, $marker);
    // a match at offset 0 is still a match, hence the strict comparison
    return false !== $position;
}
/**
 * Swap special characters for plain-text stand-ins so they do not cause
 * output issues:
 * - every non-breaking space code becomes a regular space; then
 * - every zero-width non-joiner code is removed.
 *
 * The aim is to render text the way a browser would show it.
 *
 * @param string $text the text to clean
 * @return string the text with the replacements applied
 */
public static function renderText($text)
{
    $with_plain_spaces = str_replace(static::nbspCodes(), " ", $text);
    return str_replace(static::zwnjCodes(), "", $with_plain_spaces);
}
/**
 * Tell whether the text is blank once special characters are rendered
 * and newlines, carriage returns, tabs and spaces are trimmed away.
 *
 * @param string $text the text to inspect
 * @return boolean true if nothing remains after rendering and trimming
 */
public static function isWhitespace($text)
{
    $rendered = static::renderText($text);
    $remainder = trim($rendered, "\n\r\t ");
    return $remainder === '';
}
/**
 * Find the lower-cased node name of the next meaningful sibling.
 *
 * Siblings are scanned in document order; the first element, or the first
 * text node that is not pure whitespace, decides the result. All other
 * siblings are skipped.
 *
 * @param \DOMNode $node the node whose following siblings are scanned
 * @return string|null the sibling's node name in lower case, or null if
 *                     no such sibling exists
 */
public static function nextChildName($node)
{
    for ($sibling = $node->nextSibling; $sibling !== null; $sibling = $sibling->nextSibling) {
        $is_visible_text = $sibling instanceof \DOMText
            && !static::isWhitespace($sibling->wholeText);

        if ($is_visible_text || $sibling instanceof \DOMElement) {
            return strtolower($sibling->nodeName);
        }
    }

    return null;
}
/**
 * Recursively render a DOM node and its descendants as plain text.
 *
 * Child nodes are removed from $node while they are processed, so the
 * tree handed in is consumed by this call.
 *
 * @param \DOMNode $node the node to render
 * @param string|null $prevName lower-cased node name of the previous visible
 *                              sibling, or null if there is none
 * @param boolean $in_pre true when the node lies inside a pre element
 * @param boolean $is_office_document true when the input was detected as
 *                                    Microsoft Office HTML
 * @param array $options conversion options; only 'drop_links' is read here
 *                       and a missing key counts as false. The parameter has
 *                       a default because a required parameter after
 *                       optional ones is deprecated since PHP 8.0.
 * @return string the text rendering of the node
 */
public static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_office_document = false, $options = array())
{
    if ($node instanceof \DOMText) {
        // Replace whitespace characters with a space (equivalent to \s)
        if ($in_pre) {
            $text = "\n" . trim(static::renderText($node->wholeText), "\n\r\t ") . "\n";

            // Remove trailing whitespace only
            $text = preg_replace("/[ \t]*\n/im", "\n", $text);

            // armor newlines with \r.
            return str_replace("\n", "\r", $text);
        }

        $text = static::renderText($node->wholeText);
        $text = preg_replace("/[\\t\\n\\f\\r ]+/im", " ", $text);

        if (!static::isWhitespace($text) && ($prevName === 'p' || $prevName === 'div')) {
            return "\n" . $text;
        }
        return $text;
    }

    if ($node instanceof \DOMDocumentType || $node instanceof \DOMProcessingInstruction) {
        // ignore
        return "";
    }

    $drop_links = !empty($options['drop_links']);
    $name = strtolower($node->nodeName);
    $nextName = static::nextChildName($node);

    // start whitespace
    switch ($name) {
        case "hr":
            $prefix = '';
            if ($prevName !== null) {
                $prefix = "\n";
            }
            return $prefix . "---------------------------------------------------------------\n";

        case "style":
        case "head":
        case "title":
        case "meta":
        case "script":
            // ignore these tags
            return "";

        case "h1":
        case "h2":
        case "h3":
        case "h4":
        case "h5":
        case "h6":
        case "ol":
        case "ul":
        case "pre":
            // add two newlines
            $output = "\n\n";
            break;

        case "td":
        case "th":
            // add tab char to separate table fields
            $output = "\t";
            break;

        case "p":
            // Microsoft exchange emails often include HTML which, when passed through
            // html2text, results in lots of double line returns everywhere.
            //
            // To fix this, for any p element with a className of `MsoNormal` (the standard
            // classname in any Microsoft export or outlook for a paragraph that behaves
            // like a line return) we skip the first line returns and set the name to br.
            if ($is_office_document && $node->getAttribute('class') === 'MsoNormal') {
                $output = "";
                $name = 'br';
                break;
            }

            // add two lines
            $output = "\n\n";
            break;

        case "tr":
            // add one line
            $output = "\n";
            break;

        case "div":
            $output = "";
            if ($prevName !== null) {
                // add one line
                $output .= "\n";
            }
            break;

        case "li":
            $output = "- ";
            break;

        default:
            // print out contents of unknown tags
            $output = "";
            break;
    }

    if (isset($node->childNodes)) {
        $n = $node->childNodes->item(0);
        $previousSiblingNames = array();
        $previousSiblingName = null;

        $parts = array();
        $trailing_whitespace = 0;

        while ($n != null) {
            $text = static::iterateOverNode($n, $previousSiblingName, $in_pre || $name == 'pre', $is_office_document, $options);

            // Pass current node name to next child, as previousSibling does not appear to get populated
            if ($n instanceof \DOMDocumentType
                || $n instanceof \DOMProcessingInstruction
                || ($n instanceof \DOMText && static::isWhitespace($text))) {
                // Keep current previousSiblingName, these are invisible
                $trailing_whitespace++;
            } else {
                $previousSiblingName = strtolower($n->nodeName);
                $previousSiblingNames[] = $previousSiblingName;
                $trailing_whitespace = 0;
            }

            // Removing the processed child makes item(0) the next one to visit
            $node->removeChild($n);
            $n = $node->childNodes->item(0);

            $parts[] = $text;
        }

        // Remove trailing whitespace, important for the br check below
        while ($trailing_whitespace-- > 0) {
            array_pop($parts);
        }

        // suppress last br tag inside a node list if follows text
        $last_name = array_pop($previousSiblingNames);
        if ($last_name === 'br') {
            $last_name = array_pop($previousSiblingNames);
            if ($last_name === '#text') {
                array_pop($parts);
            }
        }

        $output .= implode('', $parts);
    }

    // end whitespace
    switch ($name) {
        case "h1":
        case "h2":
        case "h3":
        case "h4":
        case "h5":
        case "h6":
        case "pre":
        case "p":
            // add two lines
            $output .= "\n\n";
            break;

        case "br":
            // add one line
            $output .= "\n";
            break;

        case "div":
            break;

        case "a":
            // links are returned in [text](link) format
            $href = $node->getAttribute("href");

            $output = trim($output);

            // remove double [[ ]] s from linking images
            if (substr($output, 0, 1) === "[" && substr($output, -1) === "]") {
                $output = (string) substr($output, 1, strlen($output) - 2);

                // for linking images, the title of the <a> overrides the title of the <img>
                if ($node->getAttribute("title")) {
                    $output = $node->getAttribute("title");
                }
            }

            // if there is no link text, but a title attr
            // (explicit empty-string test so the link text "0" is kept)
            if ($output === '' && $node->getAttribute("title")) {
                $output = $node->getAttribute("title");
            }

            if ($href === '') {
                // it doesn't link anywhere
                if ($node->getAttribute("name") !== '' && !$drop_links) {
                    $output = "[$output]";
                }
            } elseif ($href === $output
                || $href === "mailto:$output"
                || $href === "http://$output"
                || $href === "https://$output") {
                // link to the same address: just use link.
                // Strict comparison: with ==, numeric strings such as
                // "0123" and "123" would wrongly count as the same address.
            } elseif ($output !== '') {
                // replace it
                if (!$drop_links) {
                    $output = "[$output]($href)";
                }
            } else {
                // empty string
                $output = $href;
            }

            // does the next node require additional whitespace?
            if (in_array($nextName, array("h1", "h2", "h3", "h4", "h5", "h6"), true)) {
                $output .= "\n";
            }
            break;

        case "img":
            if ($node->getAttribute("title")) {
                $output = "[" . $node->getAttribute("title") . "]";
            } elseif ($node->getAttribute("alt")) {
                $output = "[" . $node->getAttribute("alt") . "]";
            } else {
                $output = "";
            }
            break;

        case "li":
            $output .= "\n";
            break;

        case "blockquote":
            // process quoted text for whitespace/newlines
            $output = static::processWhitespaceNewlines($output);

            // add leading newline
            $output = "\n" . $output;

            // prepend '> ' at the beginning of all lines
            $output = preg_replace("/\n/im", "\n> ", $output);

            // replace leading '> >' with '>>'
            $output = preg_replace("/\n> >/im", "\n>>", $output);

            // add another leading newline and trailing newlines
            $output = "\n" . $output . "\n\n";
            break;

        default:
            // do nothing
    }

    return $output;
}
}

View File

@@ -0,0 +1,12 @@
<?php
/**
 * Exception that carries an extra payload next to the regular message.
 */
class NewsletterProHtml2TextException extends Exception
{
    /**
     * Additional detail supplied by the code that threw the exception.
     */
    public $more_info;

    /**
     * @param string $message the exception message
     * @param mixed $more_info extra detail stored unchanged on the exception
     */
    public function __construct($message = "", $more_info = "")
    {
        $this->more_info = $more_info;
        parent::__construct($message);
    }
}

View File

@@ -0,0 +1,101 @@
html2text [![Build Status](https://travis-ci.org/soundasleep/html2text.svg?branch=master)](https://travis-ci.org/soundasleep/html2text) [![Total Downloads](https://poser.pugx.org/soundasleep/html2text/downloads.png)](https://packagist.org/packages/soundasleep/html2text)
=========
html2text is a very simple script that uses DOM methods to convert HTML into a format similar to what would be
rendered by a browser - perfect for places where you need a quick text representation. For example:
```html
<html>
<title>Ignored Title</title>
<body>
<h1>Hello, World!</h1>
<p>This is some e-mail content.
Even though it has whitespace and newlines, the e-mail converter
will handle it correctly.
<p>Even mismatched tags.</p>
<div>A div</div>
<div>Another div</div>
<div>A div<div>within a div</div></div>
<a href="http://foo.com">A link</a>
</body>
</html>
```
Will be converted into:
```text
Hello, World!
This is some e-mail content. Even though it has whitespace and newlines, the e-mail converter will handle it correctly.
Even mismatched tags.
A div
Another div
A div
within a div
[A link](http://foo.com)
```
See the [original blog post](http://journals.jevon.org/users/jevon-phd/entry/19818) or the related [StackOverflow answer](http://stackoverflow.com/a/2564472/39531).
## Installing
You can use [Composer](http://getcomposer.org/) to add the [package](https://packagist.org/packages/soundasleep/html2text) to your project:
```json
{
"require": {
"soundasleep/html2text": "~1.1"
}
}
```
And then use it quite simply:
```php
$text = \Soundasleep\Html2Text::convert($html);
```
You can also include the supplied `html2text.php` and use `$text = convert_html_to_text($html);` instead.
### Options
| Option | Default | Description |
|--------|---------|-------------|
| **ignore_errors** | `false` | Set to `true` to ignore any XML parsing errors. |
| **drop_links** | `false` | Set to `true` to not render links as `[My Link](http://foo.com)`, but rather just `My Link`. |
Pass along options as a second argument to `convert`, for example:
```php
$options = array(
'ignore_errors' => true,
// other options go here
);
$text = \Soundasleep\Html2Text::convert($html, $options);
```
## Tests
Some very basic tests are provided in the `tests/` directory. Run them with `composer install && vendor/bin/phpunit`.
## Troubleshooting
### Class 'DOMDocument' not found
You need to [install the PHP XML extension](https://github.com/soundasleep/html2text/issues/55) for your PHP version. e.g. `apt-get install php7.1-xml`
## License
`html2text` is [licensed under MIT](LICENSE.md), making it suitable for both Eclipse and GPL projects.
## Other versions
Also see [html2text_ruby](https://github.com/soundasleep/html2text_ruby), a Ruby implementation.

View File

@@ -0,0 +1,26 @@
<?php
/**
* Since 2013 Ovidiu Cimpean
*
* Ovidiu Cimpean - Newsletter Pro © All rights reserved.
*
* DISCLAIMER
*
* Do not edit, modify or copy this file.
* If you wish to customize it, contact us at addons4prestashop@gmail.com.
*
* @author Ovidiu Cimpean <addons4prestashop@gmail.com>
* @copyright Since 2013 Ovidiu Cimpean
* @license Do not edit, modify or copy this file
* @version Release: 4
*/
// Send headers that tell clients not to cache this response, then redirect
// to the parent directory and stop. Each entry is the header line followed
// by the "replace" flag passed to header().
$response_headers = array(
    array('Expires: Mon, 26 Jul 1997 05:00:00 GMT', true),
    array('Last-Modified: '.gmdate('D, d M Y H:i:s').' GMT', true),
    array('Cache-Control: no-store, no-cache, must-revalidate', true),
    // replace = false: this second Cache-Control line is added, not substituted
    array('Cache-Control: post-check=0, pre-check=0', false),
    array('Pragma: no-cache', true),
    array('Location: ../', true),
);
foreach ($response_headers as $response_header) {
    header($response_header[0], $response_header[1]);
}
exit;