276 lines
9.6 KiB
PHP
276 lines
9.6 KiB
PHP
<?php
// Declare the response as UTF-8 HTML. This has to run before any of the
// markup below is sent, because headers cannot be changed once output starts.
// The full "<?php" tag is used so the block does not depend on the
// short_open_tag ini setting.
header('Content-Type: text/html; charset=utf-8');
?>
|
|
<ul style="display: flex; flex-wrap: wrap; gap: 10px; margin: 0; padding: 3px; border-bottom: 1px solid #ccc; list-style-type: none; font-size: 13px;">
|
|
<li>
|
|
<a href="/dpsglinik/" style="display: inline-block; border: 1px solid #ccc; padding: 5px 10px; text-decoration: none;">HOME</a>
|
|
</li>
|
|
</ul>
|
|
<ul style="display: flex; flex-wrap: wrap; gap: 10px; margin: 0; padding: 3px; border-bottom: 1px solid #ccc; list-style-type: none; font-size: 13px;">
|
|
<li>
|
|
<a href="/dpsglinik/?action=get_urls&category=aktualnosci" style="display: inline-block; border: 1px solid #ccc; padding: 5px 10px; text-decoration: none;">aktualności - URL</a>
|
|
</li>
|
|
<li>
|
|
<a href="/dpsglinik/?action=get_xml&category=aktualnosci" style="display: inline-block; border: 1px solid #ccc; padding: 5px 10px; text-decoration: none;">aktualności - XML</a>
|
|
</li>
|
|
<li>
|
|
<a href="/dpsglinik/?action=get_urls&category=galeria" style="display: inline-block; border: 1px solid #ccc; padding: 5px 10px; text-decoration: none;">galeria - URL</a>
|
|
</li>
|
|
<li>
|
|
<a href="/dpsglinik/?action=get_xml&category=galeria" style="display: inline-block; border: 1px solid #ccc; padding: 5px 10px; text-decoration: none;">galeria - XML</a>
|
|
</li>
|
|
</ul>
|
|
<?php
|
|
|
|
/**
 * SimpleXMLElement with helpers for writing CDATA sections, which plain
 * SimpleXML cannot create on its own.
 */
class SimpleXMLElementExtended extends SimpleXMLElement
{
    /**
     * Append a CDATA section containing $value to $node.
     *
     * The node is bridged to its DOM counterpart because only the DOM API
     * exposes createCDATASection().
     *
     * @param SimpleXMLElement $node  Element that receives the CDATA section.
     * @param mixed            $value Content; it is interpolated into a string.
     */
    private function addCDataToNode(SimpleXMLElement $node, $value = '')
    {
        $domElement = dom_import_simplexml($node);
        if (!$domElement) {
            return;
        }

        $cdata = $domElement->ownerDocument->createCDATASection("{$value}");
        $domElement->appendChild($cdata);
    }

    /**
     * Add a child element and, when a value is given, wrap it in CDATA.
     *
     * @param string $name  Name of the new child element.
     * @param mixed  $value Content for the CDATA section. Empty values
     *                      ('', null, false) produce an empty element.
     *
     * @return SimpleXMLElement|null The newly created child.
     */
    public function addChildWithCData($name = '', $value = '')
    {
        $newChild = parent::addChild($name);

        // A plain truthiness check would also discard "0" / 0, which are
        // legitimate content; is_numeric() lets those through while every
        // other falsy value is still treated as "no content".
        if ($value || is_numeric($value)) {
            $this->addCDataToNode($newChild, "{$value}");
        }

        return $newChild;
    }

    /**
     * Append a CDATA section to this element itself.
     *
     * @param mixed $value Content; it is interpolated into a string.
     */
    public function addCData($value = '')
    {
        $this->addCDataToNode($this, "{$value}");
    }
}
|
|
|
|
/**
 * Turn fixed-size <img> tags into responsive ones.
 *
 * Every <img> that carries a numeric width="..." attribute followed later by
 * a numeric height="..." attribute has both attributes removed and an inline
 * style appended that caps the image at its former pixel width. Tags that do
 * not match that attribute order are left untouched.
 *
 * @param string $html Markup to process.
 *
 * @return string|null Result of preg_replace_callback().
 */
function modifyImageAttributes($html) {
    $pattern = '/<img([^>]*)(width="(\d+)")([^>]*)(height="(\d+)")([^>]*)>/';

    $makeResponsive = function ($m) {
        // Groups 1, 4 and 7 are the attribute text before, between and after
        // the width/height pair; group 3 is the numeric width.
        $otherAttributes = $m[1] . $m[4] . $m[7];
        $inlineStyle = 'style="width: 100%; max-width: ' . $m[3] . 'px; height: auto;"';

        return '<img' . $otherAttributes . ' ' . $inlineStyle . '>';
    };

    return preg_replace_callback($pattern, $makeResponsive, $html);
}
|
|
|
|
/**
 * Strip presentational markup from an HTML fragment.
 *
 * Steps, in order:
 *  1. Drop the whole attribute string of every opening tag, except when it
 *     contains "src", contains id="gallery", or belongs to an <a> tag and
 *     contains "href".
 *  2. Remove line breaks and squeeze runs of whitespace to a single space.
 *  3. Repeatedly delete elements whose content is empty or whitespace only.
 *  4. Repeatedly unwrap a <div> that is directly wrapped in another <div>.
 *  5. Hand the result to modifyImageAttributes().
 *
 * @param string $html Markup to clean.
 *
 * @return string|null Cleaned markup as returned by modifyImageAttributes().
 */
function removeStyles($html) {
    $stripAttributes = function ($tag) {
        list(, $name, $attributes, $closing) = $tag;

        $keepAttributes = strpos($attributes, 'src') !== false
            || strpos($attributes, 'id="gallery"') !== false
            || ($name === 'a' && strpos($attributes, 'href') !== false);

        return '<' . $name . ($keepAttributes ? $attributes : '') . $closing;
    };
    $html = preg_replace_callback('/<(\w+)([^>]*)(\s*\/?>)/', $stripAttributes, $html);

    // The patterns are applied one after another: line breaks first, then
    // whitespace runs.
    $html = preg_replace(array('/\r|\n/', '/\s{2,}/'), array('', ' '), $html);

    // Each pass is repeated until it stops changing the markup, because one
    // removal can expose a new empty element or a new nested <div>.
    $collapsePasses = array(
        array('/<(\w+)\b[^>]*>\s*<\/\1>/', ''),
        array('/<div>\s*<div>(.*?)<\/div>\s*<\/div>/', '<div>$1</div>'),
    );
    foreach ($collapsePasses as $pass) {
        list($pattern, $replacement) = $pass;

        while (true) {
            $next = preg_replace($pattern, $replacement, $html);
            if ($next === $html) {
                break;
            }
            $html = $next;
        }
    }

    return modifyImageAttributes($html);
}
|
|
|
|
/**
 * Collect the link targets found inside the gallery container.
 *
 * The markup is parsed and the first <div id="gallery"> is looked up; the
 * non-empty href of every <a> beneath it is returned in document order.
 *
 * @param string $html HTML fragment or document to inspect.
 *
 * @return array List of href strings; empty when the input is empty, cannot
 *               be parsed, or contains no gallery container.
 */
function extractHrefsFromGallery($html) {
    $html = (string) $html;

    // DOMDocument::loadHTML() rejects an empty string (a ValueError on
    // PHP 8, which "@" does not suppress), so bail out before parsing.
    if (trim($html) === '') {
        return [];
    }

    // Buffer libxml's complaints about real-world markup instead of hiding
    // them with "@", then restore whatever setting the caller had.
    $previousSetting = libxml_use_internal_errors(true);
    $dom = new DOMDocument;
    $loaded = $dom->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);

    if (!$loaded) {
        return [];
    }

    $xpath = new DOMXPath($dom);
    $galleryDiv = $xpath->query('//div[@id="gallery"]');
    if ($galleryDiv->length === 0) {
        return [];
    }

    $hrefs = [];
    // Only the first gallery container is considered.
    foreach ($xpath->query('.//a', $galleryDiv->item(0)) as $anchor) {
        $href = $anchor->getAttribute('href');
        if (!empty($href)) {
            $hrefs[] = $href;
        }
    }

    return $hrefs;
}
|
|
|
|
|
|
/**
 * Remove site chrome (gallery widgets, navigation links, scripts, the
 * slideshow effect selector) from an HTML fragment.
 *
 * The element-with-content patterns use the "s" modifier so that "." also
 * matches line breaks; without it, elements spanning several lines (typical
 * for <script> and <style>) were never removed.
 *
 * @param string $html Markup to clean.
 *
 * @return string|null Cleaned markup.
 */
function removeTagsWithSpecificIdsOrContent($html) {
    // Remove elements whose id is "testss", "gallery" or "switch-effect".
    $html = preg_replace('/<(\w+)[^>]*\b(?:id="testss"|id="gallery"|id="switch-effect")[^>]*>.*?<\/\1>/s', '', $html);

    // Remove elements whose only content is "Zobacz pełną listę aktualności".
    $html = preg_replace('/<(\w+)[^>]*>\s*Zobacz pełną listę aktualności\s*<\/\1>/', '', $html);

    // Remove elements whose only content is "Oryginalne zdjęcie".
    $html = preg_replace('/<(\w+)[^>]*>\s*Oryginalne zdjęcie\s*<\/\1>/', '', $html);

    // Remove <style> and <script> elements together with their content.
    $html = preg_replace('/<(style|script)[^>]*>.*?<\/\1>/s', '', $html);

    // Remove the known options of the slideshow effect selector.
    $html = preg_replace('/<option>Wjazd ze środka<\/option>|<option>Wjazd poziomy<\/option>|<option>Wjazd pionowy<\/option>|<option>Przenikanie<\/option>/', '', $html);

    // Remove a <select> that is left empty.
    $html = preg_replace('/<select[^>]*>\s*<\/select>/', '', $html);

    // Remove the "Efekty : " label text.
    $html = str_replace('Efekty : ', '', $html);

    return $html;
}
|
|
|
|
// Per-category scraper settings:
//   get_url  - listing page URL; "[i]" is replaced with the paging offset
//   max      - number of listing pages to walk
//   file     - text file that stores the collected "url|date" lines
//   xml_file - XML file generated from that list
$sources = [
    'aktualnosci' => [
        'get_url'  => 'http://www.dps-glinikdolny.strzyzowski.pl/aktualnosci/8/[i]/aktualnosci',
        'max'      => 18,
        'file'     => 'aktualnosci.txt',
        'xml_file' => 'aktualnosci.xml',
    ],
    'galeria' => [
        'get_url'  => 'http://www.dps-glinikdolny.strzyzowski.pl/galeria/page,[i]',
        'max'      => 35,
        'file'     => 'galeria.txt',
        'xml_file' => 'galeria.xml',
    ],
];

// Read the request parameters defensively: the page is also opened without
// a query string (the HOME link), which must not raise undefined-index
// warnings.
$category = isset( $_GET['category'] ) ? $_GET['category'] : '';
$action = isset( $_GET['action'] ) ? $_GET['action'] : '';

// Defined up front so the later "if ( $xml_file )" / "if ( $get_url )"
// checks never read an undefined variable.
$get_url = null;
$max = 0;
$file = null;
$xml_file = null;

if ( is_string( $category ) && isset( $sources[$category] ) )
{
    $source = $sources[$category];

    if ( $action === 'get_urls' )
    {
        $get_url = $source['get_url'];
        $max = $source['max'];
        $file = $source['file'];
    }
    elseif ( $action === 'get_xml' )
    {
        // The XML step consumes the list written by a previous get_urls run.
        $urls = file_get_contents( $source['file'] );
        $xml_file = $source['xml_file'];
    }
}
|
|
|
|
// Build the XML feed from the stored "url|date" list: every listed page is
// downloaded, its title and gallery links are extracted, and the collected
// items are written to $xml_file.
if ( !empty( $xml_file ) )
{
    // Scraped pages are rarely valid HTML; buffer libxml's complaints
    // instead of printing them.
    libxml_use_internal_errors( true );

    $news_array = [];
    $lines = ( isset( $urls ) && is_string( $urls ) ) ? explode( PHP_EOL, $urls ) : [];

    foreach ( $lines as $line )
    {
        $line = trim( $line );
        if ( $line === '' )
            continue;

        // Each line is "<page url>|<date>"; the date part may be missing.
        $parts = explode( '|', $line );
        $page_url = $parts[0];
        $page_date = isset( $parts[1] ) ? $parts[1] : '';

        $html = file_get_contents( $page_url );
        if ( $html === false || trim( $html ) === '' )
        {
            // Skip pages that cannot be fetched instead of feeding an empty
            // document to the parser.
            echo '<p>Nie udało się pobrać: ' . htmlspecialchars( $page_url ) . '</p>';
            continue;
        }

        $doc = new \DOMDocument();
        $doc -> loadHTML( $html );
        libxml_clear_errors();

        $xpath = new DOMXpath( $doc );

        $news = [];
        $news['url'] = $page_url;
        $news['date'] = $page_date;

        $title = $xpath -> query( "//*[@id=\"middle_center_subpage\"]/div[starts-with(@class,'p_lr10')]/div[1]/div[starts-with(@class,'fl')]/h2[@class='page_title']" ) -> item(0);
        // The heading is absent on some layouts; fall back to an empty title
        // rather than dereferencing null.
        $news['title'] = $title ? removeStyles( $title -> textContent ) : '';

        // When the content node is missing, saveHTML( null ) serialises the
        // whole document, so the gallery lookup then searches the full page.
        $text = $xpath -> query( "//*[@id=\"middle_center_subpage\"]/div[starts-with(@class,'p_lr10')]/div[starts-with(@class,'mt_10')]" ) -> item(0);
        $news['gallery'] = implode( '|', extractHrefsFromGallery( $doc -> saveHTML( $text ) ) );

        // NOTE(review): body extraction is disabled in the original; the key
        // is still set so the <text> element below does not read an undefined
        // index. Re-enable the line below to populate it.
        // $news['text'] = removeTagsWithSpecificIdsOrContent( removeStyles( $doc -> saveHTML( $text ) ) );
        $news['text'] = '';

        $news_array[] = $news;
    }

    if ( count( $news_array ) )
    {
        // SimpleXMLElement::addChild() treats "&" in the value as the start
        // of an entity reference, so a raw ampersand (common in URLs)
        // truncates the value. Escaping "&" alone is enough; "<" and ">" are
        // escaped on output.
        $escape = function ( $value ) {
            return str_replace( '&', '&amp;', (string) $value );
        };

        $xml = new SimpleXMLElementExtended( '<?xml version="1.0"?><feed xmlns="http://www.w3.org/2005/Atom" xmlns:g="http://base.google.com/ns/1.0"/>' );
        // NOTE(review): the feed title is fixed even for the gallery feed -
        // confirm whether that is intended.
        $xml -> addChild( 'title', 'aktualnosci' );
        $xml -> addChild( 'updated', date( 'Y-m-d' ) );

        foreach ( $news_array as $news )
        {
            $newsXml = $xml -> addChild( 'item' );
            $newsXml -> addChild( 'title', $escape( $news['title'] ) );
            $newsXml -> addChild( 'date', $escape( $news['date'] ) );
            $newsXml -> addChildWithCData( 'text', $news['text'] );
            $newsXml -> addChild( 'gallery', $escape( $news['gallery'] ) );
        }

        // Only report success when the file was actually written.
        if ( $xml -> asXML( $xml_file ) )
        {
            echo '<p>Wygenerowałem xml: https://cdn.projectpro.pl/dpsglinik/' . $xml_file . '</p>';
        }
        else
        {
            echo '<p>Nie udało się zapisać pliku: ' . htmlspecialchars( $xml_file ) . '</p>';
        }
    }
}
|
|
|
|
// Walk the paginated listing of the selected category, collect one
// "url|date" line per entry and store the list in $file.
if ( !empty( $get_url ) )
{
    $date = date( 'Y-m-d' );
    $urls = [];

    // Compare, do not assign: the original "if ( $file = 'galeria.txt' )"
    // was always true, which made the news branch unreachable and redirected
    // the news list into galeria.txt.
    $is_gallery = isset( $file ) && $file === 'galeria.txt';

    // Scraped pages are rarely valid HTML; buffer libxml's complaints.
    libxml_use_internal_errors( true );

    // XPath selecting every entry of a listing page.
    $entry_path = "//*[@id=\"middle_center_subpage\"]/div[starts-with(@class,'p_lr10')]/div[@class='news']";

    for ( $i = 1; $i <= $max; $i++ )
    {
        // The listing is paged by offset in steps of five.
        $page_url = str_replace( '[i]', (string) ( ( $i - 1 ) * 5 ), $get_url );
        echo $page_url . '<br>';

        $ch = curl_init();
        curl_setopt( $ch, CURLOPT_URL, $page_url );
        curl_setopt( $ch, CURLOPT_VERBOSE, 1);
        curl_setopt( $ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt( $ch, CURLOPT_AUTOREFERER, false);
        curl_setopt( $ch, CURLOPT_REFERER, "http://www.google.com" );
        curl_setopt( $ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1);
        curl_setopt( $ch, CURLOPT_FOLLOWLOCATION, true );
        curl_setopt( $ch, CURLOPT_HEADER, 0);
        curl_setopt( $ch, CURLOPT_CONNECTTIMEOUT, 60 );
        curl_setopt( $ch, CURLOPT_TIMEOUT, 60 );
        $html = curl_exec( $ch );
        curl_close( $ch );

        // A failed or empty response cannot be parsed; move on to the next
        // page (its URL was already printed above).
        if ( !is_string( $html ) || trim( $html ) === '' )
            continue;

        $doc = new \DOMDocument();
        $doc -> loadHTML( $html );
        libxml_clear_errors();
        $xpath = new DOMXpath( $doc );

        $entry_count = $xpath -> query( $entry_path ) -> length;

        for ( $z = 1; $z <= $entry_count; $z++ )
        {
            $entry = $entry_path . '[' . $z . ']';

            if ( $is_gallery )
            {
                $link = $xpath -> query( $entry . "/div[contains(@class,'w_530 ')]/p[2]/a[contains(@class,'akt_more')]/@href" ) -> item(0);

                // Gallery entries are stamped with a synthetic date that
                // steps one month back per entry.
                $date = date( 'Y-m-d', strtotime( '-1 months', strtotime( $date ) ) );
                $stamp = $date;
            }
            else
            {
                $link = $xpath -> query( $entry . "/div[contains(@class,'pl_10')]/p[2]/a[contains(@class,'akt_more')]/@href" ) -> item(0);
                $data = $xpath -> query( $entry . "/div[contains(@class,'pl_10')]/span" ) -> item(0);
                $stamp = $data ? trim( strip_tags( $data -> textContent ) ) : '';
            }

            // Entries without a "more" link have nothing to fetch later.
            if ( !$link )
                continue;

            $urls[] = trim( strip_tags( $link -> textContent ) ) . '|' . $stamp;
        }
    }

    if ( count( $urls ) )
    {
        // Write the list once; the file content is one line per URL, each
        // terminated by PHP_EOL.
        file_put_contents( $file, implode( PHP_EOL, $urls ) . PHP_EOL );
        echo '<p>Pobrałem urle: ' . $get_url . '</p>';
    }
}
|