<?php

/**
 * Turns free text into a map of normalized search keywords.
 *
 * The text is stripped of HTML tags and stopwords, transliterated to ASCII
 * with unaccent(), reduced to letters, digits and the reserved characters
 * configured for the search backend, and finally split on spaces into
 * lower-cased terms.
 */
class stTextAnalyzer
{
    /**
     * Analyzers already created by getInstance(), keyed by culture.
     *
     * @var stTextAnalyzer[]
     */
    protected static $instance = array();

    /**
     * mbregex pattern matching any stopword as a whole word, or an empty
     * string when there are no stopwords to remove.
     *
     * @var string
     */
    protected $stopwords = '';

    /**
     * The "reserved_chars" setting of the stSearchBackend configuration,
     * escaped with preg_quote() so it can be embedded in a PCRE character class.
     *
     * @var string
     */
    protected $reservedChars = '';

    /**
     * Builds the stopword pattern and reads the reserved characters.
     *
     * Note that this switches the process-wide mbstring internal and regex
     * encodings to UTF-8.
     *
     * @param array|false $stopwords Raw stopwords (surrounding whitespace and
     *                               line endings are trimmed). Anything that is
     *                               not an array is treated as "no stopwords".
     */
    public function __construct($stopwords)
    {
        mb_internal_encoding("UTF-8");
        mb_regex_encoding("UTF-8");

        // Blank entries must be dropped: an empty alternative would match at
        // every word boundary and make the replacement split terms such as
        // "foo-bar" in two.
        $words = array();

        if (is_array($stopwords))
        {
            foreach ($stopwords as $word)
            {
                $word = trim($word);

                if ($word !== '')
                {
                    $words[] = $word;
                }
            }
        }

        $this->stopwords = $words ? '\b('.implode('|', $words).')\b' : '';

        $reserved_chars = stConfig::getInstance('stSearchBackend')->get('reserved_chars');

        $this->reservedChars = preg_quote((string) $reserved_chars, '/');
    }

    /**
     * Extracts keywords from a text and merges them into an existing map.
     *
     * @param string $text     Text to analyze; may contain HTML and entities.
     * @param array  $keywords Keywords collected so far (term => weight).
     *                         Terms already present keep their weight.
     * @param mixed  $weight   Weight assigned to every newly found term.
     *
     * @return array The keyword map (term => weight) including the new terms.
     */
    public function analyze($text, $keywords = array(), $weight = 1)
    {
        // Entities are decoded first, so escaped markup (&lt;b&gt;) is
        // stripped together with real tags.
        $text = htmlspecialchars_decode((string) $text);
        $text = preg_replace('/<[^>]*>/', ' ', $text);

        if (is_string($this->stopwords) && $this->stopwords !== '')
        {
            $filtered = mb_eregi_replace($this->stopwords, ' ', $text);

            // On a regex failure keep the unfiltered text instead of losing it.
            if (is_string($filtered))
            {
                $text = $filtered;
            }
        }

        $text = self::unaccent($text);
        $text = preg_replace('/[^A-Za-z0-9'.$this->reservedChars.']/', ' ', $text);

        foreach (explode(' ', $text) as $term)
        {
            // Separators are only meaningful inside a term, not at its edges.
            $lower = mb_strtolower(trim($term, '\\-/:'));

            // Compare against '' explicitly: empty() / truthiness would also
            // discard the legitimate term "0".
            if ($lower !== '' && !isset($keywords[$lower]))
            {
                $keywords[$lower] = $weight;
            }
        }

        return $keywords;
    }

    /**
     * Transliterates accented and non-Latin characters to ASCII.
     *
     * Strings without any byte >= 0x80 are returned untouched. Characters
     * that are not in the transliteration table are left as they are.
     *
     * @param string $string UTF-8 encoded text.
     *
     * @return string The transliterated text.
     */
    public static function unaccent($string)
    {
        if (!preg_match('/[\x80-\xff]/', $string))
        {
            return $string;
        }

        return strtr($string, self::unaccentMap());
    }

    /**
     * Returns the transliteration table used by unaccent(), built once per request.
     *
     * When a character is listed more than once, the last entry wins (PHP
     * array literal semantics), so the order of the groups below matters.
     *
     * @return array Map of UTF-8 character => ASCII replacement.
     */
    private static function unaccentMap()
    {
        static $chars = null;

        if ($chars !== null)
        {
            return $chars;
        }

        $chars = array(
            // Greek
            'Α' => 'A', 'Β' => 'B', 'Γ' => 'G', 'Δ' => 'D', 'Ε' => 'E', 'Ζ' => 'Z', 'Η' => 'H', 'Θ' => '8',
            'Ι' => 'I', 'Κ' => 'K', 'Λ' => 'L', 'Μ' => 'M', 'Ν' => 'N', 'Ξ' => '3', 'Ο' => 'O', 'Π' => 'P',
            'Ρ' => 'R', 'Σ' => 'S', 'Τ' => 'T', 'Υ' => 'Y', 'Φ' => 'F', 'Χ' => 'X', 'Ψ' => 'PS', 'Ω' => 'W',
            'Ά' => 'A', 'Έ' => 'E', 'Ί' => 'I', 'Ό' => 'O', 'Ύ' => 'Y', 'Ή' => 'H', 'Ώ' => 'W', 'Ϊ' => 'I',
            'Ϋ' => 'Y',
            'α' => 'a', 'β' => 'b', 'γ' => 'g', 'δ' => 'd', 'ε' => 'e', 'ζ' => 'z', 'η' => 'h', 'θ' => '8',
            'ι' => 'i', 'κ' => 'k', 'λ' => 'l', 'μ' => 'm', 'ν' => 'n', 'ξ' => '3', 'ο' => 'o', 'π' => 'p',
            'ρ' => 'r', 'σ' => 's', 'τ' => 't', 'υ' => 'y', 'φ' => 'f', 'χ' => 'x', 'ψ' => 'ps', 'ω' => 'w',
            'ά' => 'a', 'έ' => 'e', 'ί' => 'i', 'ό' => 'o', 'ύ' => 'y', 'ή' => 'h', 'ώ' => 'w', 'ς' => 's',
            'ϊ' => 'i', 'ΰ' => 'y', 'ϋ' => 'y', 'ΐ' => 'i',
            // Turkish
            'Ş' => 'S', 'İ' => 'I', 'Ç' => 'C', 'Ü' => 'U', 'Ö' => 'O', 'Ğ' => 'G',
            'ş' => 's', 'ı' => 'i', 'ç' => 'c', 'ü' => 'u', 'ö' => 'o', 'ğ' => 'g',
            // Russian
            'А' => 'A', 'Б' => 'B', 'В' => 'V', 'Г' => 'G', 'Д' => 'D', 'Е' => 'E', 'Ё' => 'Yo', 'Ж' => 'Zh',
            'З' => 'Z', 'И' => 'I', 'Й' => 'J', 'К' => 'K', 'Л' => 'L', 'М' => 'M', 'Н' => 'N', 'О' => 'O',
            'П' => 'P', 'Р' => 'R', 'С' => 'S', 'Т' => 'T', 'У' => 'U', 'Ф' => 'F', 'Х' => 'H', 'Ц' => 'C',
            'Ч' => 'Ch', 'Ш' => 'Sh', 'Щ' => 'Sh', 'Ъ' => '', 'Ы' => 'Y', 'Ь' => '', 'Э' => 'E', 'Ю' => 'Yu',
            'Я' => 'Ya',
            'а' => 'a', 'б' => 'b', 'в' => 'v', 'г' => 'g', 'д' => 'd', 'е' => 'e', 'ё' => 'yo', 'ж' => 'zh',
            'з' => 'z', 'и' => 'i', 'й' => 'j', 'к' => 'k', 'л' => 'l', 'м' => 'm', 'н' => 'n', 'о' => 'o',
            'п' => 'p', 'р' => 'r', 'с' => 's', 'т' => 't', 'у' => 'u', 'ф' => 'f', 'х' => 'h', 'ц' => 'c',
            'ч' => 'ch', 'ш' => 'sh', 'щ' => 'sh', 'ъ' => '', 'ы' => 'y', 'ь' => '', 'э' => 'e', 'ю' => 'yu',
            'я' => 'ya',
            // Decompositions for Latin-1 Supplement (two-byte sequences starting with 0xC3)
            chr(195).chr(128) => 'A', chr(195).chr(129) => 'A',
            chr(195).chr(130) => 'A', chr(195).chr(131) => 'A',
            chr(195).chr(132) => 'A', chr(195).chr(133) => 'A',
            chr(195).chr(135) => 'C', chr(195).chr(136) => 'E',
            chr(195).chr(137) => 'E', chr(195).chr(138) => 'E',
            chr(195).chr(139) => 'E', chr(195).chr(140) => 'I',
            chr(195).chr(141) => 'I', chr(195).chr(142) => 'I',
            chr(195).chr(143) => 'I', chr(195).chr(145) => 'N',
            chr(195).chr(146) => 'O', chr(195).chr(147) => 'O',
            chr(195).chr(148) => 'O', chr(195).chr(149) => 'O',
            chr(195).chr(150) => 'O', chr(195).chr(153) => 'U',
            chr(195).chr(154) => 'U', chr(195).chr(155) => 'U',
            chr(195).chr(156) => 'U', chr(195).chr(157) => 'Y',
            chr(195).chr(159) => 's', chr(195).chr(160) => 'a',
            chr(195).chr(161) => 'a', chr(195).chr(162) => 'a',
            chr(195).chr(163) => 'a', chr(195).chr(164) => 'a',
            chr(195).chr(165) => 'a', chr(195).chr(167) => 'c',
            chr(195).chr(168) => 'e', chr(195).chr(169) => 'e',
            chr(195).chr(170) => 'e', chr(195).chr(171) => 'e',
            chr(195).chr(172) => 'i', chr(195).chr(173) => 'i',
            chr(195).chr(174) => 'i', chr(195).chr(175) => 'i',
            chr(195).chr(177) => 'n', chr(195).chr(178) => 'o',
            chr(195).chr(179) => 'o', chr(195).chr(180) => 'o',
            chr(195).chr(181) => 'o', chr(195).chr(182) => 'o',
            // 0xC3 0xB8 is U+00F8; this slot used to repeat the 0xB6 entry.
            chr(195).chr(184) => 'o', chr(195).chr(185) => 'u',
            chr(195).chr(186) => 'u', chr(195).chr(187) => 'u',
            chr(195).chr(188) => 'u', chr(195).chr(189) => 'y',
            chr(195).chr(191) => 'y',
            // Decompositions for Latin Extended-A
            chr(196).chr(128) => 'A', chr(196).chr(129) => 'a',
            chr(196).chr(130) => 'A', chr(196).chr(131) => 'a',
            chr(196).chr(132) => 'A', chr(196).chr(133) => 'a',
            chr(196).chr(134) => 'C', chr(196).chr(135) => 'c',
            chr(196).chr(136) => 'C', chr(196).chr(137) => 'c',
            chr(196).chr(138) => 'C', chr(196).chr(139) => 'c',
            chr(196).chr(140) => 'C', chr(196).chr(141) => 'c',
            chr(196).chr(142) => 'D', chr(196).chr(143) => 'd',
            chr(196).chr(144) => 'D', chr(196).chr(145) => 'd',
            chr(196).chr(146) => 'E', chr(196).chr(147) => 'e',
            chr(196).chr(148) => 'E', chr(196).chr(149) => 'e',
            chr(196).chr(150) => 'E', chr(196).chr(151) => 'e',
            chr(196).chr(152) => 'E', chr(196).chr(153) => 'e',
            chr(196).chr(154) => 'E', chr(196).chr(155) => 'e',
            chr(196).chr(156) => 'G', chr(196).chr(157) => 'g',
            chr(196).chr(158) => 'G', chr(196).chr(159) => 'g',
            chr(196).chr(160) => 'G', chr(196).chr(161) => 'g',
            chr(196).chr(162) => 'G', chr(196).chr(163) => 'g',
            chr(196).chr(164) => 'H', chr(196).chr(165) => 'h',
            chr(196).chr(166) => 'H', chr(196).chr(167) => 'h',
            chr(196).chr(168) => 'I', chr(196).chr(169) => 'i',
            chr(196).chr(170) => 'I', chr(196).chr(171) => 'i',
            chr(196).chr(172) => 'I', chr(196).chr(173) => 'i',
            chr(196).chr(174) => 'I', chr(196).chr(175) => 'i',
            chr(196).chr(176) => 'I', chr(196).chr(177) => 'i',
            chr(196).chr(178) => 'IJ', chr(196).chr(179) => 'ij',
            chr(196).chr(180) => 'J', chr(196).chr(181) => 'j',
            chr(196).chr(182) => 'K', chr(196).chr(183) => 'k',
            chr(196).chr(184) => 'k', chr(196).chr(185) => 'L',
            chr(196).chr(186) => 'l', chr(196).chr(187) => 'L',
            chr(196).chr(188) => 'l', chr(196).chr(189) => 'L',
            chr(196).chr(190) => 'l', chr(196).chr(191) => 'L',
            chr(197).chr(128) => 'l', chr(197).chr(129) => 'L',
            chr(197).chr(130) => 'l', chr(197).chr(131) => 'N',
            chr(197).chr(132) => 'n', chr(197).chr(133) => 'N',
            chr(197).chr(134) => 'n', chr(197).chr(135) => 'N',
            // U+0149 and U+014B are lower-case letters, U+014A is upper-case;
            // the replacements for these three previously had their case swapped.
            chr(197).chr(136) => 'n', chr(197).chr(137) => 'n',
            chr(197).chr(138) => 'N', chr(197).chr(139) => 'n',
            chr(197).chr(140) => 'O', chr(197).chr(141) => 'o',
            chr(197).chr(142) => 'O', chr(197).chr(143) => 'o',
            chr(197).chr(144) => 'O', chr(197).chr(145) => 'o',
            chr(197).chr(146) => 'OE', chr(197).chr(147) => 'oe',
            chr(197).chr(148) => 'R', chr(197).chr(149) => 'r',
            chr(197).chr(150) => 'R', chr(197).chr(151) => 'r',
            chr(197).chr(152) => 'R', chr(197).chr(153) => 'r',
            chr(197).chr(154) => 'S', chr(197).chr(155) => 's',
            chr(197).chr(156) => 'S', chr(197).chr(157) => 's',
            chr(197).chr(158) => 'S', chr(197).chr(159) => 's',
            chr(197).chr(160) => 'S', chr(197).chr(161) => 's',
            chr(197).chr(162) => 'T', chr(197).chr(163) => 't',
            chr(197).chr(164) => 'T', chr(197).chr(165) => 't',
            chr(197).chr(166) => 'T', chr(197).chr(167) => 't',
            chr(197).chr(168) => 'U', chr(197).chr(169) => 'u',
            chr(197).chr(170) => 'U', chr(197).chr(171) => 'u',
            chr(197).chr(172) => 'U', chr(197).chr(173) => 'u',
            chr(197).chr(174) => 'U', chr(197).chr(175) => 'u',
            chr(197).chr(176) => 'U', chr(197).chr(177) => 'u',
            chr(197).chr(178) => 'U', chr(197).chr(179) => 'u',
            chr(197).chr(180) => 'W', chr(197).chr(181) => 'w',
            chr(197).chr(182) => 'Y', chr(197).chr(183) => 'y',
            chr(197).chr(184) => 'Y', chr(197).chr(185) => 'Z',
            chr(197).chr(186) => 'z', chr(197).chr(187) => 'Z',
            chr(197).chr(188) => 'z', chr(197).chr(189) => 'Z',
            chr(197).chr(190) => 'z', chr(197).chr(191) => 's',
            // Euro Sign
            chr(226).chr(130).chr(172) => 'E',
            // GBP (Pound) Sign
            chr(194).chr(163) => '',
            // Two-letter expansions; listed after the single-letter entries
            // above so that these are the ones that take effect.
            'Ä' => 'Ae', 'ä' => 'ae', 'Ü' => 'Ue', 'ü' => 'ue',
            'Ö' => 'Oe', 'ö' => 'oe', 'ß' => 'ss',
            // Norwegian characters ('æ' now mirrors 'Æ' so both letter cases
            // produce the same keyword once lower-cased)
            'Å' => 'Aa', 'Æ' => 'Ae', 'Ø' => 'O', 'æ' => 'ae', 'ø' => 'o', 'å' => 'aa',
        );

        return $chars;
    }

    /**
     * Returns the shared analyzer for a culture, creating it on first use.
     *
     * Stopwords are read from ../data/stopwords/<culture>.txt relative to
     * this file, one per line. When that file cannot be read the analyzer is
     * created without stopwords.
     *
     * @param string $culture Culture code used as the stopword file name.
     *
     * @return stTextAnalyzer
     */
    public static function getInstance($culture)
    {
        if (!isset(self::$instance[$culture]))
        {
            $path = dirname(__FILE__).'/../data/stopwords/'.$culture.'.txt';

            $stopwords = is_readable($path)
                ? file($path, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES)
                : false;

            self::$instance[$culture] = new stTextAnalyzer(is_array($stopwords) ? $stopwords : array());
        }

        return self::$instance[$culture];
    }
}