You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
116 lines
5.0 KiB
PHP
116 lines
5.0 KiB
PHP
<?php
|
|
/*
|
|
+---------------------------------------------------------------------------------+
|
|
| Copyright (c) 2013 César Rodas |
|
|
+---------------------------------------------------------------------------------+
|
|
| Redistribution and use in source and binary forms, with or without |
|
|
| modification, are permitted provided that the following conditions are met: |
|
|
| 1. Redistributions of source code must retain the above copyright |
|
|
| notice, this list of conditions and the following disclaimer. |
|
|
| |
|
|
| 2. Redistributions in binary form must reproduce the above copyright |
|
|
| notice, this list of conditions and the following disclaimer in the |
|
|
| documentation and/or other materials provided with the distribution. |
|
|
| |
|
|
| 3. All advertising materials mentioning features or use of this software |
|
|
| must display the following acknowledgement: |
|
|
| This product includes software developed by César D. Rodas. |
|
|
| |
|
|
| 4. Neither the name of the César D. Rodas nor the |
|
|
| names of its contributors may be used to endorse or promote products |
|
|
| derived from this software without specific prior written permission. |
|
|
| |
|
|
| THIS SOFTWARE IS PROVIDED BY CÉSAR D. RODAS ''AS IS'' AND ANY |
|
|
| EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
|
|
| WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
|
|
| DISCLAIMED. IN NO EVENT SHALL CÉSAR D. RODAS BE LIABLE FOR ANY |
|
|
| DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
|
|
| (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
|
|
| LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
|
|
| ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
|
|
| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
|
|
| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE |
|
|
+---------------------------------------------------------------------------------+
|
|
| Authors: César Rodas <crodas@php.net> |
|
|
+---------------------------------------------------------------------------------+
|
|
*/
|
|
namespace crodas\TextRank;
|
|
|
|
use LanguageDetector\Detect;
|
|
|
|
/**
|
|
* Data files have been borrowed from
|
|
* https://github.com/ekorn/Keywords
|
|
*/
|
|
class Stopword extends DefaultEvents
{
    /**
     * Stop-word table for the language detected by the last get_words() call.
     * It is indexed by lower-cased word; a non-empty entry marks a stop word.
     *
     * @var array|null
     */
    protected $stopword;

    /**
     * Language identifier returned by the detector in the last get_words() call.
     *
     * @var string|null
     */
    protected $lang;

    /**
     * Normalizes the keywords through the parent implementation and, when a
     * function named "stem_<lang>" is callable, applies it to every keyword.
     *
     * @param array $keywords Keywords to normalize.
     *
     * @return array The parent's normalized keywords, stemmed when a stemmer
     *               for the current language is available.
     */
    public function normalize_keywords(Array $keywords)
    {
        $normalized = parent::normalize_keywords($keywords);
        $callback   = "stem_{$this->lang}";

        // No stemmer for this language: keep the parent's result untouched.
        if (!is_callable($callback)) {
            return $normalized;
        }

        return array_map(function ($keyword) use ($callback) {
            return $callback($keyword);
        }, $normalized);
    }

    /**
     * Filters the keywords through the parent implementation, then through the
     * language specific POS tagger (when such a class exists) and finally drops
     * every word found in the stop-word table of the current language.
     *
     * @param array $keywords Keywords to filter.
     *
     * @return array The remaining keywords; array_filter() preserves their keys.
     */
    public function filter_keywords(Array $keywords)
    {
        $keywords = parent::filter_keywords($keywords);
        $tagger   = __NAMESPACE__ . '\POS\\' . ucfirst($this->lang) . '\Tagger';

        if (class_exists($tagger)) {
            $keywords = $tagger::get($keywords);
        }

        // Closures do not see the enclosing scope automatically: the table has
        // to be imported with use(), otherwise the lookup below would always
        // hit an undefined variable and no stop word would ever be removed.
        $stopwords = $this->stopword;

        return array_filter($keywords, function ($word) use ($stopwords) {
            // The table is looked up with the lower-cased form of the word.
            return empty($stopwords[mb_strtolower($word)]);
        });
    }

    /**
     * Returns the language detector, building it from the language-profile.php
     * file located next to this file on the first call and reusing it afterwards.
     *
     * @return mixed Whatever Detect::initByPath() returns.
     */
    protected function getClassifier()
    {
        static $detect;

        if (empty($detect)) {
            $detect = Detect::initByPath(__DIR__ . '/language-profile.php');
        }

        return $detect;
    }

    /**
     * Returns the stop-word tables, loading Stopword/Stopword.php on the first
     * call and reusing the loaded value afterwards.
     *
     * @return mixed The value returned by Stopword/Stopword.php; get_words()
     *               indexes it by language identifier.
     */
    protected function getStopwords()
    {
        static $stopwords;

        if (empty($stopwords)) {
            $stopwords = require __DIR__ . '/Stopword/Stopword.php';
        }

        return $stopwords;
    }

    /**
     * Detects the language of the text, selects the matching stop-word table
     * and delegates the word extraction to the parent implementation.
     *
     * @param string $text Text to analyze.
     *
     * @return mixed Whatever parent::get_words() returns.
     *
     * @throws \RuntimeException When the language cannot be detected or when
     *                           there is no stop-word table for it.
     */
    public function get_words($text)
    {
        $detect    = $this->getClassifier();
        $stopwords = $this->getStopwords();

        $lang = $detect->detect($text);
        if (!is_string($lang)) {
            throw new \RuntimeException("Cannot detect the language of the text");
        }

        if (empty($stopwords[$lang])) {
            throw new \RuntimeException("We dont have an stop word for {$lang}, please add it in " . __DIR__ . "/Stopword/{$lang}-stopwords.txt and run generate.php");
        }

        // Remember the selection: normalize_keywords() and filter_keywords()
        // read these properties.
        $this->stopword = $stopwords[$lang];
        $this->lang     = $lang;

        return parent::get_words($text);
    }
}
|