优化ta数据采集,html转换大小阈值修改

hotfix/远程访问多媒体中心
尹诚诚 8 years ago
parent 3b21e3a85f
commit a996da044a

@ -25,44 +25,52 @@ class Index extends CI_Controller {
} }
/**
 * Scrape TripAdvisor review listings for one city and store unseen reviews.
 *
 * Looks up the listing URL configured for $city under the
 * 'tripadvisor_website' config item, fetches it from each national
 * TripAdvisor site, and for every review element found inserts a
 * Tripadvisor_Review row unless the model already has that review id.
 * Each inserted row is echoed as an HTML line for progress feedback.
 *
 * @param string $city Key of the city in the 'tripadvisor_website' config.
 * @return void
 */
function auto_update($city = 'Beijing') {
    ini_set('max_execution_time', '180');
    $ta_website = $this->config->item('tripadvisor_website');
    $nation_mark = array('www.tripadvisor.com', 'www.tripadvisor.it', 'www.tripadvisor.jp', 'www.tripadvisor.es', 'www.tripadvisor.fr', 'www.tripadvisor.de');

    foreach ($ta_website as $key_city => $item_url) {
        if ($key_city != $city) {
            continue;
        }

        // Collect the reviews from every national site.
        foreach ($nation_mark as $nation_item) {
            // Keep the {PAGENUM} placeholder in a template so that each page
            // below is built from it; substituting into the same variable
            // would consume the placeholder on the first page and make every
            // later iteration fetch that same page again.
            $url_template = str_replace('www.tripadvisor.com', $nation_item, $item_url);

            if ($nation_item == 'www.tripadvisor.com') {
                // English site: read the first three pages, deepest page first
                // (presumably so that older reviews are stored before newer ones).
                $page_mark = array('-or20-', '-or10-', '-');
                // Go through the proxy; direct access from mainland China is very slow.
                $url_template = str_replace('https://www.tripadvisor.com', 'http://47.91.16.199:5052', $url_template);
            } else {
                // Other national sites: first page only.
                $page_mark = array('-');
            }

            foreach ($page_mark as $page_num) {
                $page_url = str_replace('{PAGENUM}', $page_num, $url_template);
                $content = GET_HTTP($page_url);
                if (empty($content)) {
                    continue;
                }

                // str_get_html() yields false instead of a DOM when the markup
                // is larger than MAX_FILE_SIZE; skip the page rather than
                // calling find() on a non-object.
                $html_object = str_get_html($content);
                if (!$html_object) {
                    continue;
                }

                foreach ($html_object->find('.reviewSelector') as $review) {
                    // The element id carries the review id ("review_<id>").
                    if (empty($review->id)) {
                        continue;
                    }
                    $tr_review_id = str_replace('review_', '', $review->id);

                    $title_node = $review->find('div.quote a', 0);
                    $tr_review_title = empty($title_node) ? '' : $title_node->plaintext;

                    // Only insert reviews the model does not know yet.
                    $existing = $this->Tripadvisor_Review_model->detail($tr_review_id);
                    if (!empty($existing)) {
                        continue;
                    }

                    $Tripadvisor_Review_Data = new StdClass;
                    $Tripadvisor_Review_Data->tr_city = $key_city;
                    $Tripadvisor_Review_Data->tr_review_title = $tr_review_title;
                    $Tripadvisor_Review_Data->tr_review_id = $tr_review_id;
                    $Tripadvisor_Review_Data->tr_datetime = date('Y-m-d H:i:s', time());
                    $tr_id = $this->Tripadvisor_Review_model->add('Tripadvisor_Review', $Tripadvisor_Review_Data);
                    echo '<br/>' . $tr_id . ' ' . $key_city . ' ' . $tr_review_id;
                }

                // Release the parsed DOM before loading the next page.
                $html_object->clear();
                unset($html_object);
            }
        }
    }
}

@ -62,7 +62,7 @@ define('HDOM_INFO_ENDSPACE',7);
// Defaults used by the HTML parser; each constant is defined exactly once,
// because a repeated define() keeps the first value and raises a warning.
define('DEFAULT_TARGET_CHARSET', 'UTF-8');
define('DEFAULT_BR_TEXT', "\r\n");
define('DEFAULT_SPAN_TEXT', " ");
// Size threshold for HTML input handed to the parser (presumably in bytes —
// confirm against the parser code); raised from 600000 to 6000000 so that
// larger pages are still converted.
define('MAX_FILE_SIZE', 6000000);
// helper functions // helper functions
// ----------------------------------------------------------------------------- // -----------------------------------------------------------------------------
// get html dom from file // get html dom from file

Loading…
Cancel
Save