优化 TripAdvisor(TA)数据采集:按各国家站点分别采集评论;将 HTML 转换的大小阈值 MAX_FILE_SIZE 从 600000 调大到 6000000

hotfix/远程访问多媒体中心
尹诚诚 8 years ago
parent 3b21e3a85f
commit a996da044a

@ -25,44 +25,52 @@ class Index extends CI_Controller {
}
/**
 * Crawls TripAdvisor review listings for one configured city and stores every
 * review that is not in the database yet.
 *
 * The 'tripadvisor_website' config item is iterated as city => URL template;
 * the template is expected to contain a {PAGENUM} placeholder. For the matching
 * city each national TripAdvisor site is fetched: the English (.com) site is
 * requested through a proxy host and paged three times, the other sites are
 * fetched for their first page only. Each newly stored review is echoed as an
 * HTML line ("<br/>" + new row id, city, review id).
 *
 * @param string $city Key of the city in the 'tripadvisor_website' config item.
 * @return void
 */
function auto_update($city = 'Beijing') {
    // Several remote pages are fetched per run, so raise the execution time limit.
    ini_set('max_execution_time', '180');
    $ta_website = $this->config->item('tripadvisor_website');
    $nation_mark = array('www.tripadvisor.com', 'www.tripadvisor.it', 'www.tripadvisor.jp', 'www.tripadvisor.es', 'www.tripadvisor.fr', 'www.tripadvisor.de');

    foreach ($ta_website as $key_city => $item_url) {
        if ($key_city != $city) {
            continue;
        }

        // Collect the reviews from every national site.
        foreach ($nation_mark as $nation_item) {
            $nation_url = str_replace('www.tripadvisor.com', $nation_item, $item_url);
            if ($nation_item == 'www.tripadvisor.com') {
                // English site: query the first three pages in reverse order,
                // so the earliest reviews are processed last.
                $page_mark = array('-or20-', '-or10-', '-');
                // Go through the proxy; direct access from mainland China is very slow.
                $nation_url = str_replace('https://www.tripadvisor.com', 'http://47.91.16.199:5052', $nation_url);
            } else {
                $page_mark = array('-');
            }

            foreach ($page_mark as $page_num) {
                // Substitute into a fresh variable: writing the result back into the
                // template would consume {PAGENUM} on the first pass and every later
                // iteration would request the same page again.
                $page_url = str_replace('{PAGENUM}', $page_num, $nation_url);
                $content = GET_HTTP($page_url);
                if (empty($content)) {
                    continue;
                }

                // str_get_html() gives back a falsy value when it declines to build
                // a DOM (for instance when the document is larger than its size
                // limit); calling find() on that would be a fatal error.
                $html_object = str_get_html($content);
                if (empty($html_object)) {
                    continue;
                }

                foreach ($html_object->find('.reviewSelector') as $review) {
                    // The element id carries the review id ("review_<id>").
                    if (empty($review->id)) {
                        continue;
                    }
                    $tr_review_id = str_replace('review_', '', $review->id);

                    $title_node = $review->find('div.quote a', 0);
                    $tr_review_title = empty($title_node) ? '' : $title_node->plaintext;

                    // Kept in its own variable so the loop's $review node is not clobbered.
                    $existing = $this->Tripadvisor_Review_model->detail($tr_review_id);
                    if (!empty($existing)) {
                        continue;
                    }

                    $Tripadvisor_Review_Data = new StdClass;
                    $Tripadvisor_Review_Data->tr_city = $key_city;
                    $Tripadvisor_Review_Data->tr_review_title = $tr_review_title;
                    $Tripadvisor_Review_Data->tr_review_id = $tr_review_id;
                    $Tripadvisor_Review_Data->tr_datetime = date('Y-m-d H:i:s', time());
                    $tr_id = $this->Tripadvisor_Review_model->add('Tripadvisor_Review', $Tripadvisor_Review_Data);
                    echo '<br/>' . $tr_id . ' ' . $key_city . ' ' . $tr_review_id;
                }

                // Release the parsed document before fetching the next page; the
                // parser's node graph is circular and otherwise lingers in memory.
                $html_object->clear();
                unset($html_object);
            }
        }
    }
}

@ -62,7 +62,7 @@ define('HDOM_INFO_ENDSPACE',7);
// Parser defaults. NOTE(review): going by the names these are the target charset
// and the plain text substituted for <br> and <span>; their use is outside this excerpt.
define('DEFAULT_TARGET_CHARSET', 'UTF-8');
define('DEFAULT_BR_TEXT', "\r\n");
define('DEFAULT_SPAN_TEXT', " ");
// Size limit for the HTML handed to the parser (presumably compared against the
// string length — confirm where the constant is read). It is defined exactly once:
// define() ignores a second definition of an existing name and only emits a
// warning, so an earlier 600000 definition would silently keep the smaller limit.
define('MAX_FILE_SIZE', 6000000);
// helper functions
// -----------------------------------------------------------------------------
// get html dom from file

Loading…
Cancel
Save