From a996da044ad00b424abbf40e9ef0222c0f4c1bc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=B9=E8=AF=9A=E8=AF=9A?= Date: Thu, 21 Sep 2017 11:00:43 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96ta=E6=95=B0=E6=8D=AE=E9=87=87?= =?UTF-8?q?=E9=9B=86=EF=BC=8Chtml=E8=BD=AC=E6=8D=A2=E5=A4=A7=E5=B0=8F?= =?UTF-8?q?=E9=98=88=E5=80=BC=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../tripadvisor_spider/controllers/index.php | 76 ++++++++++--------- lib/simple_html_dom.php | 2 +- 2 files changed, 43 insertions(+), 35 deletions(-) diff --git a/application/third_party/tripadvisor_spider/controllers/index.php b/application/third_party/tripadvisor_spider/controllers/index.php index eaffa83e..4697c747 100644 --- a/application/third_party/tripadvisor_spider/controllers/index.php +++ b/application/third_party/tripadvisor_spider/controllers/index.php @@ -25,44 +25,52 @@ class Index extends CI_Controller { } function auto_update($city = 'Beijing') { - ini_set('max_execution_time', '100'); + ini_set('max_execution_time', '180'); $ta_website = $this->config->item('tripadvisor_website'); - //分页代码,只查询前三页,反过来查询,越早的越在后面 - $page_mark = array('-or20-', '-or10-', '-'); + $nation_mark=array('www.tripadvisor.com','www.tripadvisor.it','www.tripadvisor.jp','www.tripadvisor.es','www.tripadvisor.fr','www.tripadvisor.de'); foreach ($ta_website as $key_city => $item_url) { if ($key_city == $city) { - foreach ($page_mark as $page_num) { - $page_url = str_replace('{PAGENUM}', $page_num, $item_url); - //使用代理来请求,国内直接访问会很慢 - $page_url=str_replace('https://www.tripadvisor.com', 'http://47.91.16.199:5052', $page_url); - $content = GET_HTTP($page_url); - if (!empty($content)) { - $html_object = str_get_html($content); - foreach ($html_object->find('.reviewSelector') as $review) { - //获取到评论ID - if (!empty($review->id)) { - $tr_review_id = str_replace('review_', '', $review->id); - $tr_review_title = $review->find('div.quote a', 0); - if (empty($tr_review_title)) { - $tr_review_title = ''; - } else { - $tr_review_title = $tr_review_title->plaintext; - } - $review = $this->Tripadvisor_Review_model->detail($tr_review_id); - if (empty($review)) { - $Tripadvisor_Review_Data = new StdClass; - $Tripadvisor_Review_Data->tr_city = $key_city; - $Tripadvisor_Review_Data->tr_review_title = $tr_review_title; - $Tripadvisor_Review_Data->tr_review_id = $tr_review_id; - $Tripadvisor_Review_Data->tr_datetime = date('Y-m-d H:i:s', time()); - $tr_id = $this->Tripadvisor_Review_model->add('Tripadvisor_Review', $Tripadvisor_Review_Data); - echo '
' . $tr_id . ' ' . $key_city . ' ' . $tr_review_id; - } - } - } - } - } + //采集各个国家的评论 + foreach($nation_mark as $nation_item){ + $page_url = str_replace('www.tripadvisor.com', $nation_item, $item_url); + if($nation_item=='www.tripadvisor.com'){//分页代码,英文站点查询前三页,反过来查询,越早的越在后面 + $page_mark = array('-or20-', '-or10-', '-'); + //使用代理来请求,国内直接访问会很慢 + $page_url=str_replace('https://www.tripadvisor.com', 'http://47.91.16.199:5052', $page_url); + }else{ + $page_mark = array('-'); + } + foreach ($page_mark as $page_num) { + $page_url = str_replace('{PAGENUM}', $page_num, $page_url); + $content = GET_HTTP($page_url); + if (!empty($content)) { + $html_object = str_get_html($content); + foreach ($html_object->find('.reviewSelector') as $review) { + //获取到评论ID + if (!empty($review->id)) { + $tr_review_id = str_replace('review_', '', $review->id); + $tr_review_title = $review->find('div.quote a', 0); + if (empty($tr_review_title)) { + $tr_review_title = ''; + } else { + $tr_review_title = $tr_review_title->plaintext; + } + $review = $this->Tripadvisor_Review_model->detail($tr_review_id); + if (empty($review)) { + $Tripadvisor_Review_Data = new StdClass; + $Tripadvisor_Review_Data->tr_city = $key_city; + $Tripadvisor_Review_Data->tr_review_title = $tr_review_title; + $Tripadvisor_Review_Data->tr_review_id = $tr_review_id; + $Tripadvisor_Review_Data->tr_datetime = date('Y-m-d H:i:s', time()); + $tr_id = $this->Tripadvisor_Review_model->add('Tripadvisor_Review', $Tripadvisor_Review_Data); + echo '
' . $tr_id . ' ' . $key_city . ' ' . $tr_review_id; + } + } + } + } + } + } } } } diff --git a/lib/simple_html_dom.php b/lib/simple_html_dom.php index ce412794..7a95367e 100644 --- a/lib/simple_html_dom.php +++ b/lib/simple_html_dom.php @@ -62,7 +62,7 @@ define('HDOM_INFO_ENDSPACE',7); define('DEFAULT_TARGET_CHARSET', 'UTF-8'); define('DEFAULT_BR_TEXT', "\r\n"); define('DEFAULT_SPAN_TEXT', " "); -define('MAX_FILE_SIZE', 600000); +define('MAX_FILE_SIZE', 6000000); // helper functions // ----------------------------------------------------------------------------- // get html dom from file