From 3e25fa5d36ac1ce44b78509e8926bc5f3784ba20 Mon Sep 17 00:00:00 2001
From: cyc
Date: Thu, 23 May 2019 16:14:20 +0800
Subject: [PATCH] =?UTF-8?q?=E9=83=A8=E7=BD=B2TA=E6=8A=93=E5=8F=96=E4=BB=A3?=
=?UTF-8?q?=E7=A0=81=E5=88=B0=E7=BD=91=E5=89=8D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.../tripadvisor_spider/config/config.php | 8 +-
.../tripadvisor_spider/controllers/index.php | 61 +++++++--
.../models/Tripadvisor_Review_model.php | 34 +++++
.../views/third_party_input.php | 122 +++++++++++++++++-
4 files changed, 207 insertions(+), 18 deletions(-)
diff --git a/application/third_party/tripadvisor_spider/config/config.php b/application/third_party/tripadvisor_spider/config/config.php
index 022d61f8..3b452f64 100644
--- a/application/third_party/tripadvisor_spider/config/config.php
+++ b/application/third_party/tripadvisor_spider/config/config.php
@@ -18,10 +18,10 @@ $config['tripadvisor_website'] = array(
'Zhangjiajie' => 'http://www.tripadvisor.com/Attraction_Review-g494933-d8077695-Reviews{PAGENUM}China_Highlights_Zhangjiajie_Day_Tour-Zhangjiajie_Hunan.html',
'HongKong' => 'https://www.tripadvisor.com/Attraction_Review-g294217-d10243951-Reviews{PAGENUM}China_Highlights_Hong_Kong-Hong_Kong.html',
'Panda' => 'https://www.tripadvisor.com/Attraction_Review-g297463-d11489225-Reviews{PAGENUM}China_Highlights-Chengdu_Sichuan.html',
- 'tp_Beijing' => 'https://www.tripadvisor.com/Attraction_Review-g294212-d4006739-Reviews-The_Trippest_Mini_Group_Tours-Beijing.html',
- 'tp_Xian' => 'https://www.tripadvisor.com/Attraction_Review-g298557-d10999897-Reviews-Xi_an_Trippest_Mini_Group_Tours-Xi_an_Shaanxi.html',
- 'tp_Shanghai' => 'https://www.tripadvisor.com/Attraction_Review-g308272-d6222868-Reviews-Shanghai_Trippest_Mini_Group_Tours-Shanghai.html',
- 'tp_Guilin' => 'https://www.tripadvisor.com/Attraction_Review-g298556-d14121459-Reviews-Trippest_Mini_Group_Tours-Guilin_Guangxi.html'
+ 'tp_Beijing' => 'https://www.tripadvisor.com/Attraction_Review-g294212-d4006739-Reviews{PAGENUM}-The_Trippest_Mini_Group_Tours-Beijing.html',
+ 'tp_Xian' => 'https://www.tripadvisor.com/Attraction_Review-g298557-d10999897-Reviews{PAGENUM}-Xi_an_Trippest_Mini_Group_Tours-Xi_an_Shaanxi.html',
+ 'tp_Shanghai' => 'https://www.tripadvisor.com/Attraction_Review-g308272-d6222868-Reviews{PAGENUM}-Shanghai_Trippest_Mini_Group_Tours-Shanghai.html',
+ 'tp_Guilin' => 'https://www.tripadvisor.com/Attraction_Review-g298556-d14121459-Reviews{PAGENUM}-Trippest_Mini_Group_Tours-Guilin_Guangxi.html'
);
diff --git a/application/third_party/tripadvisor_spider/controllers/index.php b/application/third_party/tripadvisor_spider/controllers/index.php
index 6b7e266f..2b8124b6 100644
--- a/application/third_party/tripadvisor_spider/controllers/index.php
+++ b/application/third_party/tripadvisor_spider/controllers/index.php
@@ -267,24 +267,60 @@ class Index extends CI_Controller {
}
}
- public function get_destination_reviews($destination = null){
+ /**
+ * Fetch one review-list page for a destination and print the review URLs
+ * found on it as JSON in the form {"urls": [...]}. Nothing is printed when
+ * $destination is not a key of the 'tripadvisor_website' config item.
+ *
+ * @param string|null $destination key into the 'tripadvisor_website' config array
+ * @param string|null $pagenum value placed after "-or" in the list URL; when empty the {PAGENUM} placeholder is simply removed
+ * @return void
+ */
+ public function get_destination_reviews($destination = null,$pagenum = null){
+ // remove the execution time limit for this request
+ set_time_limit(0);
$ta_website = $this->config->item('tripadvisor_website');
//根据传入的目的地简码获取TA的相应评论列表
if(isset($ta_website[$destination])){
$url = $ta_website[$destination];
+ // fill in the {PAGENUM} placeholder of the configured URL
+ if($pagenum != ''){
+ $url = str_replace('{PAGENUM}','-or'.$pagenum,$url);
+ }else{
+ $url = str_replace('{PAGENUM}','',$url);
+ }
//根据url获取页面内容
$content = GET_HTTP($url);
-
//进行页面解析
$html_object = str_get_html($content);
-
- //获取第一页列表上的url
- foreach ($html_object->find('.reviewSelector .quote a') as $a_info){
- $url = 'https://www.tripadvisor.com'.$a_info->href;
-
+ $return = new stdClass();
+ $return->urls = array();
+ // str_get_html() yields no DOM object when the fetched content cannot be parsed
+ // (e.g. it is empty); fall back to an empty list instead of calling find() on a non-object
+ $review_links = is_object($html_object) ? $html_object->find('.reviewSelector .quote a') : array();
+ // collect the absolute URL of every review linked from the page
+ foreach ($review_links as $reviews_url){
+ array_push($return->urls,'https://www.tripadvisor.com'.$reviews_url->href);
}
+ print_r(json_encode($return));
+ }
+ }
+
+ /**
+ * Print every stored review of a destination as JSON in the form {"list": [...]}.
+ * Nothing is printed when $destination is empty.
+ *
+ * @param string $destination destination code handed to Tripadvisor_Review_model::get_all_reviews(); defaults to '' so a call without the argument reaches the empty check
+ * @return void
+ */
+ public function get_all_reviews($destination = ''){
+ if($destination != ''){
+ $all_reviews = $this->Tripadvisor_Review_model->get_all_reviews($destination);
+ $return = array();
+ $return['list'] = $all_reviews;
+ print_r(json_encode($return));
}
}
@@ -294,8 +310,8 @@ class Index extends CI_Controller {
$destination = $this->input->get_post('destination');
$html_num = $this->input->get_post('html_num');
- //$url = 'https://www.tripadvisor.com/ShowUserReviews-g294212-d4006739-r666168101-The_Trippest_Mini_Group_Tours-Beijing.html';
- $destination = 'tp_Beijing';
+ //$url = 'https://www.tripadvisor.com/ShowUserReviews-g308272-d6222868-r599123490-Shanghai_Trippest_Mini_Group_Tours-Shanghai.html';
+ //$destination = 'tp_Beijing';
if($url != ''){
$content = GET_HTTP($url);
@@ -307,15 +323,28 @@ class Index extends CI_Controller {
//提取局部,不做整个页面的寻找元素,提升效率
$meta_inner = $html_object->find('.meta_inner');
+ $detail_data->user_loc = '';
+ $detail_data->pic = array();
foreach($meta_inner as $detail_info){
//记录该条记录的id
$detail_data->html_id = $html_num;
+
//获取评论者帐号
foreach($detail_info->find('.info_text') as $review_name){
$detail_data->review_name = $review_name->first_child()->innertext;
}
+ // get the reviewer's location (.userLoc)
+ foreach($detail_info->find('.info_text .userLoc strong') as $user_loc){
+ $detail_data->user_loc = $user_loc->innertext;
+ }
+
+ //抓取评论时间
+ foreach($detail_info->find('.ratingDate') as $ratingDate){
+ $detail_data->rating_date = date('Y-m-d',strtotime($ratingDate->title));
+ }
+
//获取评论者ID
foreach($detail_info->find('.reviewSelector') as $review_id){
$detail_data->review_id = str_replace('review_','',$review_id->id);
@@ -337,13 +366,21 @@ class Index extends CI_Controller {
$detail_data->content = $content->innertext;
}
- //获取评论时间
- foreach($detail_info->find('.prw_reviews_stay_date_hsx') as $review_date){
- $detail_data->review_date = str_replace('Date of experience: ','',$review_date->innertext);
+ //获取体验时间
+ foreach($detail_info->find('.prw_reviews_stay_date_hsx') as $experience_date){
+ $detail_data->experience_date = date('Y-m-d',strtotime(str_replace('Date of experience: ','',$experience_date->innertext)));
+ }
+
+ //抓取图片
+ foreach($detail_info->find('.imgWrap .noscript') as $imgWrap){
+ $imgWrap->src = str_replace('photo-l','photo-s',$imgWrap->src);
+ array_push($detail_data->pic,$imgWrap->src);
}
}
//拿到数据后进行入库
+ $this->Tripadvisor_Review_model->add_reviews($detail_data);
+
print_r(json_encode($detail_data));
}
}
diff --git a/application/third_party/tripadvisor_spider/models/Tripadvisor_Review_model.php b/application/third_party/tripadvisor_spider/models/Tripadvisor_Review_model.php
index 25414fc7..d2f8cf21 100644
--- a/application/third_party/tripadvisor_spider/models/Tripadvisor_Review_model.php
+++ b/application/third_party/tripadvisor_spider/models/Tripadvisor_Review_model.php
@@ -161,5 +161,59 @@ class Tripadvisor_Review_model extends CI_Model {
$result = $query->result();
return $result;
}
+
+ /**
+ * Insert one scraped review into Ta_Reviews unless a row with the same
+ * tr_review_id already exists. tr_gri_no and tr_tgi_sn are written as empty
+ * strings and tr_datetime is filled by GETDATE().
+ *
+ * @param object $detail_data scraped review; the properties read are review_id, destination, title,
+ * content, review_name, user_loc, star_nums, rating_date, experience_date and pic (stored JSON-encoded)
+ * @return void
+ */
+ public function add_reviews($detail_data){
+ // skip the insert when no review id was scraped: without an id the duplicate
+ // check below has nothing to match on and each call would add another row
+ if(empty($detail_data->review_id)){
+ return;
+ }
+ $sql = "
+ IF NOT EXISTS(
+ select tr_review_id from Ta_Reviews where tr_review_id = ?
+ )
+ insert into Ta_Reviews
+ (
+ tr_destination,
+ tr_review_id,
+ tr_review_title,
+ tr_content,
+ tr_member_name,
+ tr_member_loc,
+ tr_member_starts,
+ tr_review_date,
+ tr_visited_date,
+ tr_review_pics,
+ tr_gri_no,
+ tr_tgi_sn,
+ tr_datetime
+ )values(
+ ?,?,?,?,?,?,?,?,?,?,?,?,GETDATE()
+ )
+ ";
+ // values are bound in placeholder order: the first feeds the NOT EXISTS lookup, the rest follow the column list
+ $this->INFO->query($sql, array($detail_data->review_id,$detail_data->destination,$detail_data->review_id,$detail_data->title,$detail_data->content,$detail_data->review_name,$detail_data->user_loc,$detail_data->star_nums,$detail_data->rating_date,$detail_data->experience_date,json_encode($detail_data->pic),'',''));
+ }
+
+ /**
+ * Return all Ta_Reviews rows of a destination, ordered by tr_review_date descending.
+ *
+ * @param string $destination value compared with tr_destination
+ * @return array rows as returned by the query's result()
+ */
+ public function get_all_reviews($destination){
+ $sql = 'select * from Ta_Reviews where tr_destination = ? order by tr_review_date desc';
+ $query = $this->INFO->query($sql,array($destination));
+ return $query->result();
+ }
}
diff --git a/application/third_party/tripadvisor_spider/views/third_party_input.php b/application/third_party/tripadvisor_spider/views/third_party_input.php
index 0643046e..eb84da06 100644
--- a/application/third_party/tripadvisor_spider/views/third_party_input.php
+++ b/application/third_party/tripadvisor_spider/views/third_party_input.php
@@ -10,9 +10,15 @@
手动录入
+
+ 列表抓取
+
excel导入
+
+ 数据预览
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -70,6 +124,7 @@ $(function(){
//获取填写的url
var ta_url = $('#ta_url').val();
var stars = '';
+ var pic_htm = '';
if(ta_url == ''){
alert('请填写需要采集的TA地址');
}else{
@@ -80,8 +135,14 @@ $(function(){
var data = $.parseJSON(json);
console.log(data);
$('.ta_content').html(data.content);
- $('.review_date').html('Date of experience: '+data.review_date);
+ $('.experience_date').html('Date of experience: '+data.experience_date);
$('.review_name').html(data.review_name);
+ $('.user_loc').html(data.user_loc);
+ $('.rating_date').html('Reviewed:'+data.rating_date);
+ for(var i=0;i
';
+ }
+ $('.review_pic').html(pic_htm);
$('.ta_title').html(''+data.title+'');
if(data.star_nums){
for(var i=0;i';
+ html += '
'+jsondata.list[y].tr_member_name+'
'+jsondata.list[y].tr_member_loc+'
';
+ for(var i=0;i';
+ }
+ html += '
'+jsondata.list[y].tr_review_title+'
Reviewed: '+jsondata.list[y].tr_review_date+'
'+jsondata.list[y].tr_content+'
';
+ html += '
';
+
+ if($.parseJSON(jsondata.list[y].tr_review_pics).length > 0){
+ for(var j=0;j<$.parseJSON(jsondata.list[y].tr_review_pics).length;j++){
+ html += '
[j]+')
';
+ }
+ }
+
+ html += '
Date of experience: '+jsondata.list[y].tr_visited_date+'
';
+ }
+ html += 'total nums : '+jsondata.list.length+'
'
+ $('#list_view_content').html(html);
+ }
+ });
+ }
+ });
});
\ No newline at end of file