You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
information-system/application/third_party/tripadvisor_spider/controllers/index.php

498 lines
18 KiB
PHP

<?php
// Abort when the script is requested directly: the rest of the file only runs if
// BASEPATH has already been defined (presumably by the framework bootstrap).
if (!defined('BASEPATH'))
exit('No direct script access allowed');
// Load the third-party class used to parse HTML.
// NOTE(review): the leading slash makes this an absolute filesystem path, so
// include_path is not searched for it - confirm this matches the deployment layout.
require '/lib/simple_html_dom.php';
class Index extends CI_Controller {
/**
 * Runs the parent constructor, sends the CORS response headers and loads the
 * Tripadvisor_Review_model used by every action of this controller.
 */
public function __construct() {
    parent::__construct();
    // Sent in exactly this order for every request that reaches the controller.
    // NOTE(review): a wildcard origin combined with Allow-Credentials:true is
    // refused by browsers for credentialed requests - confirm credentials are needed.
    $cors_headers = array(
        'Access-Control-Allow-Origin:*',
        'Access-Control-Allow-Methods:POST, GET',
        'Access-Control-Max-Age:0',
        'Access-Control-Allow-Headers:x-requested-with, Content-Type',
        'Access-Control-Allow-Credentials:true',
    );
    foreach ($cors_headers as $header_line) {
        header($header_line);
    }
    $this->load->model('Tripadvisor_Review_model');
}
/**
 * Renders the review list page for one city.
 *
 * Calls the permission check first, then asks the model for up to 200 reviews
 * of the city and passes them to the header view.
 *
 * @param string $city City passed to the model search; defaults to 'Beijing'.
 */
public function index($city = 'Beijing') {
    $this->permission->is_admin();
    $view_data = array(
        'city' => $city,
        'ta_review_list' => $this->Tripadvisor_Review_model->search(200, $city),
    );
    $this->load->view('bootstrap3/header', $view_data);
    $this->load->view('welcome');
    $this->load->view('bootstrap3/footer');
}
/**
 * Scrapes the review listing pages configured for one city and stores every
 * review id that the model does not know yet.
 *
 * For each national TripAdvisor host the configured URL template is rewritten to
 * that host, its {PAGENUM} placeholder is filled in, the page is downloaded and
 * every '.reviewSelector' element is turned into a Tripadvisor_Review row
 * (city, title, review id, timestamp). One line per inserted row is echoed.
 *
 * @param string $city Key of the 'tripadvisor_website' config entry to scrape.
 * @return void
 */
function auto_update($city = 'Beijing') {
    ini_set('max_execution_time', '180');
    $ta_website = $this->config->item('tripadvisor_website');
    if (!is_array($ta_website)) {
        // Nothing configured, so there is nothing to iterate over.
        return;
    }
    $nation_mark = array('www.tripadvisor.com', 'www.tripadvisor.it', 'www.tripadvisor.jp', 'www.tripadvisor.es', 'www.tripadvisor.fr', 'www.tripadvisor.de');
    foreach ($ta_website as $key_city => $item_url) {
        if ($key_city != $city) {
            continue;
        }
        // Collect the reviews of every national site.
        foreach ($nation_mark as $nation_item) {
            $site_url = str_replace('www.tripadvisor.com', $nation_item, $item_url);
            if ($nation_item == 'www.tripadvisor.com') {
                // English site: first three pages, requested deepest page first.
                $page_mark = array('-or20-', '-or10-', '-');
                // Go through the proxy; direct access is reported to be very slow.
                $site_url = str_replace('https://www.tripadvisor.com', 'http://47.74.2.87:5052', $site_url);
            } else {
                $page_mark = array('-');
            }
            foreach ($page_mark as $page_num) {
                // Build each page URL from the untouched template. Overwriting the
                // template would consume {PAGENUM} on the first pass and make every
                // later pass download the same page again.
                $page_url = str_replace('{PAGENUM}', $page_num, $site_url);
                $content = GET_HTTP($page_url);
                if (empty($content)) {
                    continue;
                }
                $html_object = str_get_html($content);
                if (!$html_object) {
                    // The parser returns false when it refuses the document.
                    continue;
                }
                foreach ($html_object->find('.reviewSelector') as $review_node) {
                    if (empty($review_node->id)) {
                        continue;
                    }
                    // The element id carries the review id behind a 'review_' prefix.
                    $tr_review_id = str_replace('review_', '', $review_node->id);
                    $title_node = $review_node->find('div.quote a', 0);
                    $tr_review_title = empty($title_node) ? '' : $title_node->plaintext;
                    $existing = $this->Tripadvisor_Review_model->detail($tr_review_id);
                    if (!empty($existing)) {
                        continue;
                    }
                    $Tripadvisor_Review_Data = new StdClass;
                    $Tripadvisor_Review_Data->tr_city = $key_city;
                    $Tripadvisor_Review_Data->tr_review_title = $tr_review_title;
                    $Tripadvisor_Review_Data->tr_review_id = $tr_review_id;
                    $Tripadvisor_Review_Data->tr_datetime = date('Y-m-d H:i:s', time());
                    $tr_id = $this->Tripadvisor_Review_model->add('Tripadvisor_Review', $Tripadvisor_Review_Data);
                    echo '<br/>' . $tr_id . ' ' . $key_city . ' ' . $tr_review_id;
                }
            }
        }
    }
}
//获取内容更新
/**
 * Downloads the expanded text of up to ten stored reviews and writes the member,
 * content and date fields back through the model.
 *
 * The review ids returned by update_list(10) are joined into one
 * ExpandedUserReviews request. Each 'div .extended' element of the response
 * updates the row whose tr_review_id matches the element id without its 'UR'
 * prefix. The request URL and one line per updated review are echoed.
 *
 * @return bool|null true when there is nothing to update, FALSE when the page
 *                   could not be downloaded or parsed, null otherwise.
 */
function auto_update_content() {
    ini_set('max_execution_time', '100');
    $update_list = $this->Tripadvisor_Review_model->update_list(10);
    if (empty($update_list)) {
        echo 'all done';
        return true;
    }
    $review_ids = array();
    foreach ($update_list as $item) {
        $review_ids[] = $item->tr_review_id;
    }
    // The list is always terminated with a literal 0.
    $review_ids[] = '0';
    $tr_review_id_string = implode(',', $review_ids);
    $url = "https://www.tripadvisor.com/ExpandedUserReviews-g298556-d4418151?target=480111710&context=1&reviews=480111710,$tr_review_id_string&servlet=Attraction_Review&expand=1";
    echo $url . '<br/>';
    $content = GET_HTTP($url);
    if (empty($content)) {
        echo 'error gethttp:' . $url;
        return FALSE;
    }
    $html_object = str_get_html($content);
    if (!$html_object) {
        // The parser returns false when it refuses the document; calling
        // find() on that value would be a fatal error.
        echo 'error parse html:' . $url;
        return FALSE;
    }
    foreach ($html_object->find('div .extended') as $review) {
        if (empty($review->id)) {
            continue;
        }
        // Review id.
        $tr_review_id = str_replace('UR', '', $review->id);
        // Member id, e.g. UID_A50920FC5494D02709AA8F0E12294AAB-SRC_494596572:
        // the 32 characters after the 'UID_' prefix are kept.
        $member_node = $review->find('div.member_info div.memberOverlayLink', 0);
        if (isset($member_node) && isset($member_node->id)) {
            $tr_member_id = substr($member_node->id, 4, 32);
        } else {
            $tr_member_id = 0;
        }
        // Member name; the placeholder is used when the name is not shown.
        $name_node = $review->find('div.username span.expand_inline', 0);
        $tr_member_name = empty($name_node) ? 'A TripAdvisor Member' : $name_node->plaintext;
        // Review body.
        $content_node = $review->find('div.entry', 0);
        $tr_content = empty($content_node) ? '' : $content_node->innertext;
        // Visit date.
        $visited_node = $review->find('span.recommend-titleInline', 0);
        $tr_visited_date = empty($visited_node) ? '' : str_replace('Visited ', '', $visited_node->plaintext);
        // Review date: prefer the title attribute, fall back to the visible text.
        $rating_date_node = $review->find('span.ratingDate', 0);
        if (empty($rating_date_node)) {
            // No date element at all: store an empty date instead of reading
            // properties of a missing node.
            $tr_review_date = '';
        } elseif (empty($rating_date_node->title)) {
            $tr_review_date = str_replace('Reviewed ', '', $rating_date_node->innertext);
        } else {
            $tr_review_date = $rating_date_node->title;
        }
        $Tripadvisor_Review_Data = new StdClass;
        $Tripadvisor_Review_Data->tr_member_id = $tr_member_id;
        $Tripadvisor_Review_Data->tr_member_name = $tr_member_name;
        $Tripadvisor_Review_Data->tr_content = $tr_content;
        $Tripadvisor_Review_Data->tr_visited_date = $tr_visited_date;
        $Tripadvisor_Review_Data->tr_review_date = $tr_review_date;
        $where = array('tr_review_id' => $tr_review_id);
        $this->Tripadvisor_Review_model->update('Tripadvisor_Review', $Tripadvisor_Review_Data, $where);
        echo $tr_review_id . ' ' . $tr_member_id . ' ' . $tr_member_name . ' ' . $tr_content . ' ' . $tr_visited_date . ' ' . $tr_review_date . '<br/>';
    }
}
//分析评论,找出可能的团号和导游
/**
 * Looks for tour groups whose guide is mentioned in a stored review.
 *
 * Groups are requested from the model for the review's city and a window that
 * starts on the visited date and ends 31 days later. A group matches when its
 * non-blank GuideName occurs (case-insensitively) in the review content; each
 * occurrence is then wrapped in a highlighting span. The result is echoed as
 * JSON with the keys 'group_result' and 'tr_content'. When the review itself is
 * not found a plain-text message is echoed instead.
 *
 * @param mixed $tr_id Identifier handed to the model's detail_tr_id().
 * @return void
 */
public function analysis_ta_review($tr_id) {
    $review = $this->Tripadvisor_Review_model->detail_tr_id($tr_id);
    $data = array('ta_review' => $review);
    if (empty($review)) {
        echo '找不到评论内容';
        return;
    }

    $start_date = date('Y-m-d', strtotime($review->tr_visited_date));
    $end_date = date('Y-m-d', strtotime($start_date . ' +31 day'));
    $group_list = $this->Tripadvisor_Review_model->find_group($review->tr_city, $start_date, $end_date);
    if (empty($group_list)) {
        $no_match = array(
            'group_result' => '<span class="text-primary">没有找到匹配团信息</span>',
            'tr_content' => $review->tr_content,
        );
        echo json_encode($no_match);
        return;
    }

    $data['match_group_list'] = array();
    $mark_open = '<span class="bg-danger text-danger">';
    $mark_close = '</span>';
    foreach ($group_list as $item) {
        $guide_name = $item->GuideName;
        if (trim($guide_name) == '' || stripos($review->tr_content, $guide_name) === false) {
            continue;
        }
        // Attach the customers of the matched group.
        $item->customer_list = $this->Tripadvisor_Review_model->get_customer_info($item->coli_sn);
        // Record the group as a match.
        $data['match_group_list'][] = $item;
        // Strip an existing highlight of this name first so it is never wrapped twice,
        // then highlight every occurrence.
        $without_mark = str_ireplace($mark_open . $guide_name . $mark_close, $guide_name, $review->tr_content);
        $review->tr_content = str_ireplace($guide_name, $mark_open . $guide_name . $mark_close, $without_mark);
    }

    $group_html = $this->load->view('find_group_result', $data, true);
    echo json_encode(array('group_result' => $group_html, 'tr_content' => $review->tr_content));
}
//第三方数据导入
/**
 * Renders the third-party data import page between the shared header and footer views.
 *
 * @return void
 */
public function third_party_input(){
    $view_names = array('bootstrap3/header', 'third_party_input', 'bootstrap3/footer');
    foreach ($view_names as $view_name) {
        $this->load->view($view_name);
    }
}
/**
 * Makes sure a directory exists and can be written to.
 *
 * A missing path is created recursively and its mode is set to 0766 and then
 * 0777. An existing path that is not writable gets the same two chmod calls and
 * is checked again.
 *
 * @param string $dir Directory path.
 * @return void
 * @throws FileSystemException When an existing path is still not writable after
 *         the chmod calls. NOTE(review): this class is not declared in this
 *         file - verify that it is defined elsewhere in the project.
 */
function ensure_writable_dir($dir) {
    if (!file_exists($dir)) {
        mkdir($dir, 0766, true);
        chmod($dir, 0766);
        chmod($dir, 0777);
        return;
    }
    if (is_writable($dir)) {
        return;
    }
    chmod($dir, 0766);
    chmod($dir, 0777);
    if (!is_writable($dir)) {
        throw new FileSystemException("目录 $dir 不可写");
    }
}
//第三方数据录入
/**
 * Stores the uploaded spreadsheet ('fileArray') under upload/<year>/<month>/ and
 * echoes its sheets as JSON.
 *
 * For every sheet the output holds form_name (sheet title), list_name (row 1,
 * columns A-D) and list_data (columns A-D of every later row whose column A is
 * not empty). Failures are answered with a 404 status line and a JSON message.
 *
 * @return void
 */
public function analysis_excel(){
    if (!isset($_FILES['fileArray'])) {
        // No file field in the request at all.
        header("HTTP/1.1 404 Not Found");
        echo '{"status":404,"message":"文件上传失败!","picname":""}';
        return;
    }
    $upload = $_FILES['fileArray'];
    $error = $upload['error'];
    if ($error > 0) {
        header("HTTP/1.1 404 Not Found");
        echo '{"status":404,"message":' . $error . '}';
        return;
    }

    // Take the text after the LAST dot; splitting on every dot picks the wrong
    // part for names such as "report.2020.xlsx" and fails for names without a dot.
    $extension = strtolower(pathinfo($upload['name'], PATHINFO_EXTENSION));
    // The extension comes from the client and ends up in a file name below the
    // upload directory, so only spreadsheet extensions are accepted.
    $allowed_extensions = array('xls', 'xlsx', 'xlsm', 'csv', 'ods');
    if (!in_array($extension, $allowed_extensions, true)) {
        header("HTTP/1.1 404 Not Found");
        echo '{"status":404,"message":"不支持的文件类型!","picname":""}';
        return;
    }

    // 24-hour clock: with the 12-hour 'h' format, morning and afternoon uploads
    // could produce the same name.
    $filename = date('YmdHis') . '.' . $extension;
    $path = 'upload/' . date('Y') . '/' . date('m') . '/';
    $this->ensure_writable_dir($path);
    if (!move_uploaded_file($upload['tmp_name'], $path . $filename)) {
        header("HTTP/1.1 404 Not Found");
        echo '{"status":404,"message":"文件上传失败!","picname":""}';
        return;
    }

    require_once "PHPExcel/IOFactory.php";
    $phpExcel = PHPExcel_IOFactory::load($path . $filename);
    // Result array, one entry per sheet.
    $data = array();
    foreach ($phpExcel->getSheetNames() as $key => $form_name) {
        $sheet = $phpExcel->getSheet($key);
        $sheet_data = new stdClass();
        $sheet_data->form_name = $form_name;
        $sheet_data->list_name = array();
        $sheet_data->list_data = array();
        $row = $sheet->getHighestRow();
        // Only columns A-D are read.
        $column = 'D';
        // $j advances for EVERY row (header and skipped rows included), so the
        // keys of list_data stay exactly as callers have always received them.
        $j = 0;
        for ($i = 1; $i <= $row; $i++) {
            // Rows with an empty column A are skipped; checked once per row.
            $first_cell = $sheet->getCell('A' . $i)->getValue();
            if (!($first_cell == '')) {
                for ($c = 'A'; $c <= $column; $c++) {
                    $value = $sheet->getCell($c . $i)->getValue();
                    if ($i == 1) {
                        $sheet_data->list_name[] = $value;
                    } else {
                        $sheet_data->list_data[$j][] = $value;
                    }
                }
            }
            $j++;
        }
        $data[$key] = $sheet_data;
    }
    // Emit the processed sheets as JSON.
    print_r(json_encode($data));
}
//获取目的地
/**
 * Downloads one listing page and echoes the links found on it as JSON.
 *
 * Request fields: 'product_links' (URL template containing {PAGENUM}) and
 * 'pagenum' (optional; inserted as '-or<pagenum>', otherwise the placeholder is
 * removed). Output: {"urls": [...]} where every href matched by
 * '._1T1U92WJ ._2cigFICy a' is prefixed with the tripadvisor.com origin.
 * Nothing is echoed when 'product_links' is empty.
 *
 * NOTE(review): the URL to download is taken from the request - confirm this
 * endpoint is only reachable by trusted callers.
 *
 * @return void
 */
public function get_destination_reviews(){
    set_time_limit(0);
    $pagenum = $this->input->get_post('pagenum');
    $product_links = $this->input->get_post('product_links');
    if ($product_links == '') {
        return;
    }
    $page_token = ($pagenum != '') ? '-or' . $pagenum : '';
    $product_links = str_replace('{PAGENUM}', $page_token, $product_links);

    $data = new stdClass();
    $data->urls = array();
    $content = GET_HTTP($product_links);
    // The parser returns false for an empty or refused document; answer with an
    // empty list in that case instead of failing on find().
    $html_object = empty($content) ? false : str_get_html($content);
    if ($html_object) {
        foreach ($html_object->find('._1T1U92WJ ._2cigFICy a') as $key => $link) {
            $data->urls[$key] = 'https://www.tripadvisor.com' . $link->href;
        }
    }
    print_r(json_encode($data));
}
//获取产品内评论URL
/**
 * Downloads a product page and echoes the review links found on it as JSON.
 *
 * Request field: 'url' (product page). When it is missing or empty, the product
 * page that used to be hard-coded here is used, so existing callers that send
 * no URL still get the same answer. Output: {"urls": [...]} where every href
 * matched by '._1T1U92WJ ._2cigFICy a' is prefixed with the tripadvisor.com origin.
 *
 * NOTE(review): the URL to download is taken from the request - confirm this
 * endpoint is only reachable by trusted callers.
 *
 * @return void
 */
public function get_reviews_url(){
    set_time_limit(0);
    $url = $this->input->get_post('url');
    if ($url == '') {
        // Previously this value unconditionally replaced the request parameter.
        $url = 'https://www.tripadvisor.com/AttractionProductReview-g294212-d11463418-Mini_Group_2_Day_Beijing_Highlights_and_Great_Wall_Tour-Beijing.html';
    }

    $data = new stdClass();
    $data->urls = array();
    $content = GET_HTTP($url);
    // The parser returns false for an empty or refused document; answer with an
    // empty list in that case instead of failing on find().
    $html_object = empty($content) ? false : str_get_html($content);
    if ($html_object) {
        foreach ($html_object->find('._1T1U92WJ ._2cigFICy a') as $key => $link) {
            $data->urls[$key] = 'https://www.tripadvisor.com' . $link->href;
        }
    }
    print_r(json_encode($data));
}
//查看抓取到的所有信息
/**
 * Echoes everything the model returns for a destination as JSON: {"list": ...}.
 * Nothing is echoed for an empty destination.
 *
 * @param string $destination Destination handed to the model's get_all_reviews().
 * @return void
 */
public function get_all_reviews($destination){
    if ($destination == '') {
        return;
    }
    $payload = array(
        'list' => $this->Tripadvisor_Review_model->get_all_reviews($destination),
    );
    echo json_encode($payload);
}
/**
 * Downloads one review page, extracts its details, stores them through the
 * model's add_reviews() and echoes the stored record as JSON.
 *
 * Request fields: 'url' (review page), 'destination', 'html_num', 'group_name',
 * 'guidename', 'product_code'; all but 'url' are copied into the record as-is.
 * Nothing happens when 'url' is empty. When the page cannot be downloaded or
 * parsed, a 502 status line with a JSON message is sent and nothing is stored.
 *
 * @return void
 */
function get_reviews_detail(){
    set_time_limit(0);
    $url = $this->input->get_post('url');
    $destination = $this->input->get_post('destination');
    $html_num = $this->input->get_post('html_num');
    $group_name = $this->input->get_post('group_name');
    $guidename = $this->input->get_post('guidename');
    $product_code = $this->input->get_post('product_code');
    if ($url == '') {
        return;
    }

    $page_source = GET_HTTP($url);
    // The parser returns false for an empty or refused document.
    $html_object = empty($page_source) ? false : str_get_html($page_source);
    if (!$html_object) {
        header('HTTP/1.1 502 Bad Gateway');
        echo '{"status":502,"message":"页面抓取失败!"}';
        return;
    }

    // Record that is stored and echoed.
    $detail_data = new stdClass();
    $detail_data->destination = $destination;
    $detail_data->group_name = $group_name;
    $detail_data->links = $url;
    $detail_data->guidename = $guidename;
    $detail_data->product_code = $product_code;
    $detail_data->user_loc = '';
    $detail_data->pic = array();

    // Search inside the '.meta_inner' fragments only instead of the whole page.
    // For every field the last matching element wins.
    foreach ($html_object->find('.meta_inner') as $detail_info) {
        // Id of this record as supplied by the caller.
        $detail_data->html_id = $html_num;
        // Reviewer account name: first child of the info text.
        foreach ($detail_info->find('.info_text') as $info_text) {
            $name_node = $info_text->first_child();
            if (!empty($name_node)) {
                $detail_data->review_name = $name_node->innertext;
            }
        }
        // Reviewer location.
        foreach ($detail_info->find('.info_text .userLoc strong') as $user_loc) {
            $detail_data->user_loc = $user_loc->innertext;
        }
        // Review date, taken from the title attribute.
        foreach ($detail_info->find('.ratingDate') as $rating_date) {
            $detail_data->rating_date = date('Y-m-d', strtotime($rating_date->title));
        }
        // Review id: element id without the 'review_' prefix.
        foreach ($detail_info->find('.reviewSelector') as $review_node) {
            $detail_data->review_id = str_replace('review_', '', $review_node->id);
        }
        // Title.
        foreach ($detail_info->find('#HEADING') as $title) {
            $detail_data->title = $title->innertext;
        }
        // Star rating: the bubble_NN class is mapped to 1-5. bubble_10 was
        // missing from the mapping, which left one-star reviews unconverted.
        foreach ($detail_info->find('.ui_bubble_rating') as $rating_node) {
            $rating_class = str_replace('ui_bubble_rating ', '', $rating_node->getAttribute('class'));
            $detail_data->star_nums = str_replace(
                array('bubble_50', 'bubble_40', 'bubble_30', 'bubble_20', 'bubble_10'),
                array(5, 4, 3, 2, 1),
                $rating_class
            );
        }
        // Review text.
        foreach ($detail_info->find('.partial_entry .fullText') as $text_node) {
            $detail_data->content = $text_node->innertext;
        }
        // Date of experience: the label markup is stripped before parsing.
        foreach ($detail_info->find('.prw_reviews_stay_date_hsx') as $experience_date) {
            $date_text = str_replace('<span class="stay_date_label">Date of experience:</span> ', '', $experience_date->innertext);
            $detail_data->experience_date = date('Y-m-d', strtotime($date_text));
        }
        // Pictures: 'photo-l' in the source URL is swapped for 'photo-s'.
        foreach ($detail_info->find('.imgWrap .noscript') as $image_node) {
            $detail_data->pic[] = str_replace('photo-l', 'photo-s', $image_node->src);
        }
    }

    // Persist the collected data, then echo it.
    $this->Tripadvisor_Review_model->add_reviews($detail_data);
    print_r(json_encode($detail_data));
}
/**
 * Echoes the latest stored reviews of a destination as JSON.
 *
 * Request fields: 'destination' (required), 'productCode' (optional) and 'num'
 * (optional, defaults to 5). Without a product code the destination must be a
 * key of the 'tripadvisor_website' config item, and its URL with the {PAGENUM}
 * placeholder removed is returned as 'list_url'. The reviews are returned as
 * 'list_reviews'. Invalid input ends the request with a 404 status line and a
 * JSON reason.
 *
 * @return void
 */
public function ajax_get_reviews(){
    $destination = $this->input->get_post('destination');
    if (empty($destination)) {
        header("HTTP/1.1 404 Not Found");
        exit('{"status":"404","reason":"请输入站点代码!"}');
    }

    $product_code = $this->input->get_post('productCode');
    $num = $this->input->get_post("num");
    // A missing field may arrive as NULL or FALSE depending on the framework
    // version, and isset() is true for FALSE, so test the values explicitly.
    if ($num === null || $num === false || $num === '') {
        $num = 5;
    }

    $return_data = array();
    if (empty($product_code)) {
        $ta_website = $this->config->item('tripadvisor_website');
        if (empty($ta_website[$destination])) {
            header("HTTP/1.1 404 Not Found");
            exit('{"status":"404","reason":"请输入正确的站点代码!"}');
        }
        $return_data['list_url'] = str_replace('{PAGENUM}', '', $ta_website[$destination]);
    }
    // Fetch the reviews of the destination.
    $return_data['list_reviews'] = $this->Tripadvisor_Review_model->get_fivelatestreviews($destination, $product_code, $num);
    print_r(json_encode($return_data));
}
/**
 * Echoes, as JSON, what the model's get_productions_info() returns for the
 * 'destination' request field. Nothing is echoed when the field is empty.
 *
 * @return void
 */
public function get_production_code(){
    $destination = $this->input->get_post('destination');
    if ($destination == '') {
        return;
    }
    echo json_encode($this->Tripadvisor_Review_model->get_productions_info($destination));
}
/**
 * Stores a product link configuration through the model.
 *
 * Request fields: 'config_destination', 'production_code' and
 * 'production_link'. The {PAGENUM} placeholder is appended to the third
 * dash-separated segment of the link before it is handed to
 * add_config_production(). The request always ends with exit(): a JSON success
 * message, or a plain-text prompt when a field is missing or the link has fewer
 * than three dash-separated segments.
 *
 * @return void
 */
public function add_production(){
    $config_destination = $this->input->get_post('config_destination');
    $production_code = $this->input->get_post('production_code');
    $production_link = $this->input->get_post('production_link');
    // Every field is required; answer explicitly instead of ending silently
    // when only the destination or the code is missing.
    if ($production_link == '' || !$config_destination || !$production_code) {
        exit('请输入参数');
    }

    // Add the paging placeholder.
    $segments = explode('-', $production_link);
    if (count($segments) < 3) {
        // There is no third segment to carry the placeholder.
        exit('请输入参数');
    }
    $segments[2] .= '{PAGENUM}';
    $production_link = implode('-', $segments);

    $this->Tripadvisor_Review_model->add_config_production($config_destination, $production_code, $production_link);
    exit('{"status":"200","reason":"添加成功"}');
}
/**
 * Passes the 'config_id' request field to the model's delete_config().
 * Does nothing when the field is empty.
 *
 * @return void
 */
public function deleteconfig(){
    $configId = $this->input->get_post('config_id');
    if (empty($configId)) {
        return;
    }
    $this->Tripadvisor_Review_model->delete_config($configId);
}
}