<?php
/**
 * Crawls crime statistics of foreign universities from the
 * ope.ed.gov "campussafety" JSON API.
 *
 * Usage: call spiderData() with a school name. The returned array has
 * 'code' (0 on success, -1 on failure), 'msg' (last error message or null)
 * and, on success, 'data' (see spiderData()).
 *
 * Class SchoolSpider
 */
class SchoolSpider
{
    // Name of the school being crawled by the current spiderData() call.
    private $schoolName;
    // Endpoint used to check that the school name is known.
    private $schoolApi = 'https://ope.ed.gov/campussafety/api/institution/names?filter=';
    // Endpoint used to search institutions and list their campuses.
    private $schoolSerachApi = 'https://ope.ed.gov/campussafety/api/institution/search';
    // Endpoint returning the detail data of a single campus.
    private $dataApi = 'https://ope.ed.gov/campussafety/api/campus/';
    // Endpoint returning the detail data of a whole institution.
    private $totalApi = 'https://ope.ed.gov/campussafety/api/institution/';
    // 'Results' list of the last successful institution search.
    private $campisesData;
    // 'Rows' list of the last successful detail request.
    private $dataDetail;
    // Last error message, or null when no error has been recorded.
    public $error;
    // Data collected by the current spiderData() call.
    public $result = [];

    public function __construct()
    {
        // Crawling issues many HTTP requests, so lift the script time limit.
        set_time_limit(0);
    }

    /**
     * Crawl all data for one school.
     *
     * State left over from a previous call on the same instance is discarded,
     * so every call returns only the data of the requested school.
     *
     * @param string $schoolName School name to look up.
     * @return array ['code' => -1, 'msg' => string] when the name check or the
     *               campus search fails; otherwise ['code' => 0, 'msg' => ?string,
     *               'data' => array] where 'data' holds 'school_name' plus one
     *               entry per UnitID. On success 'msg' carries the last
     *               non-fatal error (e.g. a campus whose detail request failed).
     */
    public function spiderData($schoolName)
    {
        // Reset per-run state; otherwise a second call would mix in the
        // previous school's campuses and report stale errors.
        $this->schoolName = $schoolName;
        $this->error = null;
        $this->campisesData = null;
        $this->dataDetail = null;
        $this->result = ['school_name' => $schoolName];

        if (!$this->checkName() || !$this->getCampises()) {
            return ['code' => -1, 'msg' => $this->error];
        }

        // Institution-wide data for every search result.
        $this->formatData($this->campisesData);
        // Per-campus data of the first search result (getCampises() guarantees
        // that this list exists and is not empty).
        $this->formatData($this->campisesData[0]['Campuses'], true);

        return ['code' => 0, 'msg' => $this->error, 'data' => $this->result];
    }

    /**
     * Fetch the detail rows for each entry and store them in $this->result,
     * keyed by the entry's UnitID.
     *
     * @param array $campisesData Entries carrying 'UnitID' and 'Name'.
     * @param bool  $isCampise    True to use the campus endpoint, false to use
     *                            the institution endpoint.
     * @return void
     */
    private function formatData($campisesData, $isCampise = false)
    {
        if (!is_array($campisesData)) {
            return;
        }

        foreach ($campisesData as $campise) {
            // Entries without a usable UnitID cannot be requested or stored.
            if (!is_array($campise) || !isset($campise['UnitID']) || !is_scalar($campise['UnitID'])) {
                continue;
            }
            $unitId = $campise['UnitID'];

            // Check for duplicates BEFORE fetching, so an already stored
            // UnitID does not cost another HTTP request.
            if (isset($this->result[$unitId])) {
                continue;
            }

            $fetched = $isCampise ? $this->getDetailCampuse($unitId) : $this->getDetail($unitId);
            if (!$fetched) {
                continue;
            }

            $this->result[$unitId]['campuse'] = isset($campise['Name']) ? $campise['Name'] : null;

            foreach ($this->dataDetail as $index => $row) {
                // The first row is skipped (presumably the table header).
                if ($index === 0) {
                    continue;
                }
                $hasCells = is_array($row) && isset($row['Cells']) && is_array($row['Cells']);
                $this->result[$unitId]['data'][] = $hasCells ? array_column($row['Cells'], 'Html') : [];
            }
        }
    }

    /**
     * Check that the name API knows the school name.
     *
     * @return bool True when the first match has a non-empty 'Code';
     *              false otherwise (and $this->error is set).
     */
    private function checkName()
    {
        $school = json_decode($this->httpRequest($this->schoolApi . urlencode($this->schoolName)), true);
        if (empty($school[0]['Code'])) {
            $this->error = '检查学校名 error';
            return false;
        }
        return true;
    }

    /**
     * Search the institution by name and store the result list in
     * $this->campisesData.
     *
     * @return bool True when the first result has a non-empty 'Campuses'
     *              list; false otherwise (and $this->error is set).
     */
    private function getCampises()
    {
        // Build the body with json_encode so quotes or backslashes in the
        // school name are escaped instead of breaking the JSON document.
        $schoolSerachData = json_encode([
            'name' => (string) $this->schoolName,
            'city' => '',
            'state' => [],
            'country' => [],
            'countryNames' => [],
            'institutionType' => [],
            'institutionProgram' => [],
            'campusLocation' => '-1',
            'onlyResidentialCampuses' => false,
            'enrollmentRange' => [],
            'sort' => 'name',
            'sortDirection' => 'asc',
            'all' => false,
            'pageNumber' => 0,
            'fromFavorites' => false,
        ], JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);

        // json_encode returns false e.g. for names that are not valid UTF-8.
        if ($schoolSerachData === false) {
            $this->error = '获取校区 error';
            return false;
        }

        $campisesData = json_decode($this->httpRequest($this->schoolSerachApi, $schoolSerachData), true);
        if (empty($campisesData['Results'][0]['Campuses'])) {
            $this->error = '获取校区 error';
            return false;
        }
        $this->campisesData = $campisesData['Results'];
        return true;
    }

    /**
     * Fetch the detail rows of a whole institution into $this->dataDetail.
     *
     * @param int|string $unitId
     * @return bool True on success; false otherwise (and $this->error is set).
     */
    private function getDetail($unitId)
    {
        return $this->fetchRows($this->totalApi . $unitId);
    }

    /**
     * Fetch the detail rows of a single campus into $this->dataDetail.
     *
     * @param int|string $unitId
     * @return bool True on success; false otherwise (and $this->error is set).
     */
    private function getDetailCampuse($unitId)
    {
        return $this->fetchRows($this->dataApi . $unitId);
    }

    /**
     * Shared implementation of getDetail() and getDetailCampuse(): request
     * $url and keep Groups[0].Screens[0].Rows of the decoded response.
     *
     * @param string $url
     * @return bool True when a non-empty row list was stored in
     *              $this->dataDetail; false otherwise (and $this->error is set).
     */
    private function fetchRows($url)
    {
        $decoded = json_decode($this->httpRequest($url), true);
        if (empty($decoded['Groups'][0]['Screens'][0]['Rows'])) {
            $this->error = '获取详情 error';
            return false;
        }
        $this->dataDetail = $decoded['Groups'][0]['Screens'][0]['Rows'];
        return true;
    }

    /**
     * Perform an HTTP request: POST when $data is non-empty, GET otherwise.
     *
     * Protected (not private) so a subclass can replace the transport,
     * e.g. with canned responses in tests.
     *
     * @param string $url
     * @param string $data JSON request body; empty for a GET request.
     * @return string Response body with tags removed by strip_tags(), or
     *                '[]' when the transfer failed.
     */
    protected function httpRequest($url, $data = '')
    {
        $isPost = (bool) $data;

        $options = [
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_ENCODING => "",
            CURLOPT_MAXREDIRS => 10,
            CURLOPT_TIMEOUT => 30,
            CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
            CURLOPT_CUSTOMREQUEST => $isPost ? 'POST' : 'GET',
            CURLOPT_HTTPHEADER => [
                "Content-Type: application/json;charset=UTF-8",
                "cache-control: no-cache",
            ],
        ];
        // Only attach a body to POST requests; a GET must not carry one.
        if ($isPost) {
            $options[CURLOPT_POSTFIELDS] = $data;
        }

        $curl = curl_init();
        if ($curl === false) {
            error_log('cURL Error #:curl_init failed');
            return json_encode([]);
        }
        curl_setopt_array($curl, $options);
        $response = curl_exec($curl);
        $err = curl_error($curl);
        curl_close($curl);

        if ($err || $response === false) {
            // Log instead of echoing: echoed text would corrupt whatever the
            // calling script prints (e.g. a JSON document).
            error_log('cURL Error #:' . $err);
            return json_encode([]);
        }

        // Tags are stripped from the raw body before the callers decode it.
        return strip_tags($response);
    }
}
// Demo run: crawl one school and print the outcome as a JSON document.
$spider = new SchoolSpider();
$report = $spider->spiderData('Harvard University');
echo json_encode($report);