PHP code example of yangze / spiderx
1. Go to this page and download the library: Download yangze/spiderx library. Choose the download type "require".
2. Extract the ZIP file and open the index.php.
3. Add this code to the index.php.
<?php
require_once('vendor/autoload.php');
/* Start to develop here. Best regards https://php-download.com/ */
yangze / spiderx example snippets
// Bootstrap: when no Composer autoloader exists yet, install the library first,
// then load the autoloader so the SpiderX classes can be resolved.
if (!is_file('./vendor/autoload.php')) {
    exec("composer require yangze/spiderx");
}
require_once './vendor/autoload.php';

// Crawl configuration; it is passed to the SpiderX constructor.
$config = [
    'name' => 'sina',
    'tasknum' => 1,
    // Entry URL(s) the crawl starts from.
    'start' => [
        'http://roll.news.sina.com.cn/news/gnxw/gdxw1/index.shtml',
    ],
    'rule' => [
        [
            'name' => 'list',
            'type' => 'list',
            // NOTE(review): looks like a '#'-delimited PCRE pattern matched against URLs — confirm.
            'url' => '#gdxw1/index_\d+.shtml#',
            'data' => [
                // Returns the anchor text of every <li><a ... target="_blank"> entry found in $html.
                'title' => function ($pageInfo, $html, $data) {
                    preg_match_all('/<li><a href=".*?" target="_blank">(.*?)<\/a><span>/i', $html, $matches);
                    return $matches[1];
                }
            ]
        ],
    ]
];
// Build the crawler from $config.
$spider = new SpiderX\SpiderX($config);

// on_fetch_list callback: append the entries of $data['title'] to data.txt
// (next to this script), one per line, under an exclusive lock.
$spider->on_fetch_list = function ($pageInfo, $html, $data) {
    $targetFile = __DIR__ . '/data.txt';
    $payload = implode("\n", $data['title']) . "\n";
    file_put_contents($targetFile, $payload, FILE_APPEND | LOCK_EX);
};

// Kick off the crawl.
$spider->start();
php index.php run
// Reference listing of the callbacks that can be assigned on the spider object.
// This is schematic pseudo-code showing each callback's name and parameters;
// it is not meant to be executed as-is.
on_start = function() use($spiderx) {
// User login or extra URL-queue operations can be added in this callback
//$spiderx->addUrl([]);
}
on_finish = function() {
// The task has finished: send notifications, import into a database, delete log files, etc.
}
on_add_url = function($pageInfo) {
// When this callback is used it must return true, otherwise nothing is added to the queue
}
on_retry_page = function($pageInfo) {
// Return true to indicate the page should be retried
}
// Example replacement fetcher: returns the body of $pageInfo['url'] via file_get_contents().
setGetHtml = function($pageInfo) {
return file_get_contents($pageInfo['url']);
}
// NOTE(review): presumably receives the page HTML and returns the links to follow — confirm.
setGetLinks = function($html) {
}
// In the three callbacks below, the {news,...} placeholder must be replaced with the
// corresponding `name` value (here `news` is the example name).
on_loadding_{news,需要替换不同的name值} = function($pageInfo) {
// pageInfo holds the information about the current page
// Return true to indicate this page should be requested
}
on_loaded_{news,需要替换不同的name值} = function($pageInfo, $html) {
// html is the HTML data of the current page
}
on_fetch_{news,需要替换不同的name值} = function($pageInfo, $html, $data) {
// data holds the parsed data
}
// --- Substring helpers shipped with SpiderX ---
// String mode.
// NOTE(review): presumably extracts the part of $html between $start and $end — confirm.
\SpiderX\Lib\Util::subStrByStr($start, $end, $html, true);
// Regex mode.
\SpiderX\Lib\Util::subStrByPreg($start, $end, $html, true);

// --- XPath extraction with Symfony DomCrawler ---
$crawler = new \Symfony\Component\DomCrawler\Crawler();
$crawler->addHtmlContent($html);

// Select nodes, then read their text, node name or an attribute.
$crawler->filterXPath('//h3');
$crawler->filterXPath('//h3')->text();
$crawler->filterXPath('//h3')->nodeName();
$crawler->filterXPath('//h3')->attr('class');

// Pull several values per matched node in one call ('_text' is the node text).
$attributes = $crawler
    ->filterXPath('//body/p')
    ->extract(['_text', 'class']);

// Visit each matched row; $i is the position of $node in the result set.
$crawler->filterXPath('//*[@id="YKTabCon2_10"]//tr')->each(function ($node, $i) {
    //
});

// --- Plain regular expressions ---
// Placeholder only: preg_match() needs at least a pattern and a subject string.
preg_match();
// Custom page fetcher: sets a cookie and extra request headers on $pageInfo
// and then delegates the download to Url::getHtml().
// NOTE(review): `Url` is unqualified here; presumably a SpiderX helper class that
// has to be imported with a `use` statement — confirm.
$spider->setGetHtml = function ($pageInfo) {
    $requestHeaders = [
        'User-Agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
        'Referer' => '...',
    ];

    $pageInfo['cookie'] = '...';
    $pageInfo['extra'] = ['headers' => $requestHeaders];

    return Url::getHtml($pageInfo);
};
// on_start callback: queue a POST login request before the crawl begins,
// then return true.
$spider->on_start = function () use ($spider) {
    // Form fields submitted with the login request.
    $loginFields = [
        'name' => 'admin',
        'password' => 'admin',
        'referer' => 'http://127.0.0.1:3200/admin/',
    ];

    // Additional request options (custom headers).
    $requestExtra = [
        'headers' => [
            'User-Agent' => 'testing/1.0',
            'Accept' => 'application/json',
        ],
    ];

    $spider->addUrl([
        'type' => 'index',
        'name' => 'login',
        'method' => 'post', // submit as a POST request
        'url' => 'http://127.0.0.1:3200/index.php/action/login?_=b0fd8734e0687a6cfe352e3f0fcbc5f6',
        'query' => $loginFields, // request parameters
        'cookie' => true, // cookies need to be shared
        'extra' => $requestExtra,
    ]);

    return true;
};
// Queue a POST request for the "detail" rule.
// Request parameters sent with the POST.
$detailQuery = [
    'fund_type' => $fund_type,
    'province' => 340000,
];

// Context data: a convenient way to pass values along between tasks.
$detailContext = [
    'fund_type' => $fund_type,
    'province' => '-',
    'province_name' => '-',
];

$spider->addUrl([
    'type' => 'detail', // keep consistent with the rule's name and type
    'name' => 'detail',
    'url' => 'http://smeimdf.mofcom.gov.cn/news/searchEntpAudit.jsp',
    'method' => 'post', // request method
    'query' => $detailQuery,
    'context' => $detailContext,
]);
// Extract the rows of the table with id "YKTabCon2_10" from $html via the SpiderX XPath helper.
// NOTE(review): setAttr(['title']) presumably selects which attribute is read from each cell — confirm.
$dataList = (new \SpiderX\Lib\UtilXpath)
    ->setAttr(['title'])
    ->setHtml($html)
    ->setRange('//table[@id="YKTabCon2_10"]')
    ->getResult();

// Append each row to data.csv as one comma-separated line.
foreach ($dataList as $data) {
    $data = array_map(
        static function ($item) {
            // Rows are joined with ',' below, so an ASCII comma inside a field would
            // split it into two columns. Swap it for the full-width comma instead
            // (the previous call replaced ',' with ',' and therefore did nothing).
            return trim(str_replace(',', ',', $item));
        },
        $data
    );
    file_put_contents('data.csv', implode(',', $data) . "\n", FILE_APPEND | LOCK_EX);
}