PHP code example of yangze / spiderx

1. Go to the download page for the yangze/spiderx library and download it, choosing the download type "require".

2. Extract the ZIP file and open index.php.

3. Add this code to index.php.

/* Start to develop here. Best regards */


yangze / spiderx example snippets

if (!is_file('./vendor/autoload.php')) {
    exec("composer = [
    'name' => 'sina',
    'tasknum' => 1,
    'start' => [
    'rule' => [
            'name' => 'list',
            'type' => 'list',
            'url' => '#gdxw1/index_\d+.shtml#',
            'data' => [
                'title' => function ($pageInfo, $html, $data) {
                    preg_match_all('/<li><a href=".*?" target="_blank">(.*?)<\/a><span>/i', $html, $matches);
                    return $matches[1];

$spider = new SpiderX\SpiderX($config);
$spider->on_fetch_list = function ($pageInfo, $html, $data) {
    file_put_contents(__DIR__ . '/data.txt', implode("\n", $data['title']) . "\n", FILE_APPEND | LOCK_EX);

php index.php run

on_start = function() use($spiderx) {
	// 可以在此方法中添加用户登录,增加url队列操作

on_finish = function() {
	// 任务执行完成,可以发送通知,导入数据库,删除日志文件等

on_add_url = function($pageInfo) {
	// 如果调转当前回调,需要返回true,才会向队列中添加数据

on_retry_page = function($pageInfo) {

setGetHtml = function($pageInfo) {
	return file_get_contents($pageInfo['url']);

setGetLinks = function($html) {

on_loadding_{news,需要替换不同的name值} = function($pageInfo) {
	// pageInfo 为当前页面的相关信息

on_loaded_{news,需要替换不同的name值} = function($pageInfo, $html) {

on_fetch_{news,需要替换不同的name值} = function($pageInfo, $html, $data) {

\SpiderX\Lib\Util::subStrByStr($start, $end, $html, true);
\SpiderX\Lib\Util::subStrByPreg($start, $end, $html, true);

$crawler = new \Symfony\Component\DomCrawler\Crawler();
$attributes = $crawler
    ->extract(array('_text', 'class'));
$crawler->filterXPath('//*[@id="YKTabCon2_10"]//tr')->each(function ($node, $i){


$spider->setGetHtml = function ($pageInfo) {
    $pageInfo['cookie'] = '...';
    $pageInfo['extra'] = [
        'headers' => [
            'User-Agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
            'Referer' => '...',
    return Url::getHtml($pageInfo);

$spider->on_start = function () use ($spider) {
    $pageInfo = [
        'type' => 'index',
        'name' => 'login',
        'method' => 'post', // 发送post提交
        'url' => '',
        'query' => [
            'name' => 'admin',
            'password' => 'admin',
            'referer' => '',
        ], // 请求参数
        'cookie' => true, // 需要共享cookie
        'extra' => [
            'headers' => [
                'User-Agent' => 'testing/1.0',
                'Accept'     => 'application/json',
        ]// 添加额外参数,参考
    return true;

    'type' => 'detail', // 保持和单元的name,type一致
    'name' => 'detail',
    'url' => '',
    'method' => 'post', // 请求方式
    'query' => [ // 请求参数
        'fund_type' => $fund_type,
        'province' => 340000,
    'context' => [ // 上下文数据,可以很方便的在多任务中传数据
        'fund_type' => $fund_type,
        'province' => '-',
        'province_name' => '-',

    $dataList = (new \SpiderX\Lib\UtilXpath)->setAttr(['title'])->setHtml($html)->setRange('//table[@id="YKTabCon2_10"]')->getResult();
    foreach($dataList as $data) {
        array_walk($data, function (&$item) {
            $item = str_ireplace(',', ',', $item);
            $item = trim($item);
        file_put_contents('data.csv', implode(',', $data) . "\n", FILE_APPEND | LOCK_EX);