PHP code example of yangze / spiderx

1. Go to the download page and download the yangze/spiderx library, choosing the download type "require".

2. Extract the ZIP file and open index.php.

3. Add this code to the index.php.

    
        
<?php
// Pull in Composer's autoloader (once) before any library class is used.
require_once 'vendor/autoload.php';

/* Start to develop here. Best regards https://php-download.com/ */

yangze / spiderx example snippets


// Quick-start script: crawl the Sina rolling-news list pages and append every
// headline found to data.txt next to this script.
//
// NOTE(review): the published snippet was truncated between `exec("composer`
// and `= [`. The install command, the autoloader include and the `$config`
// assignment below are reconstructed from the package name — confirm against
// the upstream README.

// Install the library on first run, when Composer's autoloader is missing.
if (!is_file('./vendor/autoload.php')) {
    exec('composer require yangze/spiderx');
}

require './vendor/autoload.php';

$config = [
    'name' => 'sina',
    'tasknum' => 1,
    // Entry URL(s) handed to the spider.
    'start' => [
        'http://roll.news.sina.com.cn/news/gnxw/gdxw1/index.shtml',
    ],
    'rule' => [
        [
            // The rule name selects the callback suffix: 'list' -> on_fetch_list.
            'name' => 'list',
            'type' => 'list',
            'url' => '#gdxw1/index_\d+.shtml#',
            'data' => [
                // Collect the link text of every <li><a ...>…</a><span> entry.
                'title' => function ($pageInfo, $html, $data) {
                    preg_match_all('/<li><a href=".*?" target="_blank">(.*?)<\/a><span>/i', $html, $matches);
                    return $matches[1];
                },
            ],
        ],
    ],
];

$spider = new SpiderX\SpiderX($config);
// Append the extracted titles, one per line, holding an exclusive lock while writing.
$spider->on_fetch_list = function ($pageInfo, $html, $data) {
    file_put_contents(__DIR__ . '/data.txt', implode("\n", $data['title']) . "\n", FILE_APPEND | LOCK_EX);
};
$spider->start();


php index.php run


// Callback template (pseudo-code): assign a closure of this shape to the
// spider's on_start property.
on_start = function() use($spiderx) {
	// User login and additions to the URL queue can be performed in this callback.
    //$spiderx->addUrl([]);
}


// Callback template (pseudo-code) for on_finish; takes no arguments.
on_finish = function() {
	// The task has finished: send notifications, import data into a database, delete log files, etc.
}


// Callback template (pseudo-code) for on_add_url; receives the page info of the URL being added.
on_add_url = function($pageInfo) {
	// When this callback is set, it must return true for the data to be added to the queue.
}


// Callback template (pseudo-code) for on_retry_page; receives the page info of the page in question.
on_retry_page = function($pageInfo) {
	// Return true to indicate that the page should be retried.
}


// Template (pseudo-code) for a custom HTML fetcher: this example returns
// whatever file_get_contents() reads from the page's URL.
setGetHtml = function($pageInfo) {
	return file_get_contents($pageInfo['url']);
}


// Template (pseudo-code) for a custom hook that receives the page HTML.
// The body is left empty in this example.
// NOTE(review): presumably expected to return the links found in $html — confirm.
setGetLinks = function($html) {
	
}


// Callback template (pseudo-code). The braces are a placeholder, not PHP:
// "news" is an example and the Chinese text reads "replace with the
// appropriate name value", i.e. substitute the name for your own case.
on_loadding_{news,需要替换不同的name值} = function($pageInfo) {
	// $pageInfo holds the information about the current page.
	// Return true to indicate that this page should be requested.
}


// Callback template (pseudo-code). The braces are a placeholder, not PHP:
// "news" is an example and the Chinese text reads "replace with the
// appropriate name value".
on_loaded_{news,需要替换不同的name值} = function($pageInfo, $html) {
	// $html is the HTML data of the current page.
}


// Callback template (pseudo-code). The braces are a placeholder, not PHP:
// "news" is an example and the Chinese text reads "replace with the
// appropriate name value".
on_fetch_{news,需要替换不同的name值} = function($pageInfo, $html, $data) {
	// $data holds the parsed data.
}


// String mode
\SpiderX\Lib\Util::subStrByStr($start, $end, $html, true);
// Regular-expression mode
\SpiderX\Lib\Util::subStrByPreg($start, $end, $html, true);


// Examples of querying fetched HTML with Symfony's DomCrawler via XPath.
$crawler = new \Symfony\Component\DomCrawler\Crawler();
$crawler->addHtmlContent($html);

// Select all <h3> nodes (returns a new Crawler holding the matches).
$crawler->filterXPath('//h3');
// Read the text, tag name, or an attribute of the first matched node.
$crawler->filterXPath('//h3')->text();
$crawler->filterXPath('//h3')->nodeName();
$crawler->filterXPath('//h3')->attr('class');

// Pull several values per node in one pass: '_text' is the node text,
// 'class' is the attribute of that name.
$attributes = $crawler
    ->filterXPath('//body/p')
    ->extract(['_text', 'class']);

// Visit every table row under the element with id "YKTabCon2_10".
$crawler->filterXPath('//*[@id="YKTabCon2_10"]//tr')->each(function ($node, $i) {
    //
});


// Placeholder only: the call is shown without arguments.
// NOTE(review): presumably meant to say that plain preg_match() can also be
// used for extraction — supply a pattern and subject in real code.
preg_match();


// Custom fetcher: attach a cookie string and extra request headers to the
// page info, then hand it to Url::getHtml() and return its result.
$spider->setGetHtml = function ($pageInfo) {
    $requestHeaders = [
        'User-Agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
        'Referer' => '...',
    ];

    $pageInfo['cookie'] = '...';
    $pageInfo['extra'] = ['headers' => $requestHeaders];

    return Url::getHtml($pageInfo);
};


// Start hook: queue a POST login request before the crawl proceeds, then
// return true.
$spider->on_start = function () use ($spider) {
    // Request parameters submitted with the login form.
    $loginQuery = [
        'name' => 'admin',
        'password' => 'admin',
        'referer' => 'http://127.0.0.1:3200/admin/',
    ];

    // Additional request parameters (extra headers).
    $extraOptions = [
        'headers' => [
            'User-Agent' => 'testing/1.0',
            'Accept'     => 'application/json',
        ],
    ];

    $spider->addUrl([
        'type' => 'index',
        'name' => 'login',
        // Submit the request as a POST.
        'method' => 'post',
        'url' => 'http://127.0.0.1:3200/index.php/action/login?_=b0fd8734e0687a6cfe352e3f0fcbc5f6',
        'query' => $loginQuery,
        // Cookies need to be shared.
        'cookie' => true,
        'extra' => $extraOptions,
    ]);

    return true;
};


// Queue a POST request for a detail page, carrying context data along with it.
$spider->addUrl([
    'type' => 'detail', // keep consistent with the name and type of the corresponding rule unit
    'name' => 'detail',
    'url' => 'http://smeimdf.mofcom.gov.cn/news/searchEntpAudit.jsp',
    'method' => 'post', // request method
    'query' => [ // request parameters
        'fund_type' => $fund_type,
        'province' => 340000,
    ],
    'context' => [ // context data; a convenient way to pass data between tasks
        'fund_type' => $fund_type,
        'province' => '-',
        'province_name' => '-',
    ]
]);


    // Extract the rows of the table with id "YKTabCon2_10" (requesting the
    // 'title' attribute) and append each row to data.csv as one line.
    $xpathUtil = new \SpiderX\Lib\UtilXpath;
    $dataList = $xpathUtil
        ->setAttr(['title'])
        ->setHtml($html)
        ->setRange('//table[@id="YKTabCon2_10"]')
        ->getResult();

    foreach ($dataList as $data) {
        // Replace ASCII commas with full-width ones so they cannot be mistaken
        // for the field separator, then strip surrounding whitespace.
        foreach ($data as &$item) {
            $item = trim(str_ireplace(',', '，', $item));
        }
        unset($item); // drop the dangling reference left by the by-reference loop

        $line = implode(',', $data) . "\n";
        file_put_contents('data.csv', $line, FILE_APPEND | LOCK_EX);
    }