php爬虫教程(四)抓取数据并进行处理

经过链接的分析、数据的分析,再加上规则的验证,

我们很容易地就 get 到了打算抓取的数据,

so,我们就可以做我们想做的事情了。例如:

<?php
// Crawler bootstrap: send the HTML content-type header, remove the execution
// time limit, and build the HTTP client (class `client`, presumably declared
// in client.php) that the rest of the script uses.
header('Content-type:text/html;charset=utf8');
set_time_limit(0);

require 'client.php';

$base_url = 'http://t.pp.cc/'; // assigned here but not read in the visible script
$client = new client;
// Crawl five pages of the feed ("msglist") requested for uin 1312024342 and
// tally, per commenter nickname, how many comments that person left.
// Result shape (consumed by the report that follows the loop):
//   $arr[nickname] = array('qq' => ..., 'num' => ..., 'sex' => ..., 'age' => ..., 'birthday' => ...)
//
// NOTE(review): the Cookie headers below embed session credentials (skey,
// lskey, p_skey, ...). They are kept verbatim because the script cannot log in
// by itself, but they should be loaded from outside the source file.
$arr = array();
$pageSize = 20;

// Both endpoints answer with JSONP. The fixed offsets drop a 10-byte prefix and
// a 2-byte suffix - presumably `_Callback(` and `);` (TODO confirm against a
// live response). Returns the decoded array, or null when the body is unusable.
$decodeJsonp = function ($body) {
    if (!is_string($body) || strlen($body) <= 12) {
        return null;
    }
    $decoded = json_decode(substr($body, 10, -2), true);
    return is_array($decoded) ? $decoded : null;
};

for ($i = 0; $i < 5; $i++) {
    echo 'page:', $i, "\n";

    // Cookie for the feed request; it is re-applied on every page because the
    // profile lookup further down replaces it with a different cookie.
    $client->setHeader('Cookie', 'pt2gguin=o0056707892; RK=MBl/Y/W2em; ptcz=3c94d72206e5c146a03701b2cd5baa2dbf898ced78a80ca14afcb1c4347815d3; pgv_pvid=9725655970; g_ut=2; 3g_guest_id=-9042816631926882304; o_cookie=56707892; pgv_pvi=1429736448; eas_sid=K1S4H5o7F6b68265o2T8t240H5; luin=o0056707892; lskey=00010000d8b324c3df16b631120077e9d27f35b7d564ebc529087b9dcbc2f7556d9126fe81efd33c2d046cfd; pgv_si=s9506151424; pgv_info=pgvReferrer=&ssid=s6703251255; ptisp=ctc; ptui_loginuin=; uin=; skey=@5ZzsPWzRc; verifysession=h01a106acab1cddfbb02999f5bd471c902ebe5ab556be3b40de657fe21ffea2f01c24e692c37c2bd63c; IED_LOG_INFO=uin*|nick*%u7B11%u7740%u770B%u4F60%u54ED%20|time*1461910804; qzone_check=56707892_1461913345; rv2=802C9F7A654B37CD767C9691A7A5A7BF7F09CAB51D6341AA0B; property20=41424F4482BCD05C0A25B282DF8B360B38C86FEB7860B26C51C256022F9C1879FF87187E60572F65; qqmusic_uin=0056707892; qqmusic_key=@5ZzsPWzRc; qqmusic_fromtag=6');
    $client->setHeader('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8');
    $client->setHeader('Accept-Language', 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3');

    // `pos` now advances with the page index; it used to be fixed at 40, so
    // every iteration fetched the same entries and inflated the counts.
    // `&notice=0` restores a parameter that had been garbled into `¬ice=0`.
    $url = 'https://h5.qzone.qq.com/proxy/domain/taotao.qq.com/cgi-bin/emotion_cgi_msglist_v6?uin=1312024342&inCharset=utf-8&outCharset=utf-8&hostUin=1312024342&notice=0&sort=0&pos=' . ($i * $pageSize) . '&num=' . $pageSize . '&cgi_host=http%3A%2F%2Ftaotao.qq.com%2Fcgi-bin%2Femotion_cgi_msglist_v6&code_version=1&format=jsonp&need_private_comment=1&g_tk=978158941';
    $res = $decodeJsonp($client->get($url));

    // Nothing usable on this page (request failed, not JSON, or no entries).
    if ($res === null || empty($res['msglist']) || !is_array($res['msglist'])) {
        continue;
    }

    foreach ($res['msglist'] as $v) {
        if (empty($v['commentlist']) || !is_array($v['commentlist'])) {
            continue;
        }
        foreach ($v['commentlist'] as $v2) {
            if (!isset($v2['name'], $v2['uin'])) {
                continue;
            }
            $name = $v2['name'];

            // Known commenter: only bump the counter, no second profile lookup.
            if (isset($arr[$name])) {
                $arr[$name]['num']++;
                continue;
            }

            // First sighting: fetch the profile with the cookie for that endpoint.
            $client->setHeader('Cookie', 'randomSeed=824410; QZ_FE_WEBP_SUPPORT=0; cpu_performance_v8=31; pt2gguin=; RK=MBl/Y/W2em; ptcz=3c94d72206e5c146a03701b2cd5baa2dbf898ced78a80ca14afcb1c4347815d3; pgv_pvid=9725655970; g_ut=2; 3g_guest_id=-9042816631926882304; o_cookie=; pgv_pvi=1429736448; eas_sid=K1S4H5o7F6b68265o2T8t240H5; luin=; lskey=00010000d8b324c3df16b631120077e9d27f35b7d564ebc529087b9dcbc2f7556d9126fe81efd33c2d046cfd; pgv_si=s9506151424; pgv_info=pgvReferrer=&ssid=s6703251255; ptisp=ctc; ptui_loginuin=675365043; uin=; skey=@5ZzsPWzRc; verifysession=h01a106acab1cddfbb02999f5bd471c902ebe5ab556be3b40de657fe21ffea2f01c24e692c37c2bd63c; IED_LOG_INFO=uin*675365043|nick*%u7B11%u7740%u770B%u4F60%u54ED%20|time*1461910804; zzpaneluin=; zzpanelkey=; p_skey=bAQZCU78gH4Qy0BSWeZ5pOsOdoKEnmVDRCdEi2HTIUY_; pt4_token=MNU3KRdqZCn9wQhASxnjt2lE*Ikt29Yf-6r8jHUPFMw_; p_uin=; qzone_check=56707892_1461913345; rv2=802C9F7A654B37CD767C9691A7A5A7BF7F09CAB51D6341AA0B; property20=41424F4482BCD05C0A25B282DF8B360B38C86FEB7860B26C51C256022F9C1879FF87187E60572F65; qqmusic_uin=; qqmusic_key=@5ZzsPWzRc; qqmusic_fromtag=6; __Q_w_s_hat_seed=1');
            // An alternative host (base.s21.qzone.qq.com, with its own rd/g_tk
            // values) was used here previously.
            $url2 = "http://base.s11.qzone.qq.com/cgi-bin/user/cgi_userinfo_get_all?uin=" . rawurlencode((string) $v2['uin']) . "&vuin=56707892&fupdate=1&rd=0.3045121533378856&g_tk=1845089435";
            $res2 = $decodeJsonp($client->get($url2));

            // A failed lookup still registers the commenter, with empty profile fields.
            $profile = ($res2 !== null && isset($res2['data']) && is_array($res2['data']))
                ? $res2['data']
                : array();

            $arr[$name] = array(
                'qq'       => $v2['uin'],
                'num'      => 1,
                'sex'      => isset($profile['sex']) ? $profile['sex'] : null,
                'age'      => isset($profile['age']) ? $profile['age'] : null,
                'birthday' => isset($profile['birthday']) ? $profile['birthday'] : null,
            );
        }
    }

    // Pause between pages to avoid hammering the endpoint.
    sleep(1);
}
  
// Report: list every commenter ordered by comment count (highest first), then
// print how many have sex == 2, how many have any other value, and the highest
// comment count seen.
if (empty($arr)) {
    die;
}

// uasort() keeps the nickname keys intact. array_multisort() re-indexes numeric
// keys, and PHP stores an all-digit nickname as an integer key, so such
// nicknames used to be printed as 0, 1, 2 ... instead of the real nickname.
// Ordering: comment count descending, ties broken by account number ascending.
uasort($arr, function ($left, $right) {
    if ($left['num'] != $right['num']) {
        return $left['num'] < $right['num'] ? 1 : -1;
    }
    if ($left['qq'] == $right['qq']) {
        return 0;
    }
    return $left['qq'] < $right['qq'] ? -1 : 1;
});

$num = $num2 = $num3 = 0;
foreach ($arr as $k3 => $v3) {
    echo "昵称:",$k3,'账号:',$v3['qq'],'访问次数:',$v3['num'],'性别',$v3['sex'],'年龄',$v3['age'],'生日',$v3['birthday'],"\n";

    // sex == 2 is what the summary line counts as "妹子"; a missing value
    // (null) falls into the "other" bucket.
    if ((int) $v3['sex'] === 2) {
        $num++;
    } else {
        $num2++;
    }
    $num3 = max($num3, $v3['num']);
}
echo  "共有妹子:$num 人,其他:$num2 人,最高访问次数:$num3";

这是我之前写过的一个脚本,用来抓取某个 QQ 好友空间里所有点过赞、评论过的用户,也就是他的 QQ 好友 :)

并且进行数据的整理和分析,找出

//共有妹子:$num 人,其他:$num2 人,最高访问次数:$num3  

一些好玩的数据

这个脚本是半自动的,需要手动写入 cookie 来保持登录状态。

想写一个全自动的来着,实在是搞不懂tx的加密规则就放弃了(破涕为笑)

总结:至此,恭喜你已经学会抓取数据了,但是人的创造力是无限的。

 

原文链接:https://blog.csdn.net/u014017080/article/details/52369752

点赞