原文鏈接地址:「docker實戰篇」python的docker爬蟲技術-python腳本app抓取(13)
上次已經分析出來具體的app的請求連接了,本次主要說說python的開發,抓取APP里面的信息。源碼:https://github.com/limingios/dockerpython.git
分析app數據包
查看分析
解析出來的header
夜神配置
python代碼,爬取分類
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/1/9 11:06
# @Author : lm
# @Url : idig8.com
# @Site :
# @File : spider_douguomeishi.py
# @Software: PyCharm
import requests
def handle_request(url, data, timeout=10):
    """POST a form-encoded request to the Douguo API with the captured app headers.

    The header set was captured with Fiddler and is fairly long because each
    vendor sends different fields. Some of them are probably optional; the only
    way to find out is to comment fields out and retry.

    Args:
        url: Endpoint to post to.
        data: Mapping of form fields sent as the request body.
        timeout: Seconds to wait for the server; without a timeout a stalled
            connection would block the caller forever.

    Returns:
        The response object returned by requests.post.
    """
    # Every value is a string: requests only accepts str/bytes header values.
    header = {
        "client": "4",
        "version": "6916.2",
        "device": "SM-G955N",
        "sdk": "22,5.1.1",
        "imei": "354730010002552",
        "channel": "zhuzhan",
        "mac": "00:FF:E2:A2:7B:58",
        "resolution": "1440*900",
        "dpi": "2.0",
        "android-id": "bcdaf527105cc26f",
        "pseudo-id": "354730010002552",
        "brand": "samsung",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "Android",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
        "lon": "105.566938",
        "lat": "29.99831",
        "cid": "512000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        "Host": "api.douguo.net",
        # The captured imsi, Cookie and Content-Length headers are left out on
        # purpose (requests computes Content-Length itself).
    }
    response = requests.post(url=url, headers=header, data=data, timeout=timeout)
    return response
def handle_index():
    """Request the recipe category catalog and print the raw response text."""
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # Form body captured from the app:
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0",
    }
    response = handle_request(url, data)
    print(response.text)
# Script entry: request the category catalog and print it.
handle_index()
爬取詳情,信息通過分類找到里面的詳情
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/1/9 11:06
# @Author : lm
# @Url : idig8.com
# @Site :
# @File : spider_douguomeishi.py
# @Software: PyCharm
import json
import requests
from multiprocessing import Queue
# Create the queue that handle_index fills with search payloads.
queue_list = Queue()
def handle_request(url, data, timeout=10):
    """POST a form-encoded request to the Douguo API with the captured app headers.

    The header set was captured with Fiddler; some fields are probably
    optional and can only be identified by commenting them out and retrying.

    Args:
        url: Endpoint to post to.
        data: Mapping of form fields sent as the request body.
        timeout: Seconds to wait for the server; without a timeout a stalled
            connection would block the caller forever.

    Returns:
        The response object returned by requests.post.
    """
    # Every value is a string: requests only accepts str/bytes header values.
    header = {
        "client": "4",
        "version": "6916.2",
        "device": "SM-G955N",
        "sdk": "22,5.1.1",
        "imei": "354730010002552",
        "channel": "zhuzhan",
        "mac": "00:FF:E2:A2:7B:58",
        "resolution": "1440*900",
        "dpi": "2.0",
        "android-id": "bcdaf527105cc26f",
        "pseudo-id": "354730010002552",
        "brand": "samsung",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "Android",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
        "lon": "105.566938",
        "lat": "29.99831",
        "cid": "512000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        "Host": "api.douguo.net",
        # The captured imsi, Cookie and Content-Length headers are left out on
        # purpose (requests computes Content-Length itself).
    }
    response = requests.post(url=url, headers=header, data=data, timeout=timeout)
    return response
def handle_index():
    """Fetch the category catalog and queue one search payload per leaf category.

    Each payload is a dict of form fields whose "keyword" is the category
    name; the payloads are put on the module-level queue_list.
    """
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # Form body captured from the app:
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0",
    }
    response = handle_request(url, data)
    index_response_dic = json.loads(response.text)
    # The catalog nests three levels deep, each level under the "cs" key.
    for item_index in index_response_dic["result"]["cs"]:
        for item_index_cs in item_index["cs"]:
            for item in item_index_cs["cs"]:
                data_2 = {
                    "client": "4",
                    "_session": "1547000257341354730010002552",
                    "keyword": item["name"],
                    "_vs": "400",
                }
                queue_list.put(data_2)
# Script entry: fill the queue, then print how many search payloads it holds.
handle_index()
print(queue_list.qsize())
分類菜譜內部的詳情信息
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/1/9 11:06
# @Author : lm
# @Url : idig8.com
# @Site :
# @File : spider_douguomeishi.py
# @Software: PyCharm
import json
import requests
from multiprocessing import Queue
# Create the queue that handle_index fills with search payloads.
queue_list = Queue()
def handle_request(url, data, timeout=10):
    """POST a form-encoded request to the Douguo API with the captured app headers.

    The header set was captured with Fiddler; some fields are probably
    optional and can only be identified by commenting them out and retrying.

    Args:
        url: Endpoint to post to.
        data: Mapping of form fields sent as the request body.
        timeout: Seconds to wait for the server; without a timeout a stalled
            connection would block the caller forever.

    Returns:
        The response object returned by requests.post.
    """
    # Every value is a string: requests only accepts str/bytes header values.
    header = {
        "client": "4",
        "version": "6916.2",
        "device": "SM-G955N",
        "sdk": "22,5.1.1",
        "imei": "354730010002552",
        "channel": "zhuzhan",
        "mac": "00:FF:E2:A2:7B:58",
        "resolution": "1440*900",
        "dpi": "2.0",
        "android-id": "bcdaf527105cc26f",
        "pseudo-id": "354730010002552",
        "brand": "samsung",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "Android",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
        "lon": "105.566938",
        "lat": "29.99831",
        "cid": "512000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        "Host": "api.douguo.net",
        # The captured imsi, Cookie and Content-Length headers are left out on
        # purpose (requests computes Content-Length itself).
    }
    response = requests.post(url=url, headers=header, data=data, timeout=timeout)
    return response
def handle_index():
    """Fetch the category catalog and queue one search payload per leaf category.

    Each payload is a dict of form fields whose "keyword" is the category
    name; the payloads are put on the module-level queue_list.
    """
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # Form body captured from the app:
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0",
    }
    response = handle_request(url, data)
    index_response_dic = json.loads(response.text)
    # The catalog nests three levels deep, each level under the "cs" key.
    for item_index in index_response_dic["result"]["cs"]:
        for item_index_cs in item_index["cs"]:
            for item in item_index_cs["cs"]:
                # No _session field is sent with the search payload.
                data_2 = {
                    "client": "4",
                    "keyword": item["name"],
                    "_vs": "400",
                    "order": "0",
                }
                queue_list.put(data_2)
def handle_caipu_list(data):
    """Search recipes for one ingredient keyword and print a summary of each.

    Args:
        data: Search payload produced by handle_index; data["keyword"] is the
            ingredient name being searched.
    """
    print("當前的食材:", data["keyword"])
    caipu_list_url = "http://api.douguo.net/recipe/s/0/20"
    caipu_response = handle_request(caipu_list_url, data)
    caipu_response_dict = json.loads(caipu_response.text)
    for caipu_item in caipu_response_dict["result"]["list"]:
        # Only list entries with type 13 are processed; the rest are skipped.
        if caipu_item["type"] != 13:
            continue
        recipe = caipu_item["r"]
        caipu_info = {
            "shicai": data["keyword"],
            "user_name": recipe["an"],
            "shicai_id": recipe["id"],
            # Strip newlines and spaces so the description is one compact string.
            "describe": recipe["cookstory"].replace("\n", "").replace(" ", ""),
            "caipu_name": recipe["n"],
            "zuoliao_list": recipe["major"],
        }
        print(caipu_info)
# Script entry: fill the queue, then process a single queued payload
# (queue_list.get() is called once).
handle_index()
handle_caipu_list(queue_list.get())
菜品內部的詳情信息
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/1/9 11:06
# @Author : lm
# @Url : idig8.com
# @Site :
# @File : spider_douguomeishi.py
# @Software: PyCharm
import json
import requests
from multiprocessing import Queue
# Create the queue that handle_index fills with search payloads.
queue_list = Queue()
def handle_request(url, data, timeout=10):
    """POST a form-encoded request to the Douguo API with the captured app headers.

    The header set was captured with Fiddler; some fields are probably
    optional and can only be identified by commenting them out and retrying.

    Args:
        url: Endpoint to post to.
        data: Mapping of form fields sent as the request body.
        timeout: Seconds to wait for the server; without a timeout a stalled
            connection would block the caller forever.

    Returns:
        The response object returned by requests.post.
    """
    # Every value is a string: requests only accepts str/bytes header values.
    header = {
        "client": "4",
        "version": "6916.2",
        "device": "SM-G955N",
        "sdk": "22,5.1.1",
        "imei": "354730010002552",
        "channel": "zhuzhan",
        "mac": "00:FF:E2:A2:7B:58",
        "resolution": "1440*900",
        "dpi": "2.0",
        "android-id": "bcdaf527105cc26f",
        "pseudo-id": "354730010002552",
        "brand": "samsung",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "Android",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
        "lon": "105.566938",
        "lat": "29.99831",
        "cid": "512000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        "Host": "api.douguo.net",
        # The captured imsi, Cookie and Content-Length headers are left out on
        # purpose (requests computes Content-Length itself).
    }
    response = requests.post(url=url, headers=header, data=data, timeout=timeout)
    return response
def handle_index():
    """Fetch the category catalog and queue one search payload per leaf category.

    Each payload is a dict of form fields whose "keyword" is the category
    name; the payloads are put on the module-level queue_list.
    """
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # Form body captured from the app:
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0",
    }
    response = handle_request(url, data)
    index_response_dic = json.loads(response.text)
    # The catalog nests three levels deep, each level under the "cs" key.
    for item_index in index_response_dic["result"]["cs"]:
        for item_index_cs in item_index["cs"]:
            for item in item_index_cs["cs"]:
                # No _session field is sent with the search payload.
                data_2 = {
                    "client": "4",
                    "keyword": item["name"],
                    "_vs": "400",
                    "order": "0",
                }
                queue_list.put(data_2)
def handle_caipu_list(data):
    """Search recipes for one ingredient keyword, fetch each detail, and print it.

    For every type-13 entry in the search result a second request is made to
    the detail endpoint; the merged record is printed as JSON.

    Args:
        data: Search payload produced by handle_index; data["keyword"] is the
            ingredient name being searched.
    """
    print("當前的食材:", data["keyword"])
    caipu_list_url = "http://api.douguo.net/recipe/s/0/20"
    caipu_response = handle_request(caipu_list_url, data)
    caipu_response_dict = json.loads(caipu_response.text)
    for caipu_item in caipu_response_dict["result"]["list"]:
        # Only list entries with type 13 are processed; the rest are skipped.
        if caipu_item["type"] != 13:
            continue
        recipe = caipu_item["r"]
        caipu_info = {
            "shicai": data["keyword"],
            "user_name": recipe["an"],
            "shicai_id": recipe["id"],
            # Strip newlines and spaces so the description is one compact string.
            "describe": recipe["cookstory"].replace("\n", "").replace(" ", ""),
            "caipu_name": recipe["n"],
            "zuoliao_list": recipe["major"],
        }
        detail_url = "http://api.douguo.net/recipe/detail/" + str(caipu_info["shicai_id"])
        # json.dumps escapes the keyword correctly; hand-built concatenation
        # produced invalid JSON whenever the keyword contained a quote.
        # NOTE(review): src/idx/type are assumed to be JSON strings - confirm
        # against the Fiddler capture.
        ext = json.dumps(
            {
                "query": {
                    "kw": data["keyword"],
                    "src": "2803",
                    "idx": "1",
                    "type": "13",
                    "id": caipu_info["shicai_id"],
                }
            },
            ensure_ascii=False,
        )
        detail_data = {
            "client": "4",
            "_session": "1547000257341354730010002552",
            "author_id": "0",
            "_vs": "2803",
            "ext": ext,
        }
        detail_response = handle_request(detail_url, detail_data)
        detail_recipe = json.loads(detail_response.text)["result"]["recipe"]
        caipu_info["tips"] = detail_recipe["tips"]
        caipu_info["cookstep"] = detail_recipe["cookstep"]
        print(json.dumps(caipu_info))
# Script entry: fill the queue, then process a single queued payload
# (queue_list.get() is called once).
handle_index()
handle_caipu_list(queue_list.get())
將數據保存在MongoDB中
通過vagrant 安裝虛擬機
vagrant up
進入虛擬機
ip 192.168.66.100
su -
#密碼:vagrant
docker
拉取mongodb的鏡像
https://hub.docker.com/r/bitnami/mongodb
默認端口:27017
docker pull bitnami/mongodb:latest
創建mongodb的容器
mkdir bitnami
cd bitnami
mkdir mongodb
docker run -d -v /path/to/mongodb-persistence:/root/bitnami -p 27017:27017 bitnami/mongodb:latest
#關閉防火墻
systemctl stop firewalld
>用第三方工具連接

>連接mongodb的工具
``` python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/1/11 0:53
# @Author : liming
# @Site :
# @File : handle_mongo.py
# @url : idig8.com
# @Software: PyCharm
import pymongo
from pymongo.collection import Collection
class Connect_mongo(object):
    """Thin wrapper around a MongoDB client for storing scraped recipe records."""

    def __init__(self):
        """Create the client for the MongoDB host and select the database."""
        self.client = pymongo.MongoClient(host="192.168.66.100", port=27017)
        self.db_data = self.client["dou_guo_mei_shi"]

    def insert_item(self, item):
        """Insert one record into the dou_guo_mei_shi_item collection.

        Args:
            item: Dict describing one recipe.
        """
        db_collection = Collection(self.db_data, "dou_guo_mei_shi_item")
        # Collection.insert is deprecated since PyMongo 3.0 and removed in 4.0;
        # insert_one is the supported call for a single document.
        db_collection.insert_one(item)
# Module-level instance exposed for other scripts to import.
mongo_info = Connect_mongo()
```
python爬取的數據通過mongo的工具保存到centos7的docker鏡像中
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/1/9 11:06
# @Author : lm
# @Url : idig8.com
# @Site :
# @File : spider_douguomeishi.py
# @Software: PyCharm
import json
import requests
from multiprocessing import Queue
from handle_mongo import mongo_info
# Create the queue that handle_index fills with search payloads.
queue_list = Queue()
def handle_request(url, data, timeout=10):
    """POST a form-encoded request to the Douguo API with the captured app headers.

    The header set was captured with Fiddler; some fields are probably
    optional and can only be identified by commenting them out and retrying.

    Args:
        url: Endpoint to post to.
        data: Mapping of form fields sent as the request body.
        timeout: Seconds to wait for the server; without a timeout a stalled
            connection would block the caller forever.

    Returns:
        The response object returned by requests.post.
    """
    # Every value is a string: requests only accepts str/bytes header values.
    header = {
        "client": "4",
        "version": "6916.2",
        "device": "SM-G955N",
        "sdk": "22,5.1.1",
        "imei": "354730010002552",
        "channel": "zhuzhan",
        "mac": "00:FF:E2:A2:7B:58",
        "resolution": "1440*900",
        "dpi": "2.0",
        "android-id": "bcdaf527105cc26f",
        "pseudo-id": "354730010002552",
        "brand": "samsung",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "Android",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
        "lon": "105.566938",
        "lat": "29.99831",
        "cid": "512000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        "Host": "api.douguo.net",
        # The captured imsi, Cookie and Content-Length headers are left out on
        # purpose (requests computes Content-Length itself).
    }
    response = requests.post(url=url, headers=header, data=data, timeout=timeout)
    return response
def handle_index():
    """Fetch the category catalog and queue one search payload per leaf category.

    Each payload is a dict of form fields whose "keyword" is the category
    name; the payloads are put on the module-level queue_list.
    """
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # Form body captured from the app:
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0",
    }
    response = handle_request(url, data)
    index_response_dic = json.loads(response.text)
    # The catalog nests three levels deep, each level under the "cs" key.
    for item_index in index_response_dic["result"]["cs"]:
        for item_index_cs in item_index["cs"]:
            for item in item_index_cs["cs"]:
                # No _session field is sent with the search payload.
                data_2 = {
                    "client": "4",
                    "keyword": item["name"],
                    "_vs": "400",
                    "order": "0",
                }
                queue_list.put(data_2)
def handle_caipu_list(data):
    """Search recipes for one ingredient keyword, fetch each detail, and store it.

    For every type-13 entry in the search result a second request is made to
    the detail endpoint; the merged record is saved through mongo_info.

    Args:
        data: Search payload produced by handle_index; data["keyword"] is the
            ingredient name being searched.
    """
    print("當前的食材:", data["keyword"])
    caipu_list_url = "http://api.douguo.net/recipe/s/0/20"
    caipu_response = handle_request(caipu_list_url, data)
    caipu_response_dict = json.loads(caipu_response.text)
    for caipu_item in caipu_response_dict["result"]["list"]:
        # Only list entries with type 13 are processed; the rest are skipped.
        if caipu_item["type"] != 13:
            continue
        recipe = caipu_item["r"]
        caipu_info = {
            "shicai": data["keyword"],
            "user_name": recipe["an"],
            "shicai_id": recipe["id"],
            # Strip newlines and spaces so the description is one compact string.
            "describe": recipe["cookstory"].replace("\n", "").replace(" ", ""),
            "caipu_name": recipe["n"],
            "zuoliao_list": recipe["major"],
        }
        detail_url = "http://api.douguo.net/recipe/detail/" + str(caipu_info["shicai_id"])
        # json.dumps escapes the keyword correctly; hand-built concatenation
        # produced invalid JSON whenever the keyword contained a quote.
        # NOTE(review): src/idx/type are assumed to be JSON strings - confirm
        # against the Fiddler capture.
        ext = json.dumps(
            {
                "query": {
                    "kw": data["keyword"],
                    "src": "2803",
                    "idx": "1",
                    "type": "13",
                    "id": caipu_info["shicai_id"],
                }
            },
            ensure_ascii=False,
        )
        detail_data = {
            "client": "4",
            "_session": "1547000257341354730010002552",
            "author_id": "0",
            "_vs": "2803",
            "ext": ext,
        }
        detail_response = handle_request(detail_url, detail_data)
        detail_recipe = json.loads(detail_response.text)["result"]["recipe"]
        caipu_info["tips"] = detail_recipe["tips"]
        caipu_info["cookstep"] = detail_recipe["cookstep"]
        mongo_info.insert_item(caipu_info)
# Script entry: fill the queue, then process a single queued payload
# (queue_list.get() is called once).
handle_index()
handle_caipu_list(queue_list.get())
通過python多線程-線程池抓取
python3通過 from concurrent.futures import ThreadPoolExecutor
引用線程池
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/1/9 11:06
# @Author : lm
# @Url : idig8.com
# @Site :
# @File : spider_douguomeishi.py
# @Software: PyCharm
import json
import requests
from multiprocessing import Queue
from handle_mongo import mongo_info
from concurrent.futures import ThreadPoolExecutor
# Create the queue that handle_index fills with search payloads.
queue_list = Queue()
def handle_request(url, data, timeout=10):
    """POST a form-encoded request to the Douguo API with the captured app headers.

    The header set was captured with Fiddler; some fields are probably
    optional and can only be identified by commenting them out and retrying.

    Args:
        url: Endpoint to post to.
        data: Mapping of form fields sent as the request body.
        timeout: Seconds to wait for the server; without a timeout a stalled
            connection would block the calling worker forever.

    Returns:
        The response object returned by requests.post.
    """
    # Every value is a string: requests only accepts str/bytes header values.
    header = {
        "client": "4",
        "version": "6916.2",
        "device": "SM-G955N",
        "sdk": "22,5.1.1",
        "imei": "354730010002552",
        "channel": "zhuzhan",
        "mac": "00:FF:E2:A2:7B:58",
        "resolution": "1440*900",
        "dpi": "2.0",
        "android-id": "bcdaf527105cc26f",
        "pseudo-id": "354730010002552",
        "brand": "samsung",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "Android",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
        "lon": "105.566938",
        "lat": "29.99831",
        "cid": "512000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        "Host": "api.douguo.net",
        # The captured imsi, Cookie and Content-Length headers are left out on
        # purpose (requests computes Content-Length itself).
    }
    response = requests.post(url=url, headers=header, data=data, timeout=timeout)
    return response
def handle_index():
    """Fetch the category catalog and queue one search payload per leaf category.

    Each payload is a dict of form fields whose "keyword" is the category
    name; the payloads are put on the module-level queue_list.
    """
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # Form body captured from the app:
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0",
    }
    response = handle_request(url, data)
    index_response_dic = json.loads(response.text)
    # The catalog nests three levels deep, each level under the "cs" key.
    for item_index in index_response_dic["result"]["cs"]:
        for item_index_cs in item_index["cs"]:
            for item in item_index_cs["cs"]:
                # No _session field is sent with the search payload.
                data_2 = {
                    "client": "4",
                    "keyword": item["name"],
                    "_vs": "400",
                    "order": "0",
                }
                queue_list.put(data_2)
def handle_caipu_list(data):
    """Search recipes for one ingredient keyword, fetch each detail, and store it.

    For every type-13 entry in the search result a second request is made to
    the detail endpoint; the merged record is saved through mongo_info.

    Args:
        data: Search payload produced by handle_index; data["keyword"] is the
            ingredient name being searched.
    """
    print("當前的食材:", data["keyword"])
    caipu_list_url = "http://api.douguo.net/recipe/s/0/20"
    caipu_response = handle_request(caipu_list_url, data)
    caipu_response_dict = json.loads(caipu_response.text)
    for caipu_item in caipu_response_dict["result"]["list"]:
        # Only list entries with type 13 are processed; the rest are skipped.
        if caipu_item["type"] != 13:
            continue
        recipe = caipu_item["r"]
        caipu_info = {
            "shicai": data["keyword"],
            "user_name": recipe["an"],
            "shicai_id": recipe["id"],
            # Strip newlines and spaces so the description is one compact string.
            "describe": recipe["cookstory"].replace("\n", "").replace(" ", ""),
            "caipu_name": recipe["n"],
            "zuoliao_list": recipe["major"],
        }
        detail_url = "http://api.douguo.net/recipe/detail/" + str(caipu_info["shicai_id"])
        # json.dumps escapes the keyword correctly; hand-built concatenation
        # produced invalid JSON whenever the keyword contained a quote.
        # NOTE(review): src/idx/type are assumed to be JSON strings - confirm
        # against the Fiddler capture.
        ext = json.dumps(
            {
                "query": {
                    "kw": data["keyword"],
                    "src": "2803",
                    "idx": "1",
                    "type": "13",
                    "id": caipu_info["shicai_id"],
                }
            },
            ensure_ascii=False,
        )
        detail_data = {
            "client": "4",
            "_session": "1547000257341354730010002552",
            "author_id": "0",
            "_vs": "2803",
            "ext": ext,
        }
        detail_response = handle_request(detail_url, detail_data)
        detail_recipe = json.loads(detail_response.text)["result"]["recipe"]
        caipu_info["tips"] = detail_recipe["tips"]
        caipu_info["cookstep"] = detail_recipe["cookstep"]
        mongo_info.insert_item(caipu_info)
# Script entry: fill the queue with one search payload per category.
handle_index()
futures = []
# The with-block shuts the pool down and waits for every submitted task.
with ThreadPoolExecutor(max_workers=20) as pool:
    # NOTE: multiprocessing.Queue.qsize() is approximate and raises
    # NotImplementedError on platforms such as macOS.
    while queue_list.qsize() > 0:
        futures.append(pool.submit(handle_caipu_list, queue_list.get()))
# submit() stores a worker's exception on its future; report failures here
# instead of losing them silently.
for future in futures:
    error = future.exception()
    if error is not None:
        print("handle_caipu_list failed:", repr(error))
通過使用代理IP隱藏爬蟲
當app運維人員發現我們一直在請求他們的服務器,很可能就把咱們的ip給封了,所以通過代理ip的方式隱藏自我。
注冊申請 abuyun.com
一個小時1元,我申請了一個小時咱們一起使用下
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/1/11 2:40
# @Author : Aries
# @Site :
# @File : handle_proxy.py
# @Software: PyCharm
#60.17.177.187 代理出來的ip
import requests
# Ask an IP-echo service which address the request arrives from, to confirm
# that traffic is leaving through the proxy.
url = "http://ip.hahado.cn/ip"
# NOTE(review): the proxy URL embeds account credentials; load them from
# configuration instead of committing them with the code.
proxy = {"http": "http://H79623F667Q3936C:84F1527F3EE09817@http-cla.abuyun.com:9030"}
# A timeout keeps the check from hanging if the proxy is unreachable.
response = requests.get(url=url, proxies=proxy, timeout=10)
print(response.text)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/1/9 11:06
# @Author : lm
# @Url : idig8.com
# @Site :
# @File : spider_douguomeishi.py
# @Software: PyCharm
import json
import requests
from multiprocessing import Queue
from handle_mongo import mongo_info
from concurrent.futures import ThreadPoolExecutor
# Create the queue that handle_index fills with search payloads.
queue_list = Queue()
def handle_request(url, data, timeout=10):
    """POST a form-encoded request to the Douguo API through the HTTP proxy.

    The header set was captured with Fiddler; some fields are probably
    optional and can only be identified by commenting them out and retrying.

    Args:
        url: Endpoint to post to.
        data: Mapping of form fields sent as the request body.
        timeout: Seconds to wait for the server; without a timeout a stalled
            connection would block the calling worker forever.

    Returns:
        The response object returned by requests.post.
    """
    # Every value is a string: requests only accepts str/bytes header values.
    header = {
        "client": "4",
        "version": "6916.2",
        "device": "SM-G955N",
        "sdk": "22,5.1.1",
        "imei": "354730010002552",
        "channel": "zhuzhan",
        "mac": "00:FF:E2:A2:7B:58",
        "resolution": "1440*900",
        "dpi": "2.0",
        "android-id": "bcdaf527105cc26f",
        "pseudo-id": "354730010002552",
        "brand": "samsung",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "Android",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
        "lon": "105.566938",
        "lat": "29.99831",
        "cid": "512000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        "Host": "api.douguo.net",
        # The captured imsi, Cookie and Content-Length headers are left out on
        # purpose (requests computes Content-Length itself).
    }
    # NOTE(review): the proxy URL embeds account credentials; load them from
    # configuration instead of committing them with the code.
    proxy = {"http": "http://H79623F667Q3936C:84F1527F3EE09817@http-cla.abuyun.com:9030"}
    response = requests.post(
        url=url, headers=header, data=data, proxies=proxy, timeout=timeout
    )
    return response
def handle_index():
    """Fetch the category catalog and queue one search payload per leaf category.

    Each payload is a dict of form fields whose "keyword" is the category
    name; the payloads are put on the module-level queue_list.
    """
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # Form body captured from the app:
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0",
    }
    response = handle_request(url, data)
    index_response_dic = json.loads(response.text)
    # The catalog nests three levels deep, each level under the "cs" key.
    for item_index in index_response_dic["result"]["cs"]:
        for item_index_cs in item_index["cs"]:
            for item in item_index_cs["cs"]:
                # No _session field is sent with the search payload.
                data_2 = {
                    "client": "4",
                    "keyword": item["name"],
                    "_vs": "400",
                    "order": "0",
                }
                queue_list.put(data_2)
def handle_caipu_list(data):
    """Search recipes for one ingredient keyword, fetch each detail, and store it.

    For every type-13 entry in the search result a second request is made to
    the detail endpoint; the merged record is saved through mongo_info.

    Args:
        data: Search payload produced by handle_index; data["keyword"] is the
            ingredient name being searched.
    """
    print("當前的食材:", data["keyword"])
    caipu_list_url = "http://api.douguo.net/recipe/s/0/20"
    caipu_response = handle_request(caipu_list_url, data)
    caipu_response_dict = json.loads(caipu_response.text)
    for caipu_item in caipu_response_dict["result"]["list"]:
        # Only list entries with type 13 are processed; the rest are skipped.
        if caipu_item["type"] != 13:
            continue
        recipe = caipu_item["r"]
        caipu_info = {
            "shicai": data["keyword"],
            "user_name": recipe["an"],
            "shicai_id": recipe["id"],
            # Strip newlines and spaces so the description is one compact string.
            "describe": recipe["cookstory"].replace("\n", "").replace(" ", ""),
            "caipu_name": recipe["n"],
            "zuoliao_list": recipe["major"],
        }
        detail_url = "http://api.douguo.net/recipe/detail/" + str(caipu_info["shicai_id"])
        # json.dumps escapes the keyword correctly; hand-built concatenation
        # produced invalid JSON whenever the keyword contained a quote.
        # NOTE(review): src/idx/type are assumed to be JSON strings - confirm
        # against the Fiddler capture.
        ext = json.dumps(
            {
                "query": {
                    "kw": data["keyword"],
                    "src": "2803",
                    "idx": "1",
                    "type": "13",
                    "id": caipu_info["shicai_id"],
                }
            },
            ensure_ascii=False,
        )
        detail_data = {
            "client": "4",
            "_session": "1547000257341354730010002552",
            "author_id": "0",
            "_vs": "2803",
            "ext": ext,
        }
        detail_response = handle_request(detail_url, detail_data)
        detail_recipe = json.loads(detail_response.text)["result"]["recipe"]
        caipu_info["tips"] = detail_recipe["tips"]
        caipu_info["cookstep"] = detail_recipe["cookstep"]
        mongo_info.insert_item(caipu_info)
# Script entry: fill the queue with one search payload per category.
handle_index()
futures = []
# The with-block shuts the pool down and waits for every submitted task.
with ThreadPoolExecutor(max_workers=2) as pool:
    # NOTE: multiprocessing.Queue.qsize() is approximate and raises
    # NotImplementedError on platforms such as macOS.
    while queue_list.qsize() > 0:
        futures.append(pool.submit(handle_caipu_list, queue_list.get()))
# submit() stores a worker's exception on its future; report failures here
# instead of losing them silently.
for future in futures:
    error = future.exception()
    if error is not None:
        print("handle_caipu_list failed:", repr(error))
PS:本次是app數據抓取的入門。首先是通過模擬器的代理服務,把請求轉到本地的電腦(安裝fiddler),這樣fiddler就可以抓取數據了。分析數據這塊要憑借自己的經驗找到對應的url,如果能分析到url,基本爬蟲就寫了一半。請求頭是通過fiddler獲取后封裝的,里面header內容比較多,嘗試刪減到最簡化,也是一種應對反爬蟲的策略:有的數據放進去反倒容易被發現是爬蟲,例如cookies等等,但是有的爬蟲爬取數據需要cookies。通過代理的方式設置代理ip,防止爬取過程中同一個ip一直請求一個接口而被發現是爬蟲。引入隊列的目的就是為了使用線程池的時候方便提取任務,抓取結果然后放入mongodb中。這樣使用多線程的app數據抓取就完成了。
更多關(guān)于云服務(wù)器,域名注冊,虛擬主機的問題,請訪問三五互聯(lián)官網(wǎng):m.shinetop.cn