#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018/12/28 0028 下午 12:34
# @Author : XXX
# @function :spider of mafengwo
# @File : mafengwoSpider.py
# @IDE :python 3.7
import time
import urllib
import random
import socket
# Set the process-wide default timeout, in seconds, applied to socket objects
# created after this call.
socket.setdefaulttimeout(10)
# Wall-clock time at import, in whole milliseconds since the epoch.
# NOTE(review): `times` is not referenced in the visible part of this file —
# confirm it is still needed.
times = int(round(time.time() * 1000))
import requests
import json
from lxml import etree
import os
# Change the current working directory to the given path.
# NOTE(review): this runs at import time and uses a machine-specific Windows
# path; os.chdir raises OSError if the directory does not exist.
os.chdir(r'E:\dataSet\马蜂窝\travel_notes')
# Define the crawler function; its input parameter is the page number.
def spider(pageid):
    """Fetch one page of the Mafengwo recommended travel-note feed.

    Args:
        pageid: Page number to request. It is converted with ``str()`` and
            embedded in the ``params`` query argument.

    Returns:
        The value stored under ``data -> html`` in the JSON response
        (presumably an HTML fragment listing the notes; verify against the
        live endpoint).

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP status.
        ValueError: If the response body is not valid JSON.
        KeyError: If the JSON payload has no ``data`` or ``html`` entry.
    """
    # The query carries a JSON-style object whose double quotes are
    # percent-encoded as %22.
    query = ('{%22type%22:0,%22objid%22:0,%22page%22:' + str(pageid)
             + ',%22ajax%22:1,%22retina%22:0}')
    # The argument name has to be "params". The previous literal contained the
    # mangled text "¶ms" (an HTML-entity corruption of "&params"), so the
    # argument never reached the server under its real name.
    url = 'http://pagelet.mafengwo.cn/note/pagelet/recommendNoteApi?params=' + query

    # NOTE(review): this is a hard-coded browser session cookie captured in
    # 2018; it is kept unchanged so the request is identical, but it should be
    # moved to configuration. The ``uva`` value was split by a raw line break
    # in the source literal (a syntax error) and is rejoined here.
    cookie = (
        'mfw_uuid=5c25ab21-48da-4194-aa9a-6dceda2939f4; _r=baidu; '
        '_rp=a%3A2%3A%7Bs%3A1%3Ap%3Bs%3A19%3Awww.baidu.com%2Fbaidu%3Bs%3A1%3At%3Bi%3A1546046770%3B%7D; '
        'oad_n=a%3A6%3A%7Bs%3A5%3Arefer%3Bs%3A21%3Ahttps%3A%2F%2Fwww.baidu.com%3Bs%3A2%3Awk%3Bs%3A6%3A%C2%ED%B7%E4%CE%D1%3Bs%3A2%3Ahp%3Bs%3A13%3Awww.baidu.com%3Bs%3A3%3Aoid%3Bi%3A3546%3Bs%3A2%3Adm%3Bs%3A15%3Awww.mafengwo.cn%3Bs%3A2%3Aft%3Bs%3A19%3A2018-12-28+12%3A48%3A33%3B%7D; '
        '__mfwlv=1546078578; __mfwvn=3; __mfwlt=1546080051; '
        'uva=s%3A1855%3Aa%3A4%3A%7Bs%3A13%3Ahost_pre_time%3Bs%3A10%3A2018-12-28%3Bs%3A2%3Alt%3Bi%3A1545972514%3Bs%3A10%3Alast_refer%3Bs%3A1727%3Ahttps%3A%2F%2Fwww.baidu.com%2Fbaidu%3Ftn%3D56060048_4_pg%26ch%3D14%26ie%3Dutf-8%26ssl_s%3D1%26ssl_c%3Dssl1_167f32420f3%26word%3D%25E9%25A9%25AC%25E8%259C%2582%25E7%25AA%259D%26h_search_ext%3D%257B%2522count%2522%253A3%252C%2522list%2522%253A%255B%257B%2522txt%2522%253A%2522%255Cu516d%255Cu5c0f%255Cu9f84%255Cu7ae5%255Cu56de%255Cu5e94%255Cu88ab%255Cu9ed1%2522%252C%2522cid%2522%253A%252246606222%2522%252C%2522sellv%2522%253Anull%252C%2522sell%2522%253Afalse%257D%252C%257B%2522txt%2522%253A%2522%255Cu66fe%255Cu5fd7%255Cu4f1f%255Cu5426%255Cu8ba4%255Cu9189%255Cu9a7e%2522%252C%2522cid%2522%253A%252246594522%2522%252C%2522sellv%2522%253Anull%252C%2522sell%2522%253Afalse%257D%252C%257B%2522txt%2522%253A%2522%255Cu5218%255Cu8bd7%255Cu8bd7%255Cu6000%255Cu5b55%255Cu540e%255Cu9732%255Cu9762%2522%252C%2522cid%2522%253A%252228673341%2522%252C%2522sellv%2522%253Anull%252C%2522sell%2522%253Afalse%257D%252C%257B%2522txt%2522%253A%2522%255Cu5b55%255Cu5987%255Cu643a%255Cu81ea%255Cu95ed%255Cu513f%255Cu81ea%255Cu6740%2522%252C%2522cid%2522%253A%252228673347%2522%252C%2522sellv%2522%253Anull%252C%2522sell%2522%253Afalse%257D%252C%257B%2522txt%2522%253A%2522%255Cu65b9%255Cu5a9b%255Cu90ed%255Cu5bcc%255Cu57ce%255Cu5408%255Cu7167%2522%252C%2522cid%2522%253A%252228673343%2522%252C%2522sellv%2522%253Anull%252C%2522sell%2522%253Afalse%257D%252C%257B%2522txt%2522%253A%2522%255Cu4ee3%255Cu9a'
        '7e%255Cu8eab%255Cu4ea1%2522%252C%2522cid%2522%253A%252228673350%2522%252C%2522sellv%2522%253Anull%252C%2522sell%2522%253Afalse%257D%252C%257B%2522txt%2522%253A%2522%255Cu7206%255Cu9152%255Cu5e97%255Cu767d%255Cu5e8a%255Cu5355%255Cu6e05%255Cu6d17%2522%252C%2522cid%2522%253A%252228673345%2522%252C%2522sellv%2522%253Anull%252C%2522sell%2522%253Afalse%257D%252C%257B%2522txt%2522%253A%2522%255Cu90d1%255Cu723d%255Cu4e0e%255Cu7537%255Cu53cb%255Cu88ab%255Cu5076%255Cu9047%2522%252C%2522cid%2522%253A%252228673346%2522%252C%2522sellv%2522%253Anull%252C%2522sell%2522%253Afalse%257D%252C%257B%2522txt%2522%253A%2522%255Cu8521%255Cu4f9d%255Cu6797%255Cu56de%255Cu5e94%255Cu54ed%255Cu4e86%2522%252C%2522cid%2522%253A%252228673342%2522%252C%2522sellv%2522%253Anull%252C%2522sell%2522%253Afalse%257D%252C%257B%2522txt%2522%253A%2522%255Cu501f40%255Cu4e07%255Cu6447%255Cu53f7%255Cu8d2d%255Cu623f%2522%252C%2522cid%2522%253A%252228673348%2522%252C%2522sellv%2522%253Anull%252C%2522sell%2522%253Afalse%257D%255D%257D%3Bs%3A5%3Arhost%3Bs%3A13%3Awww.baidu.com%3B%7D%3B; '
        '__mfwurd=a%3A3%3A%7Bs%3A6%3Af_time%3Bi%3A1545972514%3Bs%3A9%3Af_rdomain%3Bs%3A13%3Awww.baidu.com%3Bs%3A6%3Af_host%3Bs%3A3%3Awww%3B%7D; '
        '__mfwuuid=5c25ab21-48da-4194-aa9a-6dceda2939f4; '
        'UM_distinctid=167f3243ba1cf-04d44fbecc28b5-143a7540-100200-167f3243ba2240; '
        'PHPSESSID=4t4h03q7nsjvgfskvogvqq6s04'
    )
    headers = {
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Connection': 'keep-alive',
        'Cookie': cookie,
        'Host': 'pagelet.mafengwo.cn',
        'Referer': 'http://www.mafengwo.cn/?mfw_chid=3546',
        # NOTE(review): the User-Agent is sent as an empty string — confirm
        # this is intended rather than a scrubbed value.
        'User-Agent': '',
    }

    # Request timeout in seconds. The previous random.choice(range(2, 3))
    # could only ever produce 2, so the value is now stated directly.
    timeout = 2
    response = requests.get(url=url, timeout=timeout, headers=headers)
    # Pause 2-3 s after every completed request so consecutive calls are
    # spaced out, including calls that end in an HTTP error below.
    time.sleep(random.uniform(2, 3))
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()

    data = json.loads(response.text)
    return data['data']['html']
def getid(html):
    """Collect the first image-link href of every note entry in a listing.

    Each ``<div class="tn-item clearfix">`` element is inspected and the
    ``href`` of the first ``<a>`` directly inside its
    ``<div class="tn-image">`` child is collected.

    Args:
        html: HTML markup to parse (presumably the fragment returned by
            ``spider``; confirm against the caller).

    Returns:
        A list of href strings in document order. Entries without an image
        link are skipped; empty input yields an empty list.
    """
    # Nothing to parse: return early rather than handing empty input to lxml.
    if not html:
        return []

    tree = etree.HTML(html)
    # Guard in case the parser produced no root element for this input.
    if tree is None:
        return []

    hrefs = []
    for item in tree.xpath('//div[@class = "tn-item clearfix"]'):
        links = item.xpath('./div[@class = "tn-image"]/a/@href')
        # Skip entries without an image link; indexing [0] unconditionally
        # used to abort the whole page with IndexError.
        if links:
            hrefs.append(links[0])
    return hrefs
def geteachpagehtml(id):
url = "https://www.mafengwo.cn/i/%s"%(id)+".html"
timeout = random.choice(range(10, 12))
proxiespool=['http://110.52.235.37:9999', 'http://119.101.116.87:9999','http://61.135.217.7:80', 'http://119.101.114.53:9999', 'http://119.101.116.255:9999', 'http://110.52.235.66:9999', 'http://119.101.116.113:9999', 'http://110.52.235.145:9999', 'http://119.101.116.124:9999', 'http://119.101.112.135:9999', 'http://119.101.116.42:9999', 'http://119.101.113.251:9999', 'http://119.101.113.109:9999', 'http://119.101.118.35:9999']
proxy = random.choice(proxiespool)
headers = {
"Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
'Accept-Encoding': 'gzip, deflate,br',
'Connection': 'keep-alive',
'Cache-Control':'max-age=0',
'Cookie':'mfw_uuid=5c25ab21-48da-4194-aa9a-6dceda2939f4; _r=baidu; _rp=a%3A2%3A%7Bs%3A1%3A%22p%22%3Bs%3A18%3A%22www.baidu.com%2Flink%22%3Bs%3A1%3A%22t%22%3Bi%3A1546173618%3B%7D; oad_n=a%3A3%3A%7Bs%3A3%3A%22oid%22%3Bi%3A1029%3Bs%3A2%3A%22dm%22%3Bs%3A18%3A%22tongji.mafengwo.cn%22%3Bs%3A2%3A%22ft%22%3Bs%3A19%3A%222019-01-02+13%3A51%3A38%22%3B%7D; __mfwlv=1546408064; __mfwvn=7; __mfwlt=1546410908; uva=s%3A1855%3A%22a%3A4%3A%7Bs%3A13%3A%22host_pre_time%22%3Bs%3A10%3A%222018-12-28%22%3Bs%3A2%3A%22lt%22%3Bi%3A1545972514%3Bs%3A10%3A%22last_refer%22%3Bs%3A1727%3A%22https%3A%2F%2Fwww.baidu.com%2Fbaidu%3Ftn%3D56060048_4_pg%26ch%3D14%26ie%3Dutf-8%26ssl_s%3D1%26ssl_c%3Dssl1_167f32420f3%26word%3D%25E9%25A9%25AC%25E8%259C%2582%25E7%25AA%259D%26h_search_ext%3D%257B%2522count%2522%253A3%252C%2522list%2522%253A%255B%257B%2522txt%2522%253A%2522%255Cu516d%255Cu5c0f%255Cu9f84%255Cu7ae5%255Cu56de%255Cu5e94%255Cu88ab%255Cu9ed1%2522%252C%2522cid%2522%253A%252246606222%2522%252C%2522sellv%2522%253Anull%252C%2522sell%2522%253Afalse%257D%252C%257B%2522txt%2522%253A%2522%255Cu66fe%255Cu5fd7%255Cu4f1f%255Cu5426%255Cu8ba4%255Cu9189%255Cu9a7e%2522%252C%2522cid%2522%253A%252246594522%2522%252C%2522sellv%2522%253Anull%252C%2522sell%2522%253Afalse%257D%252C%257B%2522txt%2522%253A%2522%255Cu5218%255Cu8bd7%255Cu8bd7%255Cu6000%255Cu5b55%255Cu540e%255Cu9732%255Cu9762%2522%252C%2522cid%2522%253A%252228673341%2522%252C%2522sellv%2522%253Anull%252C%2522sell%2522%253Afalse%257D%252C%257B%2522txt%2522%253A%2522%255Cu5b55%255Cu5987%255Cu643a%255Cu81ea%255Cu95ed%255Cu513f%255Cu81ea%255Cu6740%2522%252C%2522cid%2522%253A%252228673347%2522%252C%2522sellv%2522%253Anull%252C%2522sell%2522%253Afalse%257D%252C%257B%2522txt%2522%253A%2522%255Cu65b9%255Cu5a9b%255Cu90ed%255Cu5bcc%255Cu57ce%255Cu5408%255Cu7167%2522%252C%2522cid%2522%253A%252228673343%2522%252C%2522sellv%2522%253Anull%252C%2522sell%2522%253Afalse%257D%252C%257B%2522txt%2522%253A%2522%255Cu4ee3%255Cu9a7e%255Cu8eab%255Cu4ea1%2522%252C%2522cid%2522%253A
最新资源