from hashlib import md5
from bs4 import BeautifulSoup
from urllib.parse import urlencode
import requests
from requests.exceptions import RequestException
import json
import re
import os
from multiprocessing import Pool
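
# main() calls get_page_index(), which this listing never defines. A minimal
# sketch follows, assuming a JSON search endpoint that takes offset/keyword
# query parameters (the unused urlencode import suggests exactly this); the
# URL and parameter names here are assumptions, not confirmed by the listing.
def get_page_index(offset, keyword):
    """Fetch one page of search-index JSON for the given keyword and offset."""
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)  # assumed endpoint
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('Index page request failed')
        return None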

def parse_page_index(html):
    """Yield the article URLs listed in one page of index JSON."""
    data = json.loads(html)
    if 'data' in data.keys():
        for item in data.get('data'):
            if item.get('article_url') is None:
                continue
            yield item.get('article_url')

def get_page_detail(url):
    """Fetch the HTML of a single article detail page."""
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('Detail page does not exist')
        return None

def parse_page_detail(html):
    """Extract the gallery image URLs embedded in a detail page and download them."""
    result = re.findall(r'gallery:.*?JSON\.parse\("(.*?)"\)', html, re.S)
    if not result:
        return
    soup = BeautifulSoup(html, 'lxml')
    title = soup.title.string  # page title, kept for reference
    for raw in result:
        # The gallery JSON is embedded with escaped quotes; strip the
        # backslashes so it parses as plain JSON.
        raw = re.sub(r'\\', '', raw)
        data = json.loads(raw)
        if 'count' in data.keys():
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            for image in images:
                download_images(image)

def save_images(content):
    """Write one image's bytes to disk, named by the MD5 of its content."""
    dir_path = 'E:/爬取的图片/'  # "crawled images" directory
    os.makedirs(dir_path, exist_ok=True)  # create the directory on first run
    file_path = dir_path + md5(content).hexdigest() + '.jpg'
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)  # the with block closes the file; no explicit close needed
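
# parse_page_detail() calls download_images(), which the listing omits. A
# minimal sketch, assuming each image URL can be fetched with a plain GET and
# the raw bytes handed to save_images() above:
def download_images(url):
    """Fetch one image URL and persist its bytes."""
    print('Downloading', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            save_images(response.content)
    except RequestException:
        print('Image request failed')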

def main(offset, keyword):
    """Process one index page: list its articles and scrape each detail page."""
    html = get_page_index(offset, keyword)
    if html is None:
        return
    for url in parse_page_index(html):
        detail_html = get_page_detail(url)
        if detail_html:
            parse_page_detail(detail_html)

if __name__ == '__main__':
    keyword = input('keyword:')
    # Fan the 20 index pages out across a worker pool (the original created
    # the Pool but never used it); each worker handles one offset.
    pool = Pool()
    pool.starmap(main, [(i * 20, keyword) for i in range(20)])
    pool.close()
    pool.join()