|
import json
from requests.exceptions import RequestException
import requests
import re
from multiprocessing import Pool
def get_url(url):
    """Fetch *url* and return the response body as text, or None on failure.

    Returns None for any non-200 status or network error so callers can
    treat "page unavailable" uniformly.
    """
    try:
        # timeout keeps a dead/slow server from hanging the worker forever
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
def parse_page(html):
    """Yield one dict per movie entry found in a Maoyan board-page HTML string.

    Keys per entry: 'index' (rank digits), 'title', 'url' (poster image
    data-src), 'name' (star line with its 3-char label prefix stripped),
    'time' (release line with its 5-char label prefix stripped).
    """
    # Raw string: '\d' inside a non-raw literal is a SyntaxWarning on
    # Python 3.12+.  Compiled once per call instead of re.findall on a
    # plain string literal.
    pattern = re.compile(
        r'<dd>.*?"board-index.*?board-index.*?">(\d*)</i>.*?title="(.*?)"'
        r'.*?data-src="(.*?)".*?<p.*?"star">(.*?)</p>'
        r'.*?"releasetime">(.*?)</p>.*?</dd>',
        re.S,
    )
    for result in pattern.findall(html):
        yield {
            'index': result[0],
            'title': result[1],
            'url': result[2],
            # [3:] drops the leading label (presumably "主演：", 3 chars) —
            # TODO confirm against the live page markup
            'name': result[3].strip()[3:],
            # [5:] drops the leading label (presumably "上映时间：", 5 chars)
            'time': result[4].strip()[5:],
        }
def with_open(result):
    """Append *result* as one JSON line (non-ASCII preserved) to the output file."""
    with open('爬猫影电影网top100.txt', 'a', encoding='utf8') as f:
        # the with-block closes the file on exit; the original's explicit
        # f.close() inside the block was redundant and has been removed
        f.write(json.dumps(result, ensure_ascii=False) + '\n')
def main(i):
    """Scrape one board page at offset *i* and append its movies to the file.

    Designed to be mapped over offsets by a multiprocessing pool.
    """
    url = 'https://maoyan.com/board/4?offset=' + str(i)
    html = get_url(url)
    # get_url returns None on any failure; parse_page would raise a
    # TypeError if handed None, so skip the page instead of crashing
    if html is None:
        return
    for result in parse_page(html):
        with_open(result)
if __name__ == '__main__':
    # map() blocks until every offset is scraped; the with-block then
    # terminates the worker pool (original leaked the pool unclosed)
    with Pool() as pool:
        pool.map(main, [i * 10 for i in range(10)])
|
|