# A minimal scraper suitable for beginners. Install the two dependencies from the command line with: pip install requests  and  pip install BeautifulSoup4
import requests
import bs4#BeautifulSoup4
import re
#定义类
class DouBanGet:
    """Scrape one page of Douban's Top-250 movie chart.

    The constructor fetches ``url`` once; the accessor methods pull
    titles, ratings, blurbs, poster URLs and the total entry count out
    of the parsed HTML.
    """

    def __init__(self, url):
        """Download *url* and parse the response with BeautifulSoup.

        :param url: a ``https://movie.douban.com/top250...`` page URL
        :raises requests.HTTPError: if the server answers with 4xx/5xx
        """
        self.url = url
        # Browser-like User-Agent: Douban blocks the default
        # python-requests UA string.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
        # timeout= so a stalled connection cannot hang the scraper forever
        self.res = requests.get(self.url, headers=self.headers, timeout=30)
        # Fail loudly on an HTTP error page instead of silently parsing it.
        self.res.raise_for_status()
        self.soup = bs4.BeautifulSoup(self.res.text, "html.parser")

    def getTitle(self):
        """Return the list of movie titles on this page."""
        targets = self.soup.find_all("div", class_="hd")
        return [each.a.span.text for each in targets]

    def getRating_num(self):
        """Return the list of rating strings (e.g. ``"9.7"``) on this page."""
        targets = self.soup.find_all("span", class_="rating_num")
        return [each.text for each in targets]

    def getBrief(self):
        """Return the list of movie blurbs on this page.

        NOTE(review): div.bd also matches non-movie page chrome whose
        first <p> text is "豆瓣"; the filter below drops those entries,
        which presumably keeps the list aligned with getTitle() — the
        lists may still differ in length on some pages.
        """
        targets = self.soup.find_all("div", class_="bd")
        return [each.p.text for each in targets if each.p.text != "豆瓣"]

    def getPic(self):
        """Return the list of poster image URLs on this page."""
        targets = self.soup.find_all("div", class_="pic")
        return [each.a.img['src'] for each in targets]

    def getCount(self):
        """Return the total number of chart entries as a digit-only string."""
        targets = self.soup.find("span", class_="count")
        # Raw string for the regex: "\D" in a plain literal is an invalid
        # escape sequence (SyntaxWarning from Python 3.12 onward).
        return re.sub(r"\D", "", targets.text)
# ------------------------- scraping entry point -------------------------
def main():
    """Crawl every page of the Top-250 chart and save it to a text file.

    Each output line is ``title|rating|blurb|poster_url`` terminated by
    an explicit ``\\r\\n``.
    """
    first = DouBanGet("https://movie.douban.com/top250")
    total = int(first.getCount())   # total number of chart entries
    # Ceiling division: a trailing partial page (total % 25 != 0) must
    # still be fetched; plain // would drop it.
    pages = -(-total // 25)
    rows = []                       # collect lines, join once (avoids O(n^2) +=)
    for page in range(pages):
        # Each page shows 25 entries; ?start= is the zero-based offset.
        url = "https://movie.douban.com/top250?start=%d" % (page * 25)
        db = DouBanGet(url)
        titles = db.getTitle()
        ratings = db.getRating_num()
        briefs = db.getBrief()
        pics = db.getPic()
        print(len(briefs))          # debug: should equal len(titles)
        # zip stops at the shortest list, so a page where the blurb
        # filter dropped an entry no longer raises IndexError.
        for row in zip(titles, ratings, briefs, pics):
            line = "|".join(row)
            rows.append(line + "\r\n")
            print(line)             # debug output
    # Save to the desktop. newline="" stops the text layer from
    # translating the explicit \r\n endings; encoding handles the
    # Chinese text that a default locale codec could not.
    with open(r"C:\Users\Administrator\Desktop\top250.txt", "w",
              encoding="utf-8", newline="") as f:
        f.writelines(rows)


if __name__ == "__main__":
    main()