|
"""Scrape a novel from abcxs.com.

Fetches the book's index page, reads the title from the <meta
property="og:title"> tag, collects every chapter link from the
div.listmain block, then downloads each chapter's text into one
.txt file per chapter under a hard-coded output directory.
"""
from bs4 import BeautifulSoup
import requests, os, re


if __name__ == "__main__":
    server = "https://www.abcxs.com"
    url = "https://www.abcxs.com/book/13417/#main"
    r = requests.get(url)
    # NOTE(review): this is a Chinese site; if titles come out garbled,
    # try `r.encoding = r.apparent_encoding` before reading r.text.
    html = r.text

    # Parse once and reuse — the original built three soups of the same HTML.
    # 'html.parser' is stdlib and makes the parser choice deterministic.
    soup = BeautifulSoup(html, "html.parser")

    # --- Book title ---------------------------------------------------
    # Read the content attribute of <meta property="og:title" content="...">
    # directly instead of regex-matching the tag's string representation.
    meta = soup.find("meta", property="og:title")
    if meta is not None and meta.get("content"):
        ShuMing = meta["content"]
        print("book title:", ShuMing)
    else:
        print("Nothing found!!")
        raise SystemExit(1)  # no title -> no output directory; stop early

    # --- Chapter index ------------------------------------------------
    listmain = soup.find("div", class_="listmain")
    chapters = listmain.find_all("a") if listmain is not None else []

    # --- Output directory ----------------------------------------------
    path = "J:/python/Python/我的Python学习/爬虫及文件写入/" + ShuMing
    os.makedirs(path, exist_ok=True)  # also creates missing parents

    # --- Download every chapter -----------------------------------------
    # BUG FIX: the original download code sat *after* the loop (guarded by a
    # redundant nested `if __name__`), so it only saved the LAST chapter.
    for each in chapters:
        chapter_url = server + each.get("href")
        print(each.string, chapter_url)

        r = requests.get(chapter_url)
        chapter_soup = BeautifulSoup(r.text, "html.parser")
        text_div = chapter_soup.find("div", class_="showtxt")
        if text_div is None:
            print("no content found:", chapter_url)
            continue

        # Chapters pad paragraphs with runs of 8 NBSP chars: turn those
        # into newlines, and any stray NBSP into a plain space.
        content = text_div.text.replace("\xa0" * 8, "\n").replace("\xa0", " ")

        # Sanitize the chapter title so it is a legal filename on Windows.
        fname = re.sub(r'[\\/:*?"<>|]', "_", each.string or "untitled")

        # Write with an explicit encoding; the default locale codec may
        # not be able to encode Chinese text.
        with open(os.path.join(path, fname + ".txt"), "w", encoding="utf-8") as f:
            f.write(content)
|
|