# coding=utf-8
import requests
from lxml import etree
# Base URL of the forum; get_demo() prepends it to each thread href when
# building the link written to the output file.
host = "https://bbs.125.la/"
def get_demo(out=None, *, first_page=1, last_page=869,
             cookie="替换成自己的cookies", timeout=30):
    """Crawl the forum-98 listing pages and record every thread title and link.

    For each listing page the thread anchors are extracted from the
    ``threadlisttableid`` table, and one line per thread is written in the
    form ``<title>----<absolute url>``. Progress is echoed to stdout.

    Args:
        out: Writable text file object that receives the result lines. When
            omitted, the module-level ``f`` (opened by the script entry
            point) is used, which keeps plain ``get_demo()`` calls working.
        first_page: Number of the first listing page to fetch (inclusive).
        last_page: Number of the last listing page to fetch (inclusive).
        cookie: Raw value for the ``Cookie`` request header. The default is a
            placeholder and has to be replaced with a real cookie string.
        timeout: Seconds to wait for each HTTP request before giving up.
    """
    sink = f if out is None else out

    # The headers never change between pages, so build them once.
    # The raw cookie string belongs in the Cookie header: handing it to
    # requests as cookies={"Cookie": ...} would send it as the value of a
    # cookie literally named "Cookie".
    headers = {
        "Host": "bbs.125.la",
        "Connection": "keep-alive",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400",
        "X-Requested-With": "XMLHttpRequest",
        "Accept": "*/*",
        "Referer": "https://bbs.125.la/forum-98-1.html",
        "Cookie": cookie,
    }

    # range() makes both bounds explicit; the previous
    # "while i <= 869: i += 1" loop also requested page 870.
    for page in range(first_page, last_page + 1):
        print(page)
        url = "https://bbs.125.la/forum-98-" + str(page) + ".html"
        print("=" * 40)
        print(url)

        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as err:
            # One unreachable page should not abort the whole crawl.
            print("request failed", url, err)
            continue

        # The pages are decoded as GBK; undecodable bytes are dropped.
        text = response.content.decode('gbk', "ignore")
        tree = etree.HTML(text)
        if tree is None:
            # NOTE(review): lxml appears to return None for an empty
            # document - confirm against the installed lxml version.
            continue

        for link in tree.xpath("//*[@id='threadlisttableid']/tbody/tr/th/a[2]"):
            titles = link.xpath("text()")
            hrefs = link.xpath("@href")
            if not titles or not hrefs:
                # Anchor without text or href: nothing to record.
                continue
            title, href = titles[0], hrefs[0]
            if "html" not in href:
                continue
            try:
                sink.write(title + "----" + host + href + "\n")
                print(title, host + href)
            except UnicodeError:
                # Only encoding failures are reported here; other errors
                # (e.g. a closed file) now propagate.
                print("编码有误", title)
# Script entry point: open the output file and run the crawl.
if __name__ == '__main__':
    # `f` must stay a module-level name because get_demo() writes to it.
    # The `with` block closes the file even if the crawl raises.
    with open('demo.txt', mode='w', encoding='utf-8') as f:
        get_demo()