|
- #!/usr/bin/python
- # -*- coding: utf-8 -*-
- import requests
- from lxml import etree
- import sqlite3
def write_sql(c, text):
    """Parse one listing page and store every article not yet in the table.

    Args:
        c: sqlite3 cursor on a database that contains the ``Python`` table.
        text: HTML source of a listing page.

    Returns:
        True if at least one new row was inserted, False otherwise.

    Note:
        Absolute URLs are built from the module-level ``host``.
        The date text scraped from ``<em>`` is written to the ``Author``
        column, matching the existing table layout.
    """
    html = etree.HTML(text)
    if html is None:
        # NOTE(review): lxml appears to return None for an empty document;
        # treat that as a page with no articles -- confirm against lxml docs.
        return False

    anchor = '//ul[@class="news"]//a[@target="_blank"]'
    # Titles
    titles = html.xpath(anchor + '/p/text()')
    # Links
    hrefs = html.xpath(anchor + '/@href')
    # Dates
    ems = html.xpath(anchor + '/em/text()')

    number = 0
    for title, href, em in zip(titles, hrefs, ems):
        href = host + href
        # Skip articles whose URL is already stored. Bound parameters are
        # used instead of string formatting so quotes in scraped text can
        # neither break nor alter the statement.
        c.execute("SELECT 1 FROM Python WHERE Url = ? LIMIT 1", (href,))
        if c.fetchone() is not None:
            continue
        # str() converts lxml's str subclasses into plain strings before
        # they are bound.
        c.execute(
            "INSERT INTO Python (Url, Title, Author) VALUES (?, ?, ?)",
            (href, str(title), str(em)),
        )
        number += 1
        print(title, href, em)
    # Commit through the cursor's own connection rather than a global.
    c.connection.commit()
    return number > 0
if __name__ == '__main__':
    # Crawl every category linked from the navigation bar and store its
    # articles page by page. `conn` and `host` are deliberately module-level
    # names because write_sql reads them.
    request_timeout = 10  # seconds; without it a stalled server hangs forever

    conn = sqlite3.connect("Python-xxx.db")
    try:
        c = conn.cursor()
        c.execute('''CREATE TABLE IF NOT EXISTS Python (
            Url VARCHAR,
            Title VARCHAR,
            Author VARCHAR
        )''')
        conn.commit()

        host = "https://xxx"
        url = host + "/xxx"
        req = requests.get(url, timeout=request_timeout)
        req.encoding = 'utf-8'
        html = etree.HTML(req.text)

        nav_links = '//*[@class="nav clearfix"]//a[starts-with(@href, "/cate/")]'
        clearfixs = html.xpath(nav_links + '/text()')
        hrefs = html.xpath(nav_links + '/@href')

        for clearfix, href in zip(clearfixs, hrefs):
            print(clearfix, host + href)
            page = 1
            while True:
                url = host + href + "/list_%s.html" % page
                req = requests.get(url, timeout=request_timeout)
                req.encoding = 'utf-8'
                # A page that yields no new rows ends this category: either
                # the listing ran out or everything on it is already stored.
                if not write_sql(c, req.text):
                    break
                print("第%s页" % page)
                page += 1
    finally:
        # Release the database handle even when a request or parse fails.
        conn.close()
xpath 用着就是舒服~
|
|