|
- #!/usr/bin/python
- # -*- coding: utf-8 -*-
- import sqlite3
- import requests
- from bs4 import BeautifulSoup
- from re import escape
- if __name__ == '__main__':
- conn = sqlite3.connect('Python.db')
- c = conn.cursor()
- c.execute('''CREATE TABLE IF NOT EXISTS Python (
- Url VARCHAR,
- Title VARCHAR,
- Author VARCHAR
- )''')
- conn.commit()
- # --------------------Split Line--------------------
- headers = {
- "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36"
- }
- for i in range(1, 1046):
- url = "http://xxx/index_%s.html" % str(i)
- req = requests.get(url=url, headers=headers)
- req.encoding = "utf-8"
- html = BeautifulSoup(req.text, "lxml")
- # --------------------Split Line--------------------
- for div in html.find_all('div', class_='loop'):
- content_body = div.select('h2 > a')[0]
- content_infor = div.select('.content_infor > span:nth-child(3)')[0]
- # --------------------Split Line--------------------
- cursor = c.execute(
- "SELECT COUNT(*) FROM Python WHERE Url = '%s'" % ("http://xxx" + content_body.get('href')))
- len = 0
- for row in cursor:
- len = row[0]
- if len > 0:
- continue
- # --------------------Split Line--------------------
- c.execute('INSERT INTO Python( Url, Title, Author) VALUES ( "%s", "%s", "%s")' % (
- "http://xxx" + content_body.get('href'),
- escape(content_body.get('title').replace(""", """")),
- content_infor.text.replace('xxx: ', '')))
- conn.commit()
- print("第%s页" % str(i))
- # --------------------Split Line--------------------
- conn.close()
复制代码
|
|