|
好久没有写代码了,今天闲着就来写写这个。
若写得不好请大神指点,谢谢!
from pprint import pprint
from urllib import request
from urllib.parse import urljoin

from lxml import etree
class GetMoviePlay:
    """Scrape one movie listing page of yongjiuzy.cc and resolve play links.

    An instance is bound to a single listing page: the page number passed to
    the constructor is substituted into the listing URL. ``movie_info_dict``
    downloads and parses that listing; ``getMovieTrue`` downloads one movie's
    detail page and extracts its name and play link.

    XPath expressions, in the order they are stored in ``self.__name``:

    0. movie title            //*[@id="data_list"]/tr/td[1]/a/text()
    1. movie page link        //*[@id="data_list"]/tr/td[1]/a/@href
    2. movie type             //*[@id="data_list"]/tr/td[2]/a/text()
    3. movie region           //*[@id="data_list"]/tr/td[3]/p/font/text()
    4. serialization status   //*[@id="data_list"]/tr/td[4]/span/font/text()
    5. update time            //*[@id="data_list"]/tr/td[5]/font/text()
    6. current page details   /html/body/div[4]/div[1]/div/text()[1]
    7. movie name (detail)    /html/body/div[4]/div[1]/div/div[2]/li[1]/text()[2]
    8. real play link         /html/body/div[4]/div[2]/div/div/ul/li[2]/text()
    """

    # Base against which the relative hrefs found in the listing are resolved.
    _BASE_URL = 'http://yongjiuzy.cc/'

    def __init__(self, page):
        """Bind the scraper to listing page number ``page``.

        ``page`` is formatted with ``%d``, so it must be a number.
        """
        self.__url = 'http://yongjiuzy.cc/?m=vod-type-id-6-pg-%d.html' % page
        self.__name = (
            '//*[@id="data_list"]/tr/td[1]/a/text()',
            '//*[@id="data_list"]/tr/td[1]/a/@href',
            '//*[@id="data_list"]/tr/td[2]/a/text()',
            '//*[@id="data_list"]/tr/td[3]/p/font/text()',
            '//*[@id="data_list"]/tr/td[4]/span/font/text()',
            '//*[@id="data_list"]/tr/td[5]/font/text()',
            '/html/body/div[4]/div[1]/div/text()[1]',
            '/html/body/div[4]/div[1]/div/div[2]/li[1]/text()[2]',
            '/html/body/div[4]/div[2]/div/div/ul/li[2]/text()',
        )

    @staticmethod
    def _fetch_tree(url):
        """Download ``url`` and return the parsed lxml HTML tree."""
        # The context manager closes the HTTP response even if reading fails.
        with request.urlopen(request.Request(url)) as response:
            return etree.HTML(response.read())

    @classmethod
    def _absolute_urls(cls, hrefs):
        """Resolve each href against the site base URL."""
        # urljoin copes with "/path", "?query" and already-absolute hrefs,
        # where plain string concatenation would produce "//" or a bad URL.
        return [urljoin(cls._BASE_URL, href) for href in hrefs]

    @property
    def movie_info_dict(self):
        """Fetch the listing page and return its columns as a dict of lists.

        Every access performs a new HTTP request.

        Keys: ``context`` (titles), ``url`` (absolute detail-page links),
        ``type``, ``city`` (region), ``state`` (serialization status),
        ``time`` (update time) and ``thisPage`` (current page details).
        """
        tree = self._fetch_tree(self.__url)
        # Keep every other title text node (indexes 0, 2, 4, ...): the
        # original author notes that only these are the main titles.
        titles = tree.xpath(self.__name[0])[::2]
        return {
            'context': titles,
            'url': self._absolute_urls(tree.xpath(self.__name[1])),
            'type': tree.xpath(self.__name[2]),
            'city': tree.xpath(self.__name[3]),
            'state': tree.xpath(self.__name[4]),
            'time': tree.xpath(self.__name[5]),
            'thisPage': tree.xpath(self.__name[6]),
        }

    def getMovieTrue(self, url):
        """Fetch a movie detail page and return ``[name, play_link]``.

        Raises:
            IndexError: if the name or the play link is not found on the page.
        """
        tree = self._fetch_tree(url)
        names = tree.xpath(self.__name[7])
        links = tree.xpath(self.__name[8])
        if not names or not links:
            # Same exception type as a bare [0] lookup, but with context.
            raise IndexError('movie name or play link not found at %s' % url)
        return [names[0], links[0]]
if __name__ == '__main__':
    # Scrape listing page 1.
    a = GetMoviePlay(1)
    # The listing returns several columns; only the detail-page URLs are used.
    url = a.movie_info_dict['url']
    # Resolve and print the [name, play link] pair of every listed movie.
    for detail_url in url:
        info = a.getMovieTrue(detail_url)
        pprint(info)
复制代码
|
-
评分
-
查看全部评分
|