上次代码只能抓取一个网页上的链接,本次可以自主设定抓取的页面个数。
代码如下:
from selenium import webdriver
import os
import time


class DownloadFiles():
    """Page through the announcement list at ``self.url`` and print PDF links.

    For each page the scraper collects every ``<a>`` element, prints the
    text and the href of those whose href ends in ``'pdf'``, then clicks
    the element with class ``next`` to move to the following page.

    NOTE(review): ``webdriver.PhantomJS`` and the ``find_element(s)_by_*``
    helpers used here belong to older Selenium releases - confirm the
    installed Selenium version still provides them.
    """

    def __init__(self, times=7, pause=5):
        """Configure the scraper.

        Args:
            times: Number of pages to visit (one "next" click per page).
            pause: Seconds to wait after each "next" click.
        """
        self.url = 'http://www.neeq.com.cn/disclosure/announcement.html'
        # Absolute path: makedir() changes the working directory, so a
        # relative base path would resolve differently on a later call.
        self.basePath = os.path.dirname(os.path.abspath(__file__))
        self.times = times
        self.pause = pause

    def makedir(self, name):
        """Create directory *name* under ``self.basePath`` and chdir into it.

        The directory is created only when it does not exist yet; in both
        cases the process working directory is switched to it afterwards.
        """
        path = os.path.join(self.basePath, name)
        if not os.path.exists(path):
            os.makedirs(path)
            print('File has been created.')
        else:
            print('The file is existed.')
        # Switch into the directory.
        os.chdir(path)

    def connect(self, url):
        """Start a PhantomJS session, load *url* and return the driver.

        If loading the page fails, the session is shut down before the
        exception propagates so no browser process is left behind.
        """
        driver = webdriver.PhantomJS()
        try:
            driver.get(url)
        except Exception:
            driver.quit()
            raise
        return driver

    def nextPage(self, driver):
        """Click the element with class ``next`` and wait ``self.pause`` seconds.

        The pause gives the page time to load the next batch of links
        before they are read.
        """
        next_button = driver.find_element_by_class_name('next')
        next_button.click()
        time.sleep(self.pause)

    @staticmethod
    def _is_pdf_link(link):
        """Return True when *link* is a non-empty href ending in ``'pdf'``."""
        return bool(link) and link.endswith('pdf')

    def getFiles(self):
        """Visit ``self.times`` pages and print the title and URL of each PDF.

        The driver is always quit when the run ends, whether it finished
        normally or was interrupted by an error.
        """
        # Local import: brings the exception type into scope without
        # touching the file-level import block.
        from selenium.common.exceptions import WebDriverException

        driver = self.connect(self.url)
        try:
            self.makedir('Files')
            # Page through the list automatically.
            for i in range(self.times):
                print('第' + str(i + 1) + '页:')
                for anchor in driver.find_elements_by_tag_name('a'):
                    try:
                        link = anchor.get_attribute('href')
                        # Anchors without an href yield None; skip them
                        # explicitly instead of relying on an exception.
                        if not self._is_pdf_link(link):
                            continue
                        print(anchor.text)
                        print(link)
                        # NOTE: only the title and URL are reported; the
                        # original left the actual download disabled.
                    except (WebDriverException, UnicodeEncodeError):
                        # Best effort: skip an anchor that cannot be read
                        # (e.g. it went stale) or whose title cannot be
                        # encoded for the console.
                        continue
                self.nextPage(driver=driver)
        finally:
            driver.quit()


if __name__ == '__main__':
    obj = DownloadFiles()
    obj.getFiles()