BeautifulSoup

参考：http://www.freebuf.com/news/special/96763.html

相关资料：http://www.jb51.net/article/65287.htm

1、Python3 win7安装BeautifulSoup

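BeautifulSoup中文文档：http://www.crummy.com/software/BeautifulSoup/bs3/documentation.zh.html （注意：该链接是BeautifulSoup 3的文档；本文代码使用的是bs4，对应的中文文档见 https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/ ）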

BeautifulSoup下载：http://www.crummy.com/software/BeautifulSoup/

解压，运行cmd执行：python setup.py install即可

2、导入BeautifulSoup库：from bs4 import BeautifulSoup

传入数据，建立对象： soup = BeautifulSoup(data)，

操作soup，完成需求解析。

3、示例代码：

 from bs4 import BeautifulSoup
 from urllib import request
 import re

 # Fetch the page; the context manager closes the HTTP response once read.
 with request.urlopen('http://www.freebuf.com') as web:
     html = web.read()

 # When no parser is named, bs4 picks whichever one it considers best, and
 # that choice can differ between environments. Leaving out 'html.parser'
 # makes bs4 emit a warning about this automatic parser selection.
 soup = BeautifulSoup(html, 'html.parser')

 # Keep only the anchors whose href is an absolute http(s) URL.
 tags_a = soup.find_all(name='a', attrs={'href': re.compile('^https?://')})
 for tag_a in tags_a:
     print(tag_a['href'])

4、利用BeautifulSoup获取网站的sitemap:

 # coding:utf-8
 # Build the sitemap of an entire website
 import urllib.request
 import urllib.error
 from urllib.parse import urlparse
 from bs4 import BeautifulSoup
 import time
 import datetime

 url = input('请输入扫描的url:')
 domain = input('请输入包含的域名：')
 # Every URL discovered so far; get_local_pages() and dfs() both use it.
 sites = set()

 # Collect all the URLs found on a single page
 def get_local_pages(url, domain, max_retries=5):
     """Fetch ``url`` and return the in-scope links found in its ``<a>`` tags.

     Args:
         url: Address of the page to download; relative links are resolved
             against it.
         domain: Substring that a link's host must contain to be kept.
         max_retries: Number of failed download attempts after which the
             page is given up on.

     Returns:
         A set of absolute URLs that are not yet in the module-level
         ``sites`` set. The set is empty when the page could not be
         downloaded.
     """
     # Imported locally so this function does not rely on a file-level import.
     from urllib.parse import urljoin

     pages = set()
     repeat_time = 0
     # Parsed form of the page URL, used to complete scheme-less links.
     parse_url = urlparse(url)

     # Guard against a stalled download: retry until max_retries failures.
     while True:
         try:
             print('Ready to Open the web!')
             time.sleep(1)
             print('Opening the web : %s' % url)
             # Read inside the try block so a failure while reading is retried
             # too; the context manager closes the response once consumed.
             with urllib.request.urlopen(url=url, timeout=20) as web:
                 html = web.read()
             print('Success to Open the web!')
             break
         except OSError as e:  # URLError and socket timeouts derive from OSError
             print('Open Url Error:', e)
             print('Open url Failed!!!Repeat!')
             time.sleep(1)
             repeat_time += 1
             if repeat_time >= max_retries:
                 # Return the (empty) set instead of None so callers can
                 # always iterate over the result.
                 return pages

     # Name the parser explicitly so results do not vary between environments.
     soup = BeautifulSoup(html, 'html.parser')
     for tag in soup.find_all(name='a'):
         # Anchors without an href attribute carry no link.
         try:
             ret = tag['href']
         except KeyError:
             print('Maybe not the attr : href')
             continue

         parse_page = urlparse(ret)
         scheme, netloc, path = parse_page[0], parse_page[1], parse_page[2]

         # 1. Nothing usable: scheme, host and path are all empty.
         #    (Backslashes are escaped; the printed text is unchanged.)
         if scheme == '' and netloc == '' and path == '':
             print('Bad Page(协议\\域名\\路径均为空):%s' % ret)
             continue

         # 2. A scheme is present but it is not an http one.
         if scheme != '' and 'http' not in scheme:
             print('Bad Page(协议不合法,非http):%s' % ret)
             continue

         # 3. A host is present but does not contain the requested domain.
         if netloc != '' and domain not in netloc:
             print('Bad Page(域名不合法,非%s):%s' % (domain, ret))
             continue

         # 4. Host without scheme, e.g. //caipiao.taobao.com: prepend the
         #    scheme of the page being scanned.
         if scheme == '' and netloc != '':
             print('Fix page(仅域名存在): %s' % ret)
             newpage = parse_url[0] + ':' + ret
             if newpage not in sites:
                 print('Add Fix Page(拼接域名):%s' % newpage)
                 pages.add(newpage)
             continue

         # 5. Path only: resolve it against the current page. urljoin handles
         #    both root-relative and directory-relative paths and leaves any
         #    '//' inside a query string untouched.
         if scheme == '' and netloc == '':
             print('Fix page(仅路径存在): %s' % ret)
             newpage = urljoin(url, ret)
             if newpage not in sites:
                 print('Add Fix Page(拼接路径):%s' % newpage)
                 pages.add(newpage)
             continue

         # Already a complete URL: take it as it is.
         newpage = ret
         if newpage not in sites:
             print('Add New Page:%s' % newpage)
             pages.add(newpage)

     return pages

 # Depth-first traversal of the whole site (fine for small and medium sites
 # for now; still to be improved)
 def dfs(pages, domain):
     """Crawl depth-first from ``pages``, recording each visited URL in ``sites``.

     Args:
         pages: Iterable of URLs to start from; ``None`` or an empty
             collection means there is nothing to crawl.
         domain: Passed through to ``get_local_pages`` for every page fetched.
     """
     # An explicit stack of iterators replaces the recursion: pages are still
     # visited in the same order, but a long chain of links can no longer
     # exceed Python's recursion limit.
     stack = [iter(pages or ())]
     while stack:
         for page in stack[-1]:
             if page in sites:
                 continue
             sites.add(page)
             get_pages = get_local_pages(page, domain)
             # An empty or None result (for example after a failed download)
             # has nothing to descend into.
             if get_pages:
                 stack.append(iter(get_pages))
                 break
         else:
             # This level is exhausted: resume the parent page's links.
             stack.pop()

 t1 = datetime.datetime.now()
 pages = get_local_pages(url, domain)
 dfs(pages, domain)
 text_name = domain + '全站扫描.txt'
 # Open the report once and append everything in a single pass. The explicit
 # encoding keeps the non-ASCII text writable whatever the platform default is.
 with open(text_name, 'a', encoding='utf-8') as f:
     f.write('\n' + str(datetime.datetime.now()) + '\n')
     for i in sites:
         f.write(i + '\n')
     f.write('\n用时：' + str(datetime.datetime.now() - t1) + '\n')

sitemap

5、基本知识点

bs4基本API的使用：关于BeautifulSoup的基本用法，这里介绍下面脚本中用到的方法：

Soup = BeautifulSoup(data) #构建一个解析器

Tags = Soup.findAll(name,attr)

我们重点要讲findAll方法（在bs4中推荐写作find_all，findAll是兼容旧版本的别名）的两个参数：name和attrs

Name：指的是标签名，传入一个标签名，就可以返回所有该名称的标签

Attr：是一个字典，存储需要匹配的标签属性，返回属性与之匹配的标签

Tag.children 表示获取tag标签的所有直接子节点（返回的是一个迭代器）

Tag.string 表示获取tag标签内的字符串：仅当tag内只有一个字符串（或只有一个子标签且该子标签内只有一个字符串）时才返回该字符串，否则返回None；如果想不用一层一层索引就取得标签内的全部文本，请使用 Tag.get_text() 或 Tag.strings

Tag.attrs[key] 表示获取tag标签的属性中键为key的值

Tag.img 表示获取tag标签内标签名为img的子标签（第一个）

6、利用BeautifulSoup获取58页面的指定信息（python2.7）

 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 # Python 2.7 script: print selected fields of one 58.com rental listing.
 import urllib
 import urllib2
 from bs4 import BeautifulSoup

 url = 'http://ny.58.com/zufang/24584108096437x.shtml?qq-pf-to=pcqq.c2c'

 # rq = urllib2.Request(url)
 # print rq
 rp = urllib.urlopen(url)
 html = rp.read()
 # Release the connection as soon as the page body has been read.
 rp.close()
 # Name the parser explicitly so bs4 does not pick one per environment.
 soup = BeautifulSoup(html, 'html.parser')

 # Title
 title = soup.find_all(name='h1', attrs={'class': 'main-title font-heiti'})
 for data in title:
     data_title = data.get_text()
     print(data_title)

 # Rent
 primary = soup.find_all(name='em', attrs={'class': 'house-price'})
 for data in primary:
     data_primary = data.get_text()
     print(data_primary)

 # House type
 house_type = soup.find_all(name='div', attrs={'class': 'fl house-type c70'})
 for data in house_type:
     # Replace dashes with spaces, then collapse all whitespace runs.
     temp_type = data.get_text().replace('-', ' ')
     temp_type = ' '.join(temp_type.split())
     print(temp_type)
     # data_type_list = []
     # for d in temp_type:
     #     data_type_list.append(d)
     # print data_type_list

 # Residential community
 xiaoqu = soup.find_all(name='div', attrs={'class': 'fl xiaoqu c70'})
 for data in xiaoqu:
     data_xiaoqu = data.get_text().strip()
     print(data_xiaoqu)

 # Furnishings
 config = soup.find_all(name='li', attrs={'class': 'house-primary-content-li clearfix person-config'})
 for data in config:
     data_config = data.div.get_text().replace('-',' ')
     data_config = ' '.join(data_config.split())
     print(data_config)

 # Contact person
 contact = soup.find_all(name='li', attrs={'class': 'house-primary-content-li clearfix person-contact'})
 for data in contact:
     data_contact = data.div.span.get_text()
     print(data_contact)

 # Write to file
 # with open('58_test1.txt','w') as f:
 #     f.write('标题：'+data_title.decode('gbk'))
 #     f.write('租金：' + data_primary)

个人收藏笔记记录

开通VIP