The BeautifulSoup Library

Installation: conda install beautifulsoup4

Getting a div with a specified class

soup = BeautifulSoup(html)
result = soup.find_all(name='div', attrs={"class": "footer"})  # pass the attrs argument as a dict
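
The attrs dict above is equivalent to BeautifulSoup's class_ keyword argument. A self-contained sketch on a made-up HTML snippet:

from bs4 import BeautifulSoup

html = '<div class="footer">foot</div><div class="content">body</div>'
soup = BeautifulSoup(html)  # parser left implicit here; see the GuessedAtParserWarning note below

# Both calls return the same list containing the single <div class="footer"> element
print(soup.find_all('div', attrs={"class": "footer"}))
print(soup.find_all('div', class_="footer"))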

GuessedAtParserWarning

To avoid this warning, pass a parser such as lxml explicitly:

soup = BeautifulSoup(r.text, "lxml")
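
A minimal sketch of naming the parser explicitly, which silences the warning; "html.parser" ships with Python, while "lxml" must be installed separately (e.g. conda install lxml):

from bs4 import BeautifulSoup

html = "<div class='footer'>foot</div>"
soup = BeautifulSoup(html, "lxml")           # explicit parser, no GuessedAtParserWarning
# soup = BeautifulSoup(html, "html.parser")  # built-in alternative, no extra install
print(soup.div.text)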

Example: fetch and parse the contents of the first issue of Wanqu Daily (湾区日报)

URL:https://wanqu.co/issues/1

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup

def get_wanqu_article(issue):
    # Fetch the HTML page of issue number `issue` of Wanqu Daily (wanqu.co)
    print("Fetching articles of issue " + str(issue))
    url = "https://wanqu.co/issues/" + str(issue)
    r = requests.get(url)
    r.encoding = "UTF-8"
    if r.status_code == 200:
        html = r.text
    else:
        print("Failed to fetch Wanqu Daily issue " + str(issue))
        html = ''

    '''
    Parse the fetched HTML and output the article entries:
    - Wanqu Daily issue: 2014/08/06 第1期
        - Article title: StackOverflow: 25台服务器,每月560,000,000 page views
        - Article source link (原链): highscalability.com
        - Commentary: the Wanqu Daily author's notes
    '''

    if len(html) > 0:
        soup = BeautifulSoup(html,"lxml")

        # Get the issue title
        tags = soup.find_all('h1', class_="wq-header")
        title = tags[0].text

        # Collect the wanqu.co link of every article on this page
        '''
        <a href="https://wanqu.co/a/9/stackoverflow-25台服务器每月560000000-page-views/"
            style="color: #000;" title="StackOverflow: 25台服务器,每月560,000,000 page views">
            <h2 class="wq-header" style="margin-bottom:4px;">StackOverflow: 25台服务器,每月560,000,000
                page views</h2>
        </a>
        '''
        tags = soup.find_all('a')
        article_url_list = []
        for tag in tags:
            tags2 = tag.find_all('h2', class_="wq-header")
            if len(tags2) > 0:
                article_url_list.append(tag.get('href'))

        # Fetch and parse each article
        article_list = []
        for url in article_url_list:
            r = requests.get(url)
            r.encoding = "UTF-8"
            if r.status_code == 200:
                article_html = r.text
            else:
                print("Failed to fetch article html: " + url)
                article_html = ''
            if len(article_html) > 0:
                # parse_article_html returns [title, url, lead]
                article_info = parse_article_html(article_html)
                article_list.append(article_info)
        return [title,article_list]
    else:
        return []

def parse_article_html(article_html):
    soup = BeautifulSoup(article_html,"lxml")
    # Get the article title
    tags = soup.find_all('h1', class_="wq-header")
    title = tags[0].text

    # Get the original link: find the <a> whose text contains "原链"
    # (the page's label for the source link) and strip the utm_source suffix
    tags = soup.find_all('a')
    url = ''  # fall back to an empty string if no 原链 link is found
    for i in range(len(tags)):
        if "原链" in tags[i].text:
            url = tags[i].get('href')
            url = url[:url.find("?utm_source")]
            break

    # Get the author's commentary
    tags = soup.find_all('div', class_="lead")
    lead = tags[0].text.strip()

    return [title,url,lead]


if __name__ == "__main__":
    info = get_wanqu_article(1)
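
The list returned by get_wanqu_article pairs the issue title with one [title, url, lead] triple per article. The __main__ block above could print it along these lines (the layout below is only illustrative):

    info = get_wanqu_article(1)
    if info:
        issue_title, article_list = info
        print(issue_title)  # e.g. "2014/08/06 第1期"
        for title, url, lead in article_list:
            print("Title: " + title)
            print("Source: " + url)
            print("Commentary: " + lead)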

Example: a script to fetch the Bing daily wallpapers

This script fetches the images from https://bing.ioliu.cn/ and saves them to a specified local directory.

# -*- coding: utf-8 -*-
# Fetch Bing's daily wallpapers.
# The official Bing API only covers the last 7 days; see https://blog.csdn.net/m0_37682004/article/details/82314055
# This script fetches the images from https://bing.ioliu.cn/ instead.

import requests
# conda install beautifulsoup4 lxml
from bs4 import BeautifulSoup
import os
from PIL import Image  # conda install Pillow

def get_all_file_names(local_dir):
    # Return a list with the names of all files under local_dir
    pic_file_list = []
    for root, dirs, files in os.walk(local_dir):
        for name in files:
            pic_file_list.append(name)
    return pic_file_list

def get_html_content(pageNum):
    # Page URL format: https://bing.ioliu.cn/?p=1, the first page is p=1
    url = "https://bing.ioliu.cn/?p=" + str(pageNum)
    print("Fetching HTML of page " + str(pageNum))
    html = requests.get(url)
    return html.content

def get_pic_url_list(htmlContent):
    # Extract the image URLs from the HTML
    '''
<img class="progressive__img progressive--is-loaded" 
src="http://h1.ioliu.cn/bing/BubbleNebula_ZH-CN2787112807_1920x1080.jpg" 
data-progressive="http://h1.ioliu.cn/bing/BubbleNebula_ZH-CN2787112807_1920x1080.jpg">
    '''
    soup = BeautifulSoup(htmlContent, 'lxml')
    pics = soup.find_all('img')
    url_list = []
    for pic in pics:
        if 'src' in pic.attrs:
            url_list.append(pic.attrs['src'])
    return url_list

def save_bing_pics(pageNumStart,pageNumEnd,local_dir):
    pic_file_list = get_all_file_names(local_dir)
    for pageNum in range(pageNumStart,pageNumEnd +1):
        htmlContent = get_html_content(pageNum)
        url_list = get_pic_url_list(htmlContent)
        for pic_url in url_list:
            pic_name = pic_url.split('/')[-1]
            if pic_name not in pic_file_list:
                pic_file_list.append(pic_name)
                print("获取图片:" + pic_name)
                html = requests.get(pic_url)
                with open(os.path.join(local_dir,pic_name),'wb') as file:
                    print("保存图片:" + pic_name)
                    file.write(html.content)
            else:
                print("图片已存在:" + pic_name)

def clear_pic_files(width,height,local_dir):
    # Delete image files whose resolution is smaller than width * height
    fileNameListToDel = []
    all_file_names = get_all_file_names(local_dir)
    for file_name in all_file_names:
        with Image.open(os.path.join(local_dir,file_name)) as imgFile:
            # width, height
            #print("Image " + file_name + ": " + str(imgFile.width) + "*" + str(imgFile.height))
            if (imgFile.width < width) or (imgFile.height < height):
                fileNameListToDel.append(file_name)
    # Delete the selected files
    for file_name in fileNameListToDel:
        os.remove(os.path.join(local_dir,file_name))

if __name__ == "__main__":
    local_dir = "D:/壁纸/bing"
    save_bing_pics(1, 4, local_dir)
    clear_pic_files(1920, 1080, local_dir)
