Python爬虫-爬取华夏基金代码

最新推荐文章于 2021-03-01 05:21:52 发布
原创最新推荐文章于 2021-03-01 05:21:52 发布 · 469 阅读
本内容遵循CC 4.0 BY-SA版权协议
该博客主要围绕Python爬虫展开，其核心功能是爬取华夏基金的基金代码等列表数据，属于爬虫技术在信息技术领域的应用。
import re
import requests

#创建一个华夏基金类：
class Fund():
    """Scrape the ChinaAMC (华夏基金) product page and print its four fund tables.

    Constructing an instance immediately calls :meth:`run`, which downloads
    the page and hands the HTML to one parser per table (element ids ``tb``,
    ``tb1``, ``tb2`` and ``tb3``).  Each parser prints one dict per fund row
    and also returns those dicts as a list, so the data can be reused
    instead of only being read off the console.
    """

    # Seconds to wait for the server. requests waits indefinitely when no
    # timeout is given, which would hang the whole script on a stalled host.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.run()

    def run(self):
        """Download the fund-list page and parse each of its four tables.

        Raises:
            requests.RequestException: if the request times out, fails, or
                the server answers with an HTTP error status.
        """
        base_url = "http://fund.chinaamc.com/portal/cn/include/newproducthome.jsp"
        response = requests.get(base_url, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        html = response.text
        # One parser per fund table on the page.
        self.get_data_tb(html)
        self.get_data_tb1(html)
        self.get_data_tb2(html)
        self.get_data_tb3(html)

    @staticmethod
    def _table_rows(table_pattern, html):
        """Return ``(table_html, rows)`` for the first table matching the pattern.

        ``table_html`` is the inner HTML captured by *table_pattern* and
        ``rows`` the bodies of its ``<tr align="center">`` elements.  When no
        table matches, ``(None, [])`` is returned so callers never index an
        empty result list.
        """
        tables = re.findall(table_pattern, html, re.S)
        if not tables:
            return None, []
        table_html = tables[0]
        rows = re.findall('<tr align="center"(.*?)</tr>', table_html, re.S)
        return table_html, rows

    # Table 1
    def get_data_tb(self, html):
        """Print and return the funds listed in the first table (id ``tb``).

        The dict keys are the header texts scraped from the table itself.

        Args:
            html: Full HTML text of the fund-list page.

        Returns:
            A list with one dict per fund row.  It is empty when the table
            is missing or exposes fewer than the ten expected headers; rows
            with fewer than ten cells are skipped.
        """
        print('---------------------打印第一张表-------------------------')
        records = []
        table, rows = self._table_rows('<table width="100%" border="0" cellspacing="0" cellpadding="0" style="margin-bottom:10px;border-bottom:1px solid #eee;" id="tb">(.*?)</table>', html)
        if table is None:
            return records
        fund_key = re.findall('<span class="p16_libe">(.*?)</span>', table, re.S)
        if len(fund_key) < 10:
            # Page layout changed: the ten headers used as keys are not there.
            return records
        for row in rows:
            # Fund name comes from the link's title attribute.
            title = re.findall('title="(.*?)" target=', row)
            other = re.findall('<td height="30">(.*?)</td>', row)
            # The header row yields no title/cells; malformed rows are too short.
            if not title or len(other) < 10:
                continue
            if other[5] == '---':
                # '---' is the page's placeholder for a missing change value.
                other[5] = ''
            # Columns in order: short name, code, NAV date, NAV, cumulative
            # NAV, change, inception date, subscription / redemption /
            # regular-investment status.
            record = dict(zip(fund_key[:10], [title[0]] + other[1:10]))
            print(record)
            records.append(record)
        return records

    # Table 2
    def get_data_tb1(self, html):
        """Print and return the funds listed in the second table (id ``tb1``).

        Args:
            html: Full HTML text of the fund-list page.

        Returns:
            A list with one dict per fund row, keyed by fixed Chinese column
            names.  It is empty when the table is missing; rows with fewer
            than twelve cells are skipped.
        """
        print('---------------------打印第二张表-------------------------')
        records = []
        table, rows = self._table_rows('<table width="100%" border="0" cellspacing="0" cellpadding="0" style="margin-bottom:10px; border-bottom:1px solid #eee;border-top:1px solid #eee;" id="tb1">(.*?)</table>', html)
        if table is None:
            return records
        for row in rows:
            title2 = re.findall('title="(.*?)"', row)
            # re.S: cells in this table may span several lines.
            other2 = re.findall('<td height="30">(.*?)</td>', row, re.S)
            if not title2 or len(other2) < 12:
                continue
            record = {
                '基金简称': title2[0],
                '基金代码': other2[2],
                '净值日期': other2[3],
                '百万盘收益': other2[4],
                '七日年收益': other2[5],
                '最近30天的年化': other2[6],
                '获取今年以来的年化': other2[7],
                '成立日期': other2[8],
                '申购状态': other2[9],
                '赎回状态': other2[10],
                '定投状态': other2[11],
            }
            print(record)
            records.append(record)
        return records

    # Table 3
    def get_data_tb2(self, html):
        """Print and return the funds listed in the third table (id ``tb2``).

        Args:
            html: Full HTML text of the fund-list page.

        Returns:
            A list with one dict per fund row, keyed by fixed Chinese column
            names.  It is empty when the table is missing; rows with fewer
            than ten cells are skipped.
        """
        print('---------------------打印第三张表-------------------------')
        records = []
        table, rows = self._table_rows('<table width="100%" border="0" cellspacing="0" cellpadding="0" style="margin-bottom:10px;border-bottom:1px solid #eee; border-top:1px solid #eee;" id="tb2">(.*?)</table>', html)
        if table is None:
            return records
        for row in rows:
            name = re.findall('title="(.*?)"', row)
            other = re.findall('<td height="30">(.*?)</td>', row)
            if not name or len(other) < 10:
                continue
            record = {
                '基金简称': name[0],
                '基金代码': other[2],
                '净值日期': other[3],
                '万盘收益': other[4],
                '七日年收益': other[5],
                # No cell is read for this column; it is always emitted empty.
                '运作期年化收益': '',
                '成立日期': other[6],
                '申购状态': other[7],
                '赎回状态': other[8],
                '定投状态': other[9],
            }
            print(record)
            records.append(record)
        return records

    # Table 4
    def get_data_tb3(self, html):
        """Print and return the funds listed in the fourth table (id ``tb3``).

        The scraped header list is printed first, then one dict per fund
        row keyed by those headers.

        Args:
            html: Full HTML text of the fund-list page.

        Returns:
            A list with one dict per fund row.  It is empty when the table
            is missing or exposes fewer than the eight expected headers;
            rows with fewer than nine cells are skipped.
        """
        print('---------------------打印第四张表-------------------------')
        records = []
        table, rows = self._table_rows('<table width="100%" border="0" cellspacing="0" cellpadding="0" style="border-top:1px solid #eee;border-bottom:1px solid #eee;margin-bottom:10px" id="tb3">(.*?)</table>', html)
        if table is None:
            return records
        fund4_key = re.findall('<span class="p16_libe">(.*?)</span>', table, re.S)
        print(fund4_key)
        if len(fund4_key) < 8:
            # Page layout changed: the eight headers used as keys are not there.
            return records
        for row in rows:
            name = re.findall('title="(.*?)"', row)
            other4 = re.findall('<td height="30">(.*?)</td>', row)
            if not name or len(other4) < 9:
                continue
            # '--' and '---' are the page's placeholders for missing values.
            if other4[4] == '--':
                other4[4] = ''
            if other4[5] == '--':
                other4[5] = ''
            if other4[8] == '---':
                other4[8] = ''
            record = dict(zip(fund4_key[:8], [name[0]] + other4[2:9]))
            print(record)
            records.append(record)
        return records
if __name__ == "__main__":
    # Instantiating Fund is enough: its __init__ calls run(), which does the scrape.
    Fund()