feat:🌱 add fund base information crawler

main
jackluson 4 years ago
parent 1519e5bac3
commit adf2ddddc6

@ -0,0 +1,96 @@
'''
Desc: 获取晨星单支基金的基础信息 -- 代码名称分类基金公司成立时间等这些基金一成立都不会变动的信息
File: /acquire_fund_base.py
Project: src
File Created: Monday, 8th March 2021 5:31:50 pm
Author: luxuemin2108@gmail.com
-----
Copyright (c) 2020 Camel Lu
'''
import math
from utils import parse_cookiestr, set_cookies, login_site
from fund_info_crawler import FundInfo
from lib.mysnowflake import IdWorker
import pymysql
# Module-level MySQL connection and cursor, shared by the crawl loop below.
# NOTE(review): credentials are hard-coded placeholders ('xxxx') — load them
# from config/environment before committing real secrets.
connect = pymysql.connect(host='127.0.0.1', user='root',
password='xxxx', db='fund_work', charset='utf8')
cursor = connect.cursor()
def login():
    """Start a Chrome session and sign in to Morningstar.

    Returns:
        The authenticated selenium webdriver instance.

    Raises:
        SystemExit: when the sign-in fails; the browser is shut down first.
    """
    # Imported lazily so importing this module does not require selenium.
    from selenium import webdriver
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--no-sandbox")
    chrome_driver = webdriver.Chrome(options=chrome_options)
    # NOTE(review): 12000 is in seconds (~3.3 h) — presumably meant to be a
    # very generous page-load timeout; confirm the intended unit.
    chrome_driver.set_page_load_timeout(12000)
    login_url = 'https://www.morningstar.cn/membership/signin.aspx'
    login_status = login_site(chrome_driver, login_url)
    if login_status:
        print('login success')
        return chrome_driver
    print('login fail')
    # BUG FIX: release the browser before aborting (the original leaked the
    # chromedriver process) and raise SystemExit(1) instead of the site-module
    # exit() helper, which is not guaranteed to exist and exited with status 0.
    chrome_driver.quit()
    raise SystemExit(1)
if __name__ == '__main__':
    chrome_driver = login()
    morning_cookies = chrome_driver.get_cookies()
    # Count how many snapshot rows are crawlable (both codes present).
    sql_count = "SELECT count(*) FROM fund_morning_snapshot WHERE fund_code IS NOT NULL AND morning_star_code IS NOT NULL"
    cursor.execute(sql_count)
    count = cursor.fetchone()  # single row: (total,)
    print('count', count[0])
    # BUG FIX: the original rebound the name `IdWorker` to an instance,
    # shadowing the imported class; keep the instance under its own name.
    id_worker = IdWorker()
    page_limit = 10  # rows fetched per DB page
    record_total = count[0]
    page_start = 3890  # resume offset — earlier rows were already crawled
    error_funds = ['005086']  # seed with a known-bad fund code
    try:
        while page_start < record_total:
            sql = "SELECT fund_code, morning_star_code, fund_name FROM fund_morning_snapshot WHERE fund_code IS NOT NULL AND morning_star_code IS NOT NULL ORDER BY fund_code LIMIT %s, %s"
            cursor.execute(sql, [page_start, page_limit])
            results = cursor.fetchall()
            for fund_code, morning_star_code, fund_name in results:
                each_fund = FundInfo(
                    fund_code, morning_star_code, fund_name, chrome_driver, morning_cookies)
                # Scrape the static details from the Morningstar detail page.
                is_normal = each_fund.get_fund_detail_info()
                # Skip funds whose page failed or that lack an inception date.
                if is_normal is False or each_fund.found_date == '-':
                    error_funds.append(each_fund.fund_code)
                    continue
                base_dict = {
                    'id': id_worker.get_id(),  # snowflake primary key
                    'fund_code': each_fund.fund_code,
                    'morning_star_code': each_fund.morning_star_code,
                    'fund_name': each_fund.fund_name,
                    'fund_cat': each_fund.fund_cat,
                    'company': each_fund.company,
                    'found_date': each_fund.found_date
                }
                keys = ','.join(base_dict.keys())
                values = ','.join(['%s'] * len(base_dict))
                # Upsert: refresh every column except the immutable id/code.
                update_values = ','.join(
                    '{0}=VALUES({0})'.format(key)
                    for key in base_dict if key not in ('id', 'fund_code'))
                base_sql_insert = "INSERT INTO {table} ({keys}) VALUES ({values}) ON DUPLICATE KEY UPDATE {update_values}; ".format(
                    table='fund_morning_base',
                    keys=keys,
                    values=values,
                    update_values=update_values
                )
                # Values are bound as parameters; only column names are
                # interpolated, and those come from the literal dict above.
                cursor.execute(base_sql_insert, tuple(base_dict.values()))
                connect.commit()
            page_start += page_limit
            print('page_start', page_start)
    finally:
        # BUG FIX: quit() ends the whole browser session and the chromedriver
        # process; the original close() only closed the current window and
        # never released the DB handles, even on error.
        chrome_driver.quit()
        cursor.close()
        connect.close()
    print('error_funds', error_funds)

@ -0,0 +1,65 @@
'''
Desc: 爬取基金详情页信息
File: /fund_info_crawler.py
Project: src
File Created: Monday, 8th March 2021 5:43:27 pm
Author: luxuemin2108@gmail.com
-----
Copyright (c) 2020 Camel Lu
'''
import re
from time import sleep
from IOFile import crawl_html
from bs4 import BeautifulSoup
from utils import parse_cookiestr, set_cookies, login_site
class FundInfo:
# 初始化定义,利用基金代码、基金名称进行唯一化
def __init__(self, code, namecode, name, chrome_driver, morning_cookies):
self.season_number = '2021-1'
self.fund_code = code # 基金代码,需要初始化赋值
self.fund_name = name # 基金名称,需要初始化赋值
self.morning_star_code = namecode # 基金编码,晨星网特有,需要建立索引表
self.morning_cookies = morning_cookies or None
self.chrome_driver = chrome_driver or None
# 通过晨星网获取
self.fund_cat = None # 基金分类
self.found_date = None # 成立时间
self.total_asset = None # 总资产
self.investname_style = None # 投资风格
self.manager = dict() # 基金经理,name,id,管理时间
self.company = None # 基金公司
self.three_month_retracement = 0.0 # 三个月最大回撤
self.bond_total_position = dict() # 债券总仓位、前五大持仓
self.stock_total_position = dict() # 股票总仓位、前十大持仓
self.ten_top_stock_list = [] # 股票十大持仓股信息
self.risk_assessment = dict() # 标准差 风险系数 夏普比
self.risk_statistics = dict() # 阿尔法 贝塔 R平方值
# 处理基金详情页跳转
def go_fund_url(self, cookie_str=None):
    """Navigate the shared driver to this fund's Morningstar quicktake page.

    Args:
        cookie_str: optional raw cookie string forwarded to the login helper.

    Returns:
        False when Morningstar redirects to its generic error page,
        True otherwise. (BUG FIX: the original fell off the end and
        returned None on success, which is ambiguous to callers that
        compare the result against False.)
    """
    self.login_morning_star(cookie_str)
    morning_fund_selector_url = "https://www.morningstar.cn/quicktake/" + \
        self.morning_star_code
    self.chrome_driver.get(morning_fund_selector_url)  # open the detail page
    # Fixed wait for the client-side rendering to settle.
    sleep(6)
    # A redirect to the generic error page means the fund id is bad/blocked.
    if self.chrome_driver.current_url == 'https://www.morningstar.cn/errors/defaulterror.html':
        return False
    return True
def get_fund_base_info(self, cookie_str=None):
    """Scrape static fund facts (category, inception date, company) from the
    already-loaded Morningstar detail page into instance attributes.

    NOTE(review): `cookie_str` is unused in the portion visible here — the
    method may continue beyond this excerpt; confirm against the full file.
    """
    # Fund category, from the base-info panel.
    self.fund_cat = self.chrome_driver.find_element_by_id(
    'qt_base').find_element_by_class_name("category").text
    # Inception date.
    self.found_date = self.chrome_driver.find_element_by_id(
    'qt_base').find_element_by_class_name("inception").text
    # Fund management company — taken from the 4th <li> of the management list.
    self.company = self.chrome_driver.find_element_by_id(
    'qt_management').find_element_by_xpath("//ul[@id='qt_management']/li[4]/span[@class='col2 comp']/a").text
Loading…
Cancel
Save