wip: add quarter compare

main
jackluson 3 years ago
parent a3042763ee
commit 8e3cff4068

@ -29,6 +29,7 @@ def login():
from selenium import webdriver
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('--headless')
chrome_driver = webdriver.Chrome(options=chrome_options)
chrome_driver.set_page_load_timeout(12000)
login_url = 'https://www.morningstar.cn/membership/signin.aspx'
@ -68,6 +69,8 @@ if __name__ == '__main__':
WHERE fund_morning_base.fund_cat NOT LIKE '%%货币%%' \
AND fund_morning_base.fund_cat NOT LIKE '%%纯债基金%%' \
AND fund_morning_base.fund_cat NOT LIKE '目标日期' \
AND fund_morning_base.fund_name NOT LIKE '%%C' \
AND fund_morning_base.fund_name NOT LIKE '%%B' \
AND fund_morning_base.fund_cat NOT LIKE '%%短债基金%%'"
cursor.execute(sql_count)
count = cursor.fetchone() # 获取记录条数
@ -101,7 +104,7 @@ if __name__ == '__main__':
chrome_driver = login()
morning_cookies = chrome_driver.get_cookies()
page_start = start
page_limit = 10
page_limit = 1
while(page_start < end):
sql = "SELECT t.fund_code,\
t.morning_star_code, t.fund_name, t.fund_cat \
@ -111,6 +114,8 @@ if __name__ == '__main__':
AND t.fund_cat NOT LIKE '%%纯债基金%%' \
AND t.fund_cat NOT LIKE '目标日期' \
AND t.fund_cat NOT LIKE '%%短债基金%%' \
AND t.fund_name NOT LIKE '%%C' \
AND t.fund_name NOT LIKE '%%B' \
ORDER BY f.fund_rating_5 DESC,f.fund_rating_3 DESC, \
t.fund_cat, t.fund_code LIMIT %s, %s"
lock.acquire()
@ -137,10 +142,29 @@ if __name__ == '__main__':
lock.release()
continue
# 开始爬取数据
each_fund.get_fund_season_info() # 基本数据
each_fund.get_fund_manager_info() # 基金经理模块
each_fund.get_fund_morning_rating() # 基金晨星评级
each_fund.get_fund_qt_rating() # 基金风险评级
quarter_index = each_fund.get_quarter_index() # 数据更新时间
if quarter_index == each_fund.quarter_index:
print('quarter_index', quarter_index)
if each_fund.fund_name.endswith('A'):
print('fund_name', each_fund.fund_name[0:-1])
similar_name = each_fund.fund_name[0:-1]
sql_similar = "SELECT t.fund_code,\
t.morning_star_code, t.fund_name \
FROM fund_morning_base as t \
LEFT JOIN fund_morning_snapshot as f ON f.fund_code = t.fund_code \
WHERE t.fund_name LIKE %s \
AND t.fund_name NOT LIKE '%%A';"
lock.acquire()
cursor.execute(
sql_similar, [similar_name + '%']) # 执行sql语句
results = cursor.fetchall() # 获取查询的所有记录
print('results', results)
lock.release()
# each_fund.get_fund_season_info() # 基本数据
# each_fund.get_fund_manager_info() # 基金经理模块
# each_fund.get_fund_morning_rating() # 基金晨星评级
# each_fund.get_fund_qt_rating() # 基金风险评级
continue
# 判断是否有股票持仓,有则爬取
if each_fund.stock_position['total'] != '0.00' and each_fund.total_asset != None:
each_fund.get_asset_composition_info()
@ -171,9 +195,9 @@ if __name__ == '__main__':
manager_sql_insert = generate_insert_sql(
manager_dict, 'fund_morning_manager', ['id', 'manager_id', 'name'])
lock.acquire()
cursor.execute(manager_sql_insert,
tuple(manager_dict.values()))
connect_instance.commit()
# cursor.execute(manager_sql_insert,
# tuple(manager_dict.values()))
# connect_instance.commit()
lock.release()
# 季度信息 TODO: 对比数据更新时间field
season_dict = {
@ -207,9 +231,9 @@ if __name__ == '__main__':
season_sql_insert = generate_insert_sql(
season_dict, 'fund_morning_season', ['id', 'quarter_index', 'fund_code'])
lock.acquire()
cursor.execute(season_sql_insert,
tuple(season_dict.values()))
connect_instance.commit()
# cursor.execute(season_sql_insert,
# tuple(season_dict.values()))
# connect_instance.commit()
lock.release()
# 入库十大股票持仓
stock_position_total = each_fund.stock_position.get(
@ -236,9 +260,9 @@ if __name__ == '__main__':
stock_dict, 'fund_morning_stock_info', ['id', 'quarter_index', 'fund_code'])
lock.acquire()
# print('stock_sql_insert', stock_sql_insert)
cursor.execute(stock_sql_insert,
tuple(stock_dict.values()))
connect_instance.commit()
# cursor.execute(stock_sql_insert,
# tuple(stock_dict.values()))
# connect_instance.commit()
lock.release()
# pprint(fundDict)
page_start = page_start + page_limit
@ -247,7 +271,7 @@ if __name__ == '__main__':
chrome_driver.close()
threaders = []
start = time()
step_num = 2500
step_num = 1
# steps = [{
# "start": 800,
# "end": 2500
@ -261,7 +285,7 @@ if __name__ == '__main__':
# "start": 8300,
# "end": record_total
# }]
for i in range(4):
for i in range(1):
skip_num = 100
# print(i * step_num + skip_num, (i+1) * step_num)
# start = steps[i]['start']
@ -275,6 +299,6 @@ if __name__ == '__main__':
for threader in threaders:
threader.join()
stop = time()
print('run time is %s' % (stop-start))
print('run time is %s' % (stop - start))
print('error_funds', error_funds)
exit()

@ -9,7 +9,6 @@ def connect():
env_db_host = os.getenv('db_host')
env_db_name = os.getenv('db_name')
env_db_user = os.getenv('db_user')
print('env_db_user', env_db_user)
env_db_password = os.getenv('db_password')
connect = pymysql.connect(
host=env_db_host, user=env_db_user, password=env_db_password, db=env_db_name, charset='utf8')

@ -11,14 +11,15 @@ Copyright (c) 2020 Camel Lu
import re
from time import sleep
from bs4 import BeautifulSoup
from utils import parse_cookiestr, set_cookies, login_site, get_star_count
from utils import parse_cookiestr, set_cookies, login_site, get_star_count, get_season_index
from selenium.common.exceptions import NoSuchElementException
class FundSpider:
# 初始化定义,利用基金代码、基金名称进行唯一化
def __init__(self, code, namecode, name, chrome_driver, morning_cookies):
self.quarter_index = '2021-q1' # TODO: get quarter_index by current time
def __init__(self, code, namecode, name, chrome_driver, morning_cookies):
self.quarter_index = '2021-Q1' # TODO: get quarter_index by current time
self.update_date = None # 数据更新时间,默认取资产配置更新时间
self.fund_code = code # 基金代码,需要初始化赋值
self.fund_name = name # 基金名称,需要初始化赋值
self.morning_star_code = namecode # 基金编码,晨星网特有,需要建立索引表
@ -56,6 +57,7 @@ class FundSpider:
from selenium import webdriver
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--no-sandbox")
# chrome_options.add_argument('--headless')
# _chrome_driver = webdriver.Chrome("/usr/local/chromedriver")
self._chrome_driver = webdriver.Chrome(options=chrome_options)
self._chrome_driver.set_page_load_timeout(12000)
@ -174,7 +176,7 @@ class FundSpider:
manager_name = manager_ele.find_element_by_xpath(
"li[@class='col1']/a").text
manager_id = re.findall(
r'(?<=managerid=)(\w+)$', manager_ele.find_element_by_xpath("li[@class='col1']/a").get_attribute('href')).pop(0)
r'(?<=managerid=)(\w+)$', manager_ele.find_element_by_xpath("li[@class='col1']/a").get_attribute('href')).pop(0)
manager_start_date = manager_ele.find_element_by_xpath(
"li[@class='col1']/i").text[0:10]
manager_brife = manager_ele.find_element_by_xpath(
@ -328,8 +330,17 @@ class FundSpider:
temp_stock_info['stock_code'] = stock_base[0]
temp_stock_info['stock_market'] = None if len(
stock_base) == 1 else stock_base.pop()
temp_stock_info['stock_name'] = li_elements[index+1].text
temp_stock_info['stock_name'] = li_elements[index + 1].text
# temp_stock_info['stock_value'] = li_elements[index+2].text
temp_stock_info['stock_portion'] = li_elements[index +
3].text if li_elements[index+3].text != '-' else None
3].text if li_elements[index + 3].text != '-' else None
self.ten_top_stock_list.append(temp_stock_info)
def get_quarter_index(self):
    """Read the page's data-update date and derive a quarter label from it.

    Stores the text of the "date4" element (looked up under 'aspnetForm')
    in ``self.update_date``, splits that text once on the first '-', and
    returns ``<head>-Q<n>`` where ``n`` is whatever ``get_season_index``
    yields for the remainder.

    NOTE(review): presumably the date text looks like 'YYYY-MM-DD', so the
    result has the same shape as ``self.quarter_index`` (e.g. '2021-Q1') --
    confirm against the page and ``get_season_index``.
    """
    # Keep the raw date text on the instance as the data update time.
    self.update_date = self.get_element_text_by_class_name(
        "date4", 'aspnetForm')
    # Only the first '-' is a separator; everything after it stays together.
    date_parts = self.update_date.split('-', 1)
    season_number = get_season_index(date_parts[1])
    quarter_label = date_parts[0] + '-Q' + str(season_number)
    print("self.update_date", quarter_label)
    return quarter_label

@ -90,7 +90,7 @@ def get_star_count(morning_star_url):
# path = './assets/star/star'
for i in range(6):
p1 = np.array(Image.open(path+str(i)+'.gif'))
p1 = np.array(Image.open(path + str(i) + '.gif'))
p2 = np.array(Image.open(temp_star_url))
if (p1 == p2).all():
return i

Loading…
Cancel
Save