diff --git a/main.py b/main.py index f6ec18b..5d25eca 100644 --- a/main.py +++ b/main.py @@ -33,28 +33,28 @@ def main(): 9.“股票持仓基金汇总”\n \ 10.“高分基金”\n \ 输入:") - if input_value == '1' or input_value == '快照': - page_index = 486 + if input_value == '1': + page_index = 0 get_fund_list(page_index) # 执行申万行业信息入库 - elif input_value == '2' or input_value == '新基入库': + elif input_value == '2': acquire_fund_base() # 执行行业股票信息入库 - elif input_value == '3' or input_value == "季度信息": + elif input_value == '3': acquire_fund_quarter() - elif input_value == '4' or input_value == "基金状态归档": + elif input_value == '4': fund_supplement = FundSupplement() # 补充基金清算维度信息 fund_supplement.update_archive_status() - elif input_value == '5' or input_value == "组合持仓明细": + elif input_value == '5': get_special_fund_code_holder_stock_detail() - elif input_value == '6' or input_value == "基金持仓股排名": + elif input_value == '6': all_stocks_rank() - elif input_value == '7' or input_value == "基金重仓股Top100": + elif input_value == '7': t100_stocks_rank() - elif input_value == '8' or input_value == "股票持仓基金明细": + elif input_value == '8': all_stock_holder_detail() - elif input_value == '9' or input_value == "股票持仓基金汇总": + elif input_value == '9': calculate_quarter_fund_count() - elif input_value == '10' or input_value == "高分基金": + elif input_value == '10': output_high_score_funds() else: print('输入有误') diff --git a/requirements.txt b/requirements.txt index 9e50ee2..06f3f34 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,3 +10,5 @@ Pillow==8.3.1 python-dotenv==0.19.0 cryptography==37.0.4 lxml==4.9.1 +scikit-image==0.19.3 +sewar==0.4.5 diff --git a/src/acquire_fund_quarter.py b/src/acquire_fund_quarter.py index d2bf2ba..eba4f35 100644 --- a/src/acquire_fund_quarter.py +++ b/src/acquire_fund_quarter.py @@ -10,7 +10,7 @@ Copyright (c) 2020 Camel Lu ''' from threading import Lock, current_thread -from time import sleep +from time import sleep, time from pprint import pprint from fund_info.crawler import FundSpider from fund_info.api import FundApier @@ -39,8 +39,7 @@ def get_total_asset(fund_code, platform): def acquire_fund_quarter(): lock = Lock() each_fund_query = FundQuery() - record_total = each_fund_query.select_quarter_fund_total() # 获取记录条数 - print('record_total', record_total) + idWorker = IdWorker() result_dir = './output/' fund_csv = FundCSV(result_dir) @@ -52,153 +51,199 @@ def acquire_fund_quarter(): chrome_driver = login_morning_star(login_url, False) page_start = start page_limit = 10 - while(page_start < end): - results = each_fund_query.select_quarter_fund( - page_start, page_limit) - for record in results: - sleep(1) - # 0P000179WG - # 001811 中欧明睿新常态混合A - each_fund = FundSpider( - record[0], record[1], record[2], chrome_driver) - is_error_page = each_fund.go_fund_url() - # 是否能正常跳转到基金详情页,没有的话,写入csv,退出当前循环 - if is_error_page == True: - # error_funds.append(each_fund.fund_code) - fund_infos = [each_fund.fund_code, each_fund.morning_star_code, - each_fund.fund_name, record[3], page_start, '页面跳转有问题'] - output_line = ', '.join(str(x) - for x in fund_infos) + '\n' - fund_csv.write_abnormal_url_fund(False, output_line) + try: + while(page_start < end): + results = each_fund_query.select_quarter_fund( + page_start, page_limit) + for record in results: + sleep(1) + # 0P000179WG + # 001811 中欧明睿新常态混合A + each_fund = FundSpider( + record[0], record[1], record[2], chrome_driver) + + each_fund.set_found_data(record[3]) + is_error_page = each_fund.go_fund_url() + # 是否能正常跳转到基金详情页,没有的话,写入csv,退出当前循环 + if is_error_page == True: + # error_funds.append(each_fund.fund_code) + fund_infos = [each_fund.fund_code, each_fund.morning_star_code, + each_fund.fund_name, record[3], page_start, '页面跳转有问题'] + output_line = ', '.join(str(x) + for x in fund_infos) + '\n' + fund_csv.write_abnormal_url_fund(False, output_line) - continue - # 开始爬取数据 - quarter_index = each_fund.get_quarter_index() # 数据更新时间,如果不一致,不爬取下面数据 - if quarter_index != each_fund.quarter_index: - print('quarter_index', quarter_index, each_fund.update_date, - each_fund.fund_code, each_fund.fund_name) - continue + continue + # 开始爬取数据 + quarter_index = each_fund.get_quarter_index() # 数据更新时间,如果不一致,不爬取下面数据 + if quarter_index != each_fund.quarter_index: + print('quarter_index', quarter_index, each_fund.update_date, + each_fund.fund_code, each_fund.fund_name) + continue - each_fund.get_fund_season_info() # 基本季度性数据 - each_fund.get_fund_manager_info() # 基金经理模块 - each_fund.get_fund_morning_rating() # 基金晨星评级 - each_fund.get_fund_qt_rating() # 基金风险评级 - # 判断是否有股票持仓,有则爬取 - if each_fund.stock_position['total'] != '0.00' and each_fund.total_asset != None: - each_fund.get_asset_composition_info() - # 爬取过程中是否有异常,有的话,存在csv中 - if each_fund._is_trigger_catch == True: - fund_infos = [each_fund.fund_code, each_fund.morning_star_code, - each_fund.fund_name, record[3], - each_fund.stock_position['total'], - page_start, each_fund._catch_detail] - output_line = ', '.join(str(x) - for x in fund_infos) + '\n' - fund_csv.write_season_catch_fund(False, output_line) - # 入库 - lock.acquire() - snow_flake_id = idWorker.get_id() - lock.release() - # 开始存入数据 - fund_insert = FundInsert() - # 基金经理 - first_manager_id = None - first_manager_start_date = None - for manager_item in each_fund.manager_list: - manager = Manager(**manager_item) - manager.upsert() - if first_manager_id == None: - first_manager_id = manager_item['manager_id'] - if first_manager_start_date == None: - first_manager_start_date = manager_item['manager_start_date'] - - manager_assoc_data = { - 'quarter_index': quarter_index, - 'manager_start_date': manager_item['manager_start_date'], - 'manager_id': manager_item['manager_id'], - 'fund_code': each_fund.fund_code - } - manager_assoc = ManagerAssoc(**manager_assoc_data) - manager_assoc.upsert() - # fund_insert.insert_fund_manger_info(manager_dict) - quarterly_dict = { - 'id': snow_flake_id, - 'quarter_index': each_fund.quarter_index, - 'fund_code': each_fund.fund_code, - 'investname_style': each_fund.investname_style, - 'total_asset': each_fund.total_asset, - 'manager_id': first_manager_id, # 暂时存第一个基金经理信息 - 'manager_start_date': first_manager_start_date, # 暂时存第一个基金经理信息 - 'three_month_retracement': each_fund.three_month_retracement, - 'june_month_retracement': each_fund.june_month_retracement, - 'risk_statistics_alpha': each_fund.risk_statistics.get('alpha'), - 'risk_statistics_beta': each_fund.risk_statistics.get('beta'), - 'risk_statistics_r_square': each_fund.risk_statistics.get('r_square'), - 'risk_assessment_standard_deviation': each_fund.risk_assessment.get('standard_deviation'), - 'risk_assessment_risk_coefficient': each_fund.risk_assessment.get('risk_coefficient'), - 'risk_assessment_sharpby': each_fund.risk_assessment.get('sharpby'), - 'risk_rating_2': each_fund.risk_rating.get(2), - 'risk_rating_3': each_fund.risk_rating.get(3), - 'risk_rating_5': each_fund.risk_rating.get(5), - 'risk_rating_10': each_fund.risk_rating.get(10), - 'stock_position_total': each_fund.stock_position.get('total'), - 'stock_position_ten': each_fund.stock_position.get('ten'), - 'bond_position_total': each_fund.bond_position.get('total'), - 'bond_position_five': each_fund.bond_position.get('five'), - 'morning_star_rating_3': each_fund.morning_star_rating.get(3), - 'morning_star_rating_5': each_fund.morning_star_rating.get(5), - 'morning_star_rating_10': each_fund.morning_star_rating.get(10), - } - fund_insert.fund_quarterly_info(quarterly_dict) - # 入库十大股票持仓 - stock_position_total = each_fund.stock_position.get( - 'total', '0.00') - if float(stock_position_total) > 0: - stock_dict = { - 'id': snow_flake_id, + each_fund.get_fund_season_info() # 基本季度性数据 + each_fund.get_fund_manager_info() # 基金经理模块 + each_fund.get_fund_morning_rating() # 基金晨星评级 + each_fund.get_fund_qt_rating() # 基金风险评级 + # 判断是否有股票持仓,有则爬取 + if each_fund.stock_position['total'] != '0.00' and each_fund.total_asset != None: + each_fund.get_asset_composition_info() + # 爬取过程中是否有异常,有的话,存在csv中 + if each_fund._is_trigger_catch == True: + fund_infos = [each_fund.fund_code, each_fund.morning_star_code, + each_fund.fund_name, record[3], + each_fund.stock_position['total'], + page_start, each_fund._catch_detail] + output_line = ', '.join(str(x) + for x in fund_infos) + '\n' + fund_csv.write_season_catch_fund(False, output_line) + # 入库 + lock.acquire() + snow_flake_id = idWorker.get_id() + lock.release() + # 开始存入数据 + fund_insert = FundInsert() + # 基金经理 + first_manager_id = None + first_manager_start_date = None + for manager_item in each_fund.manager_list: + manager = Manager(**manager_item) + manager.upsert() + if first_manager_id == None: + first_manager_id = manager_item['manager_id'] + if first_manager_start_date == None: + first_manager_start_date = manager_item['manager_start_date'] + manager_assoc_data = { + 'quarter_index': quarter_index, + 'manager_start_date': manager_item['manager_start_date'], + 'manager_id': manager_item['manager_id'], + 'fund_code': each_fund.fund_code + } + manager_assoc = ManagerAssoc(**manager_assoc_data) + manager_assoc.upsert() + # fund_insert.insert_fund_manger_info(manager_dict) + init_total_asset = each_fund.total_asset + quarterly_dict = { + # 'id': snow_flake_id, 'quarter_index': each_fund.quarter_index, 'fund_code': each_fund.fund_code, + 'investname_style': each_fund.investname_style, + # 'total_asset': each_fund.total_asset, + 'manager_id': first_manager_id, # 暂时存第一个基金经理信息 + 'manager_start_date': first_manager_start_date, # 暂时存第一个基金经理信息 + 'three_month_retracement': each_fund.three_month_retracement, + 'june_month_retracement': each_fund.june_month_retracement, + 'risk_statistics_alpha': each_fund.risk_statistics.get('alpha'), + 'risk_statistics_beta': each_fund.risk_statistics.get('beta'), + 'risk_statistics_r_square': each_fund.risk_statistics.get('r_square'), + 'risk_assessment_standard_deviation': each_fund.risk_assessment.get('standard_deviation'), + 'risk_assessment_risk_coefficient': each_fund.risk_assessment.get('risk_coefficient'), + 'risk_assessment_sharpby': each_fund.risk_assessment.get('sharpby'), + 'risk_rating_2': each_fund.risk_rating.get(2), + 'risk_rating_3': each_fund.risk_rating.get(3), + 'risk_rating_5': each_fund.risk_rating.get(5), + 'risk_rating_10': each_fund.risk_rating.get(10), 'stock_position_total': each_fund.stock_position.get('total'), + 'stock_position_ten': each_fund.stock_position.get('ten'), + 'bond_position_total': each_fund.bond_position.get('total'), + 'bond_position_five': each_fund.bond_position.get('five'), + 'morning_star_rating_3': each_fund.morning_star_rating.get(3), + 'morning_star_rating_5': each_fund.morning_star_rating.get(5), + 'morning_star_rating_10': each_fund.morning_star_rating.get(10), } - for index in range(len(each_fund.ten_top_stock_list)): - temp_stock = each_fund.ten_top_stock_list[index] - prefix = 'top_stock_' + str(index) + '_' - code_key = prefix + 'code' - stock_dict[code_key] = temp_stock['stock_code'] - name_key = prefix + 'name' - stock_dict[name_key] = temp_stock['stock_name'] - portion_key = prefix + 'portion' - stock_dict[portion_key] = temp_stock['stock_portion'] - market_key = prefix + 'market' - stock_dict[market_key] = temp_stock['stock_market'] - fund_insert.fund_stock_info(stock_dict) - # 获取同类基金,再获取同类基金的总资产 - if each_fund.fund_name.endswith('A'): - similar_name = each_fund.fund_name[0:-1] - results = each_fund_query.select_similar_fund( - similar_name) # 获取查询的所有记录 - platform = 'zh_fund' if '封闭' in similar_name else 'ai_fund' - for i in range(0, len(results)): - item = results[i] - item_code = item[0] - total_asset = get_total_asset(item_code, platform) - quarterly_dict['fund_code'] = item_code - quarterly_dict['total_asset'] = total_asset - quarterly_dict['id'] = snow_flake_id + i + 1 - # 入库 - fund_insert.fund_quarterly_info(quarterly_dict) - if float(stock_position_total) > 0: - stock_dict['fund_code'] = item_code - stock_dict['id'] = snow_flake_id + i + 1 + + # 入库十大股票持仓 + stock_position_total = each_fund.stock_position.get( + 'total', '0.00') + if float(stock_position_total) > 0: + stock_dict = { + 'id': snow_flake_id, + 'quarter_index': each_fund.quarter_index, + 'fund_code': each_fund.fund_code, + 'stock_position_total': each_fund.stock_position.get('total'), + } + for index in range(len(each_fund.ten_top_stock_list)): + temp_stock = each_fund.ten_top_stock_list[index] + prefix = 'top_stock_' + str(index) + '_' + code_key = prefix + 'code' + stock_dict[code_key] = temp_stock['stock_code'] + name_key = prefix + 'name' + stock_dict[name_key] = temp_stock['stock_name'] + portion_key = prefix + 'portion' + stock_dict[portion_key] = temp_stock['stock_portion'] + market_key = prefix + 'market' + stock_dict[market_key] = temp_stock['stock_market'] + + # 获取同类基金,再获取同类基金的总资产 + if each_fund.fund_name.endswith('A') or each_fund.fund_name.endswith('B') or each_fund.fund_name.endswith('C'): + similar_name = each_fund.fund_name[0:-1] + results = each_fund_query.select_similar_fund( + similar_name) # 获取查询的所有记录 + platform = 'zh_fund' if '封闭' in similar_name else 'ai_fund' + for i in range(0, len(results)): + item = results[i] + item_code = item[0] + if item_code == each_fund.fund_code: + continue + print("item_code", item_code, platform ) + total_asset = get_total_asset(item_code, platform) + init_total_asset = init_total_asset - total_asset + manager_assoc_data = { + 'quarter_index': quarter_index, + 'manager_start_date': manager_item['manager_start_date'], + 'manager_id': manager_item['manager_id'], + 'fund_code': item_code + } + manager_assoc = ManagerAssoc(**manager_assoc_data) + manager_assoc.upsert() + quarterly_dict['fund_code'] = item_code + quarterly_dict['total_asset'] = total_asset + quarterly_dict['id'] = snow_flake_id + i + 1 # 入库 - fund_insert.fund_stock_info(stock_dict) - # pprint(fundDict) - page_start = page_start + page_limit - print(current_thread().getName(), 'page_start', page_start) - sleep(3) + fund_insert.fund_quarterly_info(quarterly_dict) + if float(stock_position_total) > 0: + stock_dict['fund_code'] = item_code + stock_dict['id'] = snow_flake_id + i + 1 + # 入库 + fund_insert.fund_stock_info(stock_dict) + quarterly_dict['fund_code'] = each_fund.fund_code + quarterly_dict['total_asset'] = init_total_asset + quarterly_dict['id'] = snow_flake_id + fund_insert.fund_quarterly_info(quarterly_dict) + if float(stock_position_total) > 0: + stock_dict['fund_code'] = each_fund.fund_code + stock_dict['id'] = snow_flake_id + fund_insert.fund_stock_info(stock_dict) + # pprint(fundDict) + page_start = page_start + page_limit + print(current_thread().getName(), 'page_start', page_start) + sleep(3) + except(BaseException): + chrome_driver.close() + raise BaseException chrome_driver.close() - thread_count = 1 - bootstrap_thread(crawlData, record_total, thread_count) + thread_count = 4 + + # for count in range(6): + total_start_time = time() + # record_total = each_fund_query.select_quarter_fund_total() # 获取记录条数 + # print("record_total", record_total) + # bootstrap_thread(crawlData, record_total, thread_count) + + for i in range(3): + print("i", i) + start_time = time() + record_total = each_fund_query.select_quarter_fund_total() # 获取记录条数 + print('record_total', record_total) + try: + bootstrap_thread(crawlData, record_total, thread_count) + except: + end_time = time() + print("耗时: {:.2f}秒".format(end_time - start_time)) + end_time = time() + print("耗时: {:.2f}秒".format(end_time - start_time)) + total_end_time = time() + print("total耗时: {:.2f}秒".format(total_end_time - total_start_time)) exit() if __name__ == '__main__': diff --git a/src/acquire_fund_snapshot.py b/src/acquire_fund_snapshot.py index 4e8eaa9..40574f0 100644 --- a/src/acquire_fund_snapshot.py +++ b/src/acquire_fund_snapshot.py @@ -28,15 +28,12 @@ from utils.login import login_morning_star connect_instance = connect() cursor = connect_instance.cursor() -''' -判读是否当前页一致,没有的话,切换上一页,下一页操作 -''' def text_to_be_present_in_element(locator, text, next_page_locator): """ An expectation for checking if the given text is present in the specified element. - locator, text + locator, text -- 判读是否当前页一致,没有的话,切换上一页,下一页操作 """ def _predicate(driver): try: @@ -63,7 +60,6 @@ def get_fund_list(page_index): page_count = 25 # 晨星固定分页数 page_total = math.ceil(int(chrome_driver.find_element_by_xpath( '/html/body/form/div[8]/div/div[4]/div[3]/div[2]/span').text) / page_count) - result_dir = './output/' output_head = '代码' + ',' + '晨星专属号' + ',' + '名称' + ',' + \ '类型' + ',' + '三年评级' + ',' + '五年评级' + ',' + '今年回报率' + '\n' @@ -119,13 +115,19 @@ def get_fund_list(page_index): # 晨星基金专属晨星码 morning_star_code_list.append(current_morning_code) name_list.append(tds_text[1].find_all('a')[0].string) + # print("name_list", name_list) # 基金分类 fund_cat.append(tds_text[2].string) # 三年评级 - rating = get_star_count(tds_text[3].find_all('img')[0]['src']) + # rating = None + rating_3_img_ele = tds_text[3].find_all('img')[0] + rating_3_src = rating_3_img_ele['src'] + rating = get_star_count(rating_3_src, current_morning_code, rating_3_img_ele) fund_rating_3.append(rating) # 5年评级 - rating = get_star_count(tds_text[4].find_all('img')[0]['src']) + rating_5_img_ele = tds_text[4].find_all('img')[0] + rating_5_src = rating_5_img_ele['src'] + rating = get_star_count(rating_5_src, current_morning_code, rating_5_img_ele) fund_rating_5.append(rating) # 今年以来回报(%) return_value = tds_nume[3].string if tds_nume[3].string != '-' else None diff --git a/src/assets/samples/star_0.png b/src/assets/samples/star_0.png new file mode 100644 index 0000000..5f46e2e Binary files /dev/null and b/src/assets/samples/star_0.png differ diff --git a/src/assets/samples/star_1.png b/src/assets/samples/star_1.png new file mode 100644 index 0000000..b74ca10 Binary files /dev/null and b/src/assets/samples/star_1.png differ diff --git a/src/assets/samples/star_2.png b/src/assets/samples/star_2.png new file mode 100644 index 0000000..437ab30 Binary files /dev/null and b/src/assets/samples/star_2.png differ diff --git a/src/assets/samples/star_3.png b/src/assets/samples/star_3.png new file mode 100644 index 0000000..4767214 Binary files /dev/null and b/src/assets/samples/star_3.png differ diff --git a/src/assets/samples/star_4.png b/src/assets/samples/star_4.png new file mode 100644 index 0000000..7ae9503 Binary files /dev/null and b/src/assets/samples/star_4.png differ diff --git a/src/assets/samples/star_5.png b/src/assets/samples/star_5.png new file mode 100644 index 0000000..3695de0 Binary files /dev/null and b/src/assets/samples/star_5.png differ diff --git a/src/assets/star/tmp.gif b/src/assets/star/tmp.gif index a389fb1..db24e37 100644 Binary files a/src/assets/star/tmp.gif and b/src/assets/star/tmp.gif differ diff --git a/src/crud/insert.py b/src/crud/insert.py index 2e14f1c..bfbf1b2 100644 --- a/src/crud/insert.py +++ b/src/crud/insert.py @@ -14,7 +14,7 @@ sys.path.append('./src') from sqlalchemy.orm import Session from models.manager import Manager, ManagerAssoc from models.quarter import Quarter -from models.var import prefix, ORM_Base, engine +from models.var import engine session = Session(engine) diff --git a/src/db/connect.py b/src/db/connect.py index 6b652de..6e238b7 100644 --- a/src/db/connect.py +++ b/src/db/connect.py @@ -2,7 +2,6 @@ import pymysql from config.env import env_db_host, env_db_name, env_db_user, env_db_password, env_db_stock_name - def connect(): connect = pymysql.connect( host=env_db_host, user=env_db_user, password=env_db_password, db=env_db_name, charset='utf8') diff --git a/src/fund_info/api.py b/src/fund_info/api.py index 6541af5..281dd77 100644 --- a/src/fund_info/api.py +++ b/src/fund_info/api.py @@ -79,11 +79,11 @@ class FundApier: pprint(res_json) print('code:1', self.fund_code) else: - pprint(res.raw) + print('url:', url) print('code:2', self.fund_code) raise('中断') except: - pprint(res.raw) + print('url:', url) print('code:3', self.fund_code) raise('中断') @@ -135,6 +135,7 @@ class FundApier: 'fundcode': self.fund_code, } res = requests.post(url, headers=headers, data=payload) + # print("res", res) res.encoding = "utf-8" time.sleep(1) try: @@ -169,5 +170,5 @@ class FundApier: if __name__ == '__main__': fund_api = FundApier('000421', end_date='2021-05-31',) - # fund_api.get_analyse_info_zh() + fund_api.get_analyse_info_zh() # print("fund_api", fund_api) diff --git a/src/fund_info/crawler.py b/src/fund_info/crawler.py index e83d773..1b40f7b 100644 --- a/src/fund_info/crawler.py +++ b/src/fund_info/crawler.py @@ -9,8 +9,8 @@ Copyright (c) 2020 Camel Lu ''' import re -from time import sleep -from bs4 import BeautifulSoup +from datetime import datetime, timedelta, date +from time import sleep, time from utils.index import get_star_count, get_quarter_index, get_last_quarter_str from selenium.common.exceptions import NoSuchElementException @@ -51,6 +51,8 @@ class FundSpider: # 十大持仓信息 self.ten_top_stock_list = [] # 股票十大持仓股信息 + def set_found_data(self, date): + self.found_date = date # 处理基金详情页跳转 def go_fund_url(self, cookie_str=None): # self.login_morning_star(cookie_str) @@ -156,10 +158,8 @@ class FundSpider: manager['manager_id'] = manager_id manager['manager_start_date'] = manager_ele.find_element_by_xpath( "li[@class='col1']/i").text[0:10] - manager['brife'] = manager_ele.find_element_by_xpath( "li[@class='col2']").text - self.manager_list.append(manager) except NoSuchElementException: @@ -173,18 +173,42 @@ class FundSpider: def get_fund_morning_rating(self): try: qt_el = self._chrome_driver.find_element_by_id('qt_star') - rating_3_src = qt_el.find_element_by_xpath( - "//li[@class='star3']/img").get_attribute('src') - rating_5_src = qt_el.find_element_by_xpath( - "//li[@class='star5']/img").get_attribute('src') - rating_10_src = qt_el.find_element_by_xpath( - "//li[@class='star10']/img").get_attribute('src') - rating_3 = get_star_count(rating_3_src) - rating_5 = get_star_count(rating_5_src) - rating_10 = get_star_count(rating_10_src) - self.morning_star_rating[3] = rating_3 - self.morning_star_rating[5] = rating_5 - self.morning_star_rating[10] = rating_10 + rating_3_img_ele = qt_el.find_element_by_xpath( + "//li[@class='star3']/img") + rating_3_src = rating_3_img_ele.get_attribute('src') + rating_5_img_ele = qt_el.find_element_by_xpath( + "//li[@class='star5']/img") + rating_5_src = rating_5_img_ele.get_attribute('src') + rating_10_img_ele = qt_el.find_element_by_xpath( + "//li[@class='star10']/img") + rating_10_src = rating_10_img_ele.get_attribute('src') + + delta = timedelta(days=3 * 365) + date_now = date.today() + is_more = False + + if date_now - delta > self.found_date: + is_more = True + rating_3 = get_star_count(rating_3_src, self.fund_code, rating_3_img_ele) + self.morning_star_rating[3] = rating_3 + if is_more == False: + return + + delta = timedelta(days=5 * 365) + is_more = False + if date_now - delta > self.found_date: + is_more = True + rating_5 = get_star_count(rating_5_src, self.fund_code, rating_5_img_ele) + self.morning_star_rating[5] = rating_5 + + if is_more == False: + return + delta = timedelta(days=10 * 365) + if date_now - delta > self.found_date: + rating_10 = get_star_count(rating_10_src, self.fund_code, rating_10_img_ele) + self.morning_star_rating[10] = rating_10 + + except NoSuchElementException: self._is_trigger_catch = True print('error_fund_info:', self.fund_code, @@ -225,8 +249,9 @@ class FundSpider: def get_fund_season_info(self): # 总资产 TODO: 增加一个数据更新时间field - self.total_asset = self.get_element_text_by_class_name( + total_asset = self.get_element_text_by_class_name( "asset", 'qt_base') + self.total_asset = float(total_asset) if total_asset else 0 # 投资风格 self.investname_style = self.get_element_text_by_class_name( 'sbdesc', 'qt_base') diff --git a/src/fund_info/supplement.py b/src/fund_info/supplement.py index 4bb1558..14da574 100644 --- a/src/fund_info/supplement.py +++ b/src/fund_info/supplement.py @@ -22,9 +22,12 @@ class FundSupplement: def update_archive_status(self): fund_query = FundQuery() each_fund_update = FundUpdate() + start = 0 funds = fund_query.select_quarter_fund(0, 15000) print("funds's len", len(funds)) - for fund_item in funds: + for index in range(start, len(funds)): + # print("index", index) + fund_item = funds[index] fund_code = fund_item[0] fund_api = FundApier(fund_code, platform='zh_fund') fund_api.get_analyse_info_zh() diff --git a/src/models/__init__.py b/src/models/__init__.py index e17922f..9bb6870 100644 --- a/src/models/__init__.py +++ b/src/models/__init__.py @@ -9,5 +9,6 @@ Copyright (c) 2022 Camel Lu ''' print('--models init--'); - -import fund +import sys +sys.path.append('./src') +import models.fund diff --git a/src/models/var.py b/src/models/var.py index b247208..52e0059 100644 --- a/src/models/var.py +++ b/src/models/var.py @@ -19,7 +19,7 @@ ORM_Base = get_orm_base() prefix = 'fund_morning_' -engine = get_engine(echo=True) +engine = get_engine(echo=False) # class ORM_Base(Base): # def __init__(self, **kwargs) -> None: diff --git a/src/sql_model/fund_query.py b/src/sql_model/fund_query.py index eb793d6..cd22baa 100644 --- a/src/sql_model/fund_query.py +++ b/src/sql_model/fund_query.py @@ -82,10 +82,8 @@ class FundQuery(BaseQuery): '普通债券型', '普通债券型(封闭)', '普通债券', '普通债券(封闭)', '普通债券型基金','普通债券型基金(封闭)', '信用债', '信用债(封闭)','目标日期', '商品 - 贵金属', '商品 - 其它' ) \ AND t.found_date <= %s \ AND t.is_archive = 0 \ - AND t.fund_name NOT LIKE '%%C' \ - AND t.fund_name NOT LIKE '%%B' \ AND t.fund_code NOT IN( SELECT fund_code FROM fund_morning_quarter as b \ - WHERE b.quarter_index = %s AND b.stock_position_total != 0)" + WHERE b.quarter_index = %s)" return condition # 筛选出要更新的基金季度性信息的基金(B,C类基金除外,因为B、C基金大部分信息与A类一致)的总数 @@ -100,8 +98,9 @@ class FundQuery(BaseQuery): @lock_process def select_quarter_fund(self, page_start, page_limit): sql = "SELECT t.fund_code,\ - t.morning_star_code, t.fund_name, t.fund_cat \ + t.morning_star_code, t.fund_name, t.found_date, t.fund_cat \ FROM fund_morning_base as t " + self.get_select_quarter_condition() + " LIMIT %s, %s;" + self.cursor.execute( sql, [self.quarter_date, self.quarter_index, page_start, page_limit]) # 执行sql语句 return self.cursor.fetchall() # 获取查询的所有记录 @@ -225,8 +224,9 @@ class FundQuery(BaseQuery): t.morning_star_code, t.fund_name \ FROM fund_morning_base as t \ LEFT JOIN fund_morning_snapshot as f ON f.fund_code = t.fund_code \ - WHERE t.fund_name LIKE %s \ - AND t.fund_name NOT LIKE '%%A';" + WHERE t.fund_name LIKE %s;" + + # AND t.fund_name NOT LIKE '%%A';" self.cursor.execute(sql_similar, [similar_name + '%']) results = self.cursor.fetchall() # 获取查询的所有记录 return results diff --git a/src/utils/index.py b/src/utils/index.py index b09ad20..4a44c32 100644 --- a/src/utils/index.py +++ b/src/utils/index.py @@ -2,14 +2,41 @@ import time import datetime import os - +import numpy as np +import requests +from PIL import Image +from skimage import io +from sewar.full_ref import uqi, sam import re from threading import Thread, Lock import pandas as pd from openpyxl import load_workbook - +requests.adapters.DEFAULT_RETRIES = 10 # 增加重连次数 +s = requests.session() +s.keep_alive = False # 关闭多余连接 + +dir = os.getcwd() + '/src/' + +img_dir = dir + 'img/' +samples_dir = dir + 'assets/samples/' + +def use_sewar_get_star_level(img_path): + sample_imgs = os.listdir(samples_dir) + img1 = io.imread(fname=img_path) + for filename in sample_imgs: + level = filename[-5:-4] + img_path_2 = samples_dir + filename + img2 = io.imread(fname=img_path_2) + res_uqi = uqi(img1, img2) + res_sam = sam(img1, img2) + + if res_uqi > 0.98 and res_sam < 0.11: + # res_level = level2 + return level + print('img_path', img_path) + raise "img_path 图片比较失败" def lock_process(func): lock = Lock() @@ -28,27 +55,57 @@ def debug(func): return wrapper # 返回包装过函数 -def get_star_count(morning_star_url): - import numpy as np - import requests - from PIL import Image +def get_star_count_with_sewar(fund_code, img_ele): + picture_time = time.strftime( + "%Y-%m-%d-%H_%M_%S", time.localtime(time.time())) + directory_time = time.strftime("%Y-%m-%d", time.localtime(time.time())) + file_dir = os.getcwd() + '/star-record/' + directory_time + '/' + try: + if not os.path.exists(file_dir): + os.makedirs(file_dir) + print("目录新建成功:%s" % file_dir) + except BaseException as msg: + print("新建目录失败:%s" % msg) + + code_path = './star-record/' + directory_time + '/' + picture_time + '_' + fund_code + '_' + '_code.png' + is_success = img_ele.screenshot(code_path) + time.sleep(2) + if is_success: + return use_sewar_get_star_level(code_path) + else: + raise "截图失败" + + +def get_star_count_with_np(morning_star_url): module_path = os.getcwd() + '/src' temp_star_url = module_path + '/assets/star/tmp.gif' - r = requests.get(morning_star_url) + try: + r = requests.get(morning_star_url) + except BaseException: + raise BaseException('请求失败') with open(temp_star_url, "wb") as f: f.write(r.content) f.close() path = module_path + '/assets/star/star' - # path = './assets/star/star' try: for i in range(6): p1 = np.array(Image.open(path + str(i) + '.gif')) p2 = np.array(Image.open(temp_star_url)) if (p1 == p2).all(): return i - except: - print('morning_star_url', morning_star_url) + except BaseException: + raise BaseException('识别失败') + +def get_star_count(morning_star_url, fund_code, img_ele=None): + # path = './assets/star/star' + try: + return get_star_count_with_sewar(fund_code, img_ele) + except BaseException: + print("BaseException", BaseException) + print('图片相似度比较失败') + return get_star_count_with_np(morning_star_url) + def parse_csv(datafile):