diff --git a/src/acquire_fund_quarter.py b/src/acquire_fund_quarter.py
index 553a9b4..d223681 100644
--- a/src/acquire_fund_quarter.py
+++ b/src/acquire_fund_quarter.py
@@ -22,6 +22,7 @@ from sql_model.fund_insert import FundInsert
 from sql_model.fund_query import FundQuery
 from utils.driver import create_chrome_driver
 from utils.index import bootstrap_thread
+from utils.file_op import read_error_code_from_json, write_fund_json_data
 from utils.login import login_morning_star
 
 # Fetch the total assets of same-category funds via the api
@@ -44,13 +45,18 @@ def get_total_asset(fund_code, platform):
 
 def acquire_fund_quarter():
     lock = Lock()
     each_fund_query = FundQuery()
     idWorker = IdWorker()
-    result_dir = './output/'
-    fund_csv = FundCSV(result_dir)
-    fund_csv.write_season_catch_fund(True)
-    fund_csv.write_abnormal_url_fund(True)
-
+    # result_dir = './output/'
+    # fund_csv = FundCSV(result_dir)
+    # fund_csv.write_season_catch_fund(True)
+    # fund_csv.write_abnormal_url_fund(True)
+    err_info = read_error_code_from_json()
+    error_funds_with_page = err_info.get('error_funds_with_page')
+    error_funds_with_found_date = err_info.get('error_funds_with_found_date')
+    error_funds_with_unmatch = err_info.get('error_funds_with_unmatch')
+    filename = err_info.get('filename')
+    file_dir = err_info.get('file_dir')
     def crawlData(start, end):
         login_url = 'https://www.morningstar.cn/membership/signin.aspx'
         chrome_driver = create_chrome_driver()
@@ -62,29 +68,30 @@ def acquire_fund_quarter():
             results = each_fund_query.select_quarter_fund(
                 page_start, page_limit)
             for record in results:
-                sleep(1)
-                # 0P000179WG
-                # 001811 中欧明睿新常态混合A
-                each_fund = FundSpider(
-                    record[0], record[1], record[2], chrome_driver)
-
+                fund_code = record[0]
+                if fund_code in error_funds_with_page or fund_code in error_funds_with_found_date or fund_code in error_funds_with_unmatch:
+                    print('error fund: ', fund_code)
+                    continue
+                each_fund = FundSpider(fund_code, record[1], record[2], chrome_driver)
                 each_fund.set_found_data(record[3])
                 is_error_page = each_fund.go_fund_url()
                 # Whether the fund detail page opens normally; if not, log it and skip this iteration
                 if is_error_page == True:
                     # error_funds.append(each_fund.fund_code)
-                    fund_infos = [each_fund.fund_code, each_fund.morning_star_code,
-                                  each_fund.fund_name, record[3], page_start, '页面跳转有问题']
-                    output_line = ', '.join(str(x)
-                                            for x in fund_infos) + '\n'
-                    fund_csv.write_abnormal_url_fund(False, output_line)
+                    # fund_infos = [each_fund.fund_code, each_fund.morning_star_code,
+                    #               each_fund.fund_name, record[3], page_start, '页面跳转有问题']
+                    # output_line = ', '.join(str(x)
+                    #                         for x in fund_infos) + '\n'
+                    # fund_csv.write_abnormal_url_fund(False, output_line)
+                    error_funds_with_page.append(each_fund.fund_code)
                     continue
                 # Start crawling the data
                 quarter_index = each_fund.get_quarter_index()
                 # Data update time; if it does not match, skip crawling the data below
                 if quarter_index != each_fund.quarter_index:
-                    print('quarter_index', quarter_index, each_fund.update_date,
-                          each_fund.fund_code, each_fund.fund_name)
+                    # print('quarter_index', quarter_index, each_fund.update_date,
+                    #       each_fund.fund_code, each_fund.fund_name)
+                    error_funds_with_unmatch.append(each_fund.fund_code)
                     continue
 
                 each_fund.get_fund_season_info()  # basic quarterly data
@@ -95,14 +102,14 @@ def acquire_fund_quarter():
                 if each_fund.stock_position['total'] != '0.00' and each_fund.total_asset != None:
                     each_fund.get_asset_composition_info()
                 # Whether any exception occurred while crawling; if so, store it in the csv
-                if each_fund._is_trigger_catch == True:
-                    fund_infos = [each_fund.fund_code, each_fund.morning_star_code,
-                                  each_fund.fund_name, record[3],
-                                  each_fund.stock_position['total'],
-                                  page_start, each_fund._catch_detail]
-                    output_line = ', '.join(str(x)
-                                            for x in fund_infos) + '\n'
-                    fund_csv.write_season_catch_fund(False, output_line)
+                # if each_fund._is_trigger_catch == True:
+                #     fund_infos = [each_fund.fund_code, each_fund.morning_star_code,
+                #                   each_fund.fund_name, record[3],
+                #                   each_fund.stock_position['total'],
+                #                   page_start, each_fund._catch_detail]
+                #     output_line = ', '.join(str(x)
+                #                             for x in fund_infos) + '\n'
+                #     fund_csv.write_season_catch_fund(False, output_line)
                 # Write to the database
                 lock.acquire()
                 snow_flake_id = idWorker.get_id()
@@ -157,7 +164,6 @@
                     'morning_star_rating_5': each_fund.morning_star_rating.get(5),
                     'morning_star_rating_10': each_fund.morning_star_rating.get(10),
                 }
-
                 # Write the top-10 stock holdings to the database
                 stock_position_total = each_fund.stock_position.get(
                     'total', '0.00')
@@ -192,7 +198,6 @@
                     item_code = item[0]
                     if item_code == each_fund.fund_code:
                         continue
-                    print("item_code", item_code, platform)
                     total_asset = get_total_asset(item_code, platform)
                     if total_asset != None:
                         init_total_asset = init_total_asset - total_asset
@@ -233,25 +238,22 @@
                 raise BaseException
         chrome_driver.close()
     thread_count = 6
-
-    # for count in range(6):
    total_start_time = time()
     # record_total = each_fund_query.select_quarter_fund_total()  # get the record count
-    # print("record_total", record_total)
     # bootstrap_thread(crawlData, record_total, thread_count)
-
-    for i in range(3):
-        print("i", i)
+    record_total = each_fund_query.select_quarter_fund_total()  # get the record count
+    for i in range(2):
         start_time = time()
-        record_total = each_fund_query.select_quarter_fund_total()  # get the record count
         print('record_total', record_total)
         try:
             bootstrap_thread(crawlData, record_total, thread_count)
         except:
-            end_time = time()
-            print("耗时: {:.2f}秒".format(end_time - start_time))
+            cur_total = each_fund_query.select_quarter_fund_total()  # get the record count
+            print('crawler item count:', record_total - cur_total)
+            record_total = cur_total
         end_time = time()
         print("耗时: {:.2f}秒".format(end_time - start_time))
+    write_fund_json_data({'error_funds_with_page': error_funds_with_page, 'error_funds_with_found_date': error_funds_with_found_date, 'error_funds_with_unmatch': error_funds_with_unmatch}, filename=filename, file_dir=file_dir)
     total_end_time = time()
     print("total耗时: {:.2f}秒".format(total_end_time - total_start_time))
     exit()
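Note: the change above swaps the per-run CSV logging for a persistent JSON skip list. Fund codes that already failed (broken detail page, missing found date, or a stale quarter index) are loaded once at startup, checked before each crawl, and flushed back to disk after every pass, so repeated runs converge on the funds that still need work. A minimal sketch of the skip pattern in crawlData, with made-up fund codes and records rather than real data; since "in" on a list is O(n), folding the three lists into one set per pass keeps the per-fund test O(1) as the lists grow:

    # Skip-list pattern from crawlData above; codes and records are illustrative.
    error_funds_with_page = ['000001']
    error_funds_with_found_date = []
    error_funds_with_unmatch = ['000002']

    # One set union per pass, then O(1) membership tests in the hot loop.
    known_bad = set(error_funds_with_page) | set(error_funds_with_found_date) \
        | set(error_funds_with_unmatch)

    for record in [('000001', 'x'), ('000003', 'y')]:
        fund_code = record[0]
        if fund_code in known_bad:
            print('error fund: ', fund_code)  # failed in an earlier run, skip
            continue
        print('crawling', fund_code)  # ... crawl this fund here ...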
diff --git a/src/crud/query.py b/src/crud/query.py
index b6f68a0..25a8a36 100644
--- a/src/crud/query.py
+++ b/src/crud/query.py
@@ -11,7 +11,7 @@ Copyright (c) 2022 Camel Lu
 import sys
 sys.path.append('./src')
 
-from sqlalchemy import and_
+from sqlalchemy import and_, or_
 from sqlalchemy.orm import Session
 
 from models.fund import FundBase, FundQuarter
@@ -53,9 +53,12 @@ def query_empty_company_and_found_date_fund(start, size):
     all_funds = session.query(FundBase).where(FundBase.company == None, FundBase.found_date == None, FundBase.is_archive==0).offset(start).limit(size).all()
     return all_funds
 
+def query_empty_company_or_found_date_fund(start, size):
+    all_funds = session.query(FundBase).where(FundBase.is_archive==0).filter(or_(FundBase.company == None, FundBase.found_date == None)).offset(start).limit(size).all()
+    return all_funds
 
 if __name__ == '__main__':
     quarter_index = '2022-Q2'
     # fund_list = query_high_score_funds(quarter_index)
-    query_empty_company_and_found_date_fund(2, 10)
+    query_empty_company_or_found_date_fund(0, 5000)
     # print("fund_list",fund_list)
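Note: SQLAlchemy combines chained where()/filter() clauses with AND, so the new query_empty_company_or_found_date_fund reads as "is_archive = 0 AND (company IS NULL OR found_date IS NULL)", a superset of the old all-NULL query. A self-contained check of those semantics against a toy in-memory SQLite schema (not the project's real models):

    from sqlalchemy import Column, Integer, String, create_engine, or_
    from sqlalchemy.orm import Session, declarative_base

    Base = declarative_base()

    class Fund(Base):
        __tablename__ = 'fund'
        id = Column(Integer, primary_key=True)
        company = Column(String)
        found_date = Column(String)
        is_archive = Column(Integer, default=0)

    engine = create_engine('sqlite://')  # in-memory database
    Base.metadata.create_all(engine)

    with Session(engine) as session:
        session.add_all([
            Fund(company=None, found_date='2020-01-01'),        # matches: company missing
            Fund(company='X', found_date=None),                 # matches: found_date missing
            Fund(company='X', found_date='2020-01-01'),         # excluded: both present
            Fund(company=None, found_date=None, is_archive=1),  # excluded: archived
        ])
        session.commit()
        # Chained clauses are ANDed: is_archive = 0 AND (company IS NULL OR found_date IS NULL)
        matches = session.query(Fund).where(Fund.is_archive == 0).filter(
            or_(Fund.company == None, Fund.found_date == None)).all()
        print(len(matches))  # -> 2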
diff --git a/src/fund_info/crawler.py b/src/fund_info/crawler.py
index b89af58..2c07a45 100644
--- a/src/fund_info/crawler.py
+++ b/src/fund_info/crawler.py
@@ -61,14 +61,20 @@ class FundSpider:
             self.morning_star_code
         self._chrome_driver.get(morning_fund_selector_url)  # open the page to crawl
-        sleep(6)
+        sleep(5)
         # Check whether the page errored or redirected; skip it if so
         if self._chrome_driver.current_url == 'https://www.morningstar.cn/errors/defaulterror.html':
             return True
-        while self._chrome_driver.page_source == None:
+        if 'Value cannot be null' in self._chrome_driver.title:
+            return True
+        try_count = 5
+        while self._chrome_driver.page_source == None and try_count > 0:
             self._chrome_driver.refresh()
             print('wait:fund_code', self.fund_code)
             sleep(9)
+            try_count -= 1
+        if self._chrome_driver.page_source == None:
+            return True
         return False
         # self._chrome_driver.execute_script('location.reload()')
@@ -140,7 +146,7 @@
     # Get fund manager info (only the first of several active managers is needed)
     def get_fund_manager_info(self):
         manager_ele_list = self._chrome_driver.find_element(By.ID,
-                                                            'qt_manager').find_elements_by_xpath("ul")
+                                                            'qt_manager').find_elements(By.XPATH, "ul")
         for manager_ele in manager_ele_list:
             try:
                 # fund manager
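Note: the old "while page_source == None" loop could hang a crawler thread forever on a dead page; it is now capped at five refreshes, after which go_fund_url() reports the page as broken by returning True, consistent with the error-URL and "Value cannot be null" title checks. One caveat: Selenium's page_source is normally a string even for a blank document, so a stricter emptiness check may be worth considering. The general shape of the bounded retry, with stand-in callables for the driver calls:

    from time import sleep

    # Bounded retry: poll a readiness check a fixed number of times instead of
    # looping forever. load_page/page_ready stand in for driver.refresh() and
    # the page_source test; returning True mirrors go_fund_url()'s error result.
    def fetch_with_retry(load_page, page_ready, tries=5, delay=9):
        load_page()
        while not page_ready() and tries > 0:
            load_page()  # refresh and re-check
            sleep(delay)
            tries -= 1
        return not page_ready()

    print(fetch_with_retry(lambda: None, lambda: True))  # -> False (page usable)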
diff --git a/src/sync_fund_base.py b/src/sync_fund_base.py
index 3a300c6..c3ae60b 100644
--- a/src/sync_fund_base.py
+++ b/src/sync_fund_base.py
@@ -14,11 +14,14 @@ from bs4 import BeautifulSoup
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 
-from crud.query import query_all_fund, query_empty_company_and_found_date_fund
-from models.fund import FundBase
+from crud.query import (query_all_fund,
+                        query_empty_company_and_found_date_fund,
+                        query_empty_company_or_found_date_fund)
 from fund_info.crawler import FundSpider
-from utils.index import bootstrap_thread
+from models.fund import FundBase
 from utils.driver import create_chrome_driver, text_to_be_present_in_element
+from utils.file_op import read_error_code_from_json, write_fund_json_data
+from utils.index import bootstrap_thread
 from utils.login import login_morning_star
 
@@ -94,8 +97,13 @@ def sync_fund_base(page_index):
     print('end')
 
 def further_complete_base_info():
-    all_funds = query_empty_company_and_found_date_fund(0, 10000)
-    error_funds = []
+    all_funds = query_empty_company_or_found_date_fund(0, 10000)
+    err_info = read_error_code_from_json()
+    error_funds_with_page = err_info.get('error_funds_with_page')
+    error_funds_with_found_date = err_info.get('error_funds_with_found_date')
+    error_funds_with_unmatch = err_info.get('error_funds_with_unmatch')
+    filename = err_info.get('filename')
+    file_dir = err_info.get('file_dir')
     def crawlData(start, end):
         login_url = 'https://www.morningstar.cn/membership/signin.aspx'
         chrome_driver = create_chrome_driver()
@@ -109,19 +117,21 @@ def further_complete_base_info():
             # results = query_empty_company_and_found_date_fund(page_start, page_limit)
             for record in results:
                 fund_code = record.fund_code
+                if fund_code in error_funds_with_page or fund_code in error_funds_with_found_date:
+                    continue
                 morning_star_code = record.morning_star_code
                 fund_name = record.fund_name
                 each_fund = FundSpider(fund_code, morning_star_code, fund_name, chrome_driver)
                 # Whether the fund detail page opens normally
                 is_error_page = each_fund.go_fund_url()
                 if is_error_page == True:
-                    error_funds.append(each_fund.fund_code)
+                    error_funds_with_page.append(each_fund.fund_code)
                     continue
                 each_fund.get_fund_base_info()
                 # Drop funds without a found date
                 if each_fund.found_date == '-' or each_fund.found_date == None:
                     # lock.acquire()
-                    error_funds.append(each_fund.fund_code)
+                    error_funds_with_found_date.append(each_fund.fund_code)
                     # lock.release()
                     continue
                 # Assemble the data needed for the sql
@@ -138,7 +148,11 @@
             print('page_start', page_start)
             page_start = page_start + page_limit
         chrome_driver.close()
-    bootstrap_thread(crawlData, len(all_funds), 3)
+    try:
+        bootstrap_thread(crawlData, len(all_funds), 6)
+    finally:
+        write_fund_json_data({'error_funds_with_page': error_funds_with_page, 'error_funds_with_found_date': error_funds_with_found_date, 'error_funds_with_unmatch': error_funds_with_unmatch}, filename=filename, file_dir=file_dir)
+
 if __name__ == '__main__':
     #127, 300, 600-
     page_index = 1
diff --git a/src/utils/file_op.py b/src/utils/file_op.py
index a90650e..2ccb254 100644
--- a/src/utils/file_op.py
+++ b/src/utils/file_op.py
@@ -7,12 +7,15 @@ Author: luxuemin2108@gmail.com
 -----
 Copyright (c) 2021 Camel Lu
 '''
+import json
 import os
 import time
 
 import pandas as pd
 from openpyxl import load_workbook
 
+from .index import get_last_quarter_str
+
 # Write a json file
 def write_fund_json_data(data, filename, file_dir=None):
@@ -30,8 +33,6 @@ def write_fund_json_data(data, filename, file_dir=None):
 def read_dir_all_file(path):
     return os.listdir(path)
 
-
-
 def update_xlsx_file(path, df_data, sheet_name):
     try:
         if os.path.exists(path):
@@ -110,3 +111,23 @@ def update_xlsx_file_with_insert(path, df_data, sheet_name, index = 0):
     except BaseException:
         print("path", path)
         raise BaseException('更新excel失败')
+
+def read_error_code_from_json():
+    quarter_index = get_last_quarter_str()
+    filename = 'error_funds_' + quarter_index + '.json'
+    file_dir = './output/json/'
+    error_funds_with_page = []
+    error_funds_with_unmatch = []
+    error_funds_with_found_date = []
+    if os.path.exists(file_dir + filename):
+        with open(file_dir + filename) as json_file:
+            my_data = json.load(json_file)
+            error_funds_with_page = my_data.get('error_funds_with_page') or []
+            error_funds_with_found_date = my_data.get('error_funds_with_found_date') or []
+    return {
+        'file_dir': file_dir,
+        'filename': filename,
+        'error_funds_with_unmatch': error_funds_with_unmatch,
+        'error_funds_with_page': error_funds_with_page,
+        'error_funds_with_found_date': error_funds_with_found_date
+    }
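Note: read_error_code_from_json restores only the page and found-date lists; error_funds_with_unmatch is written out but never read back, so it starts empty on every run. That looks deliberate: a quarter-index mismatch should be retried once Morningstar publishes the new quarter's data. Expected round trip of the two helpers, assuming get_last_quarter_str() returns a tag such as '2022-Q3' (illustrative value):

    from utils.file_op import read_error_code_from_json, write_fund_json_data

    info = read_error_code_from_json()
    info['error_funds_with_page'].append('000001')  # record a newly failed fund
    write_fund_json_data(
        {
            'error_funds_with_page': info['error_funds_with_page'],
            'error_funds_with_found_date': info['error_funds_with_found_date'],
            'error_funds_with_unmatch': info['error_funds_with_unmatch'],
        },
        filename=info['filename'],  # e.g. 'error_funds_2022-Q3.json'
        file_dir=info['file_dir'],  # './output/json/'
    )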