From 31bafab045ea5e65503eb4ad95e40886036aeedc Mon Sep 17 00:00:00 2001
From: jackluson <18219112108@163.com>
Date: Sat, 5 Nov 2022 18:03:31 +0800
Subject: [PATCH] feat: add sync fund base

---
Note: revised during review. The blob ids on the "index" lines below still
refer to the contents as originally submitted.

 requirements.txt            |   2 +-
 src/acquire_fund_base.py    |  18 ++--
 src/crud/query.py           |  28 +++++-
 src/fund_info/supplement.py |   5 +-
 src/sync_fund_base.py       | 180 ++++++++++++++++++++++++++++++++++++
 5 files changed, 219 insertions(+), 14 deletions(-)
 create mode 100644 src/sync_fund_base.py

diff --git a/requirements.txt b/requirements.txt
index 9eecd6a..71958c6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-selenium==3.11.0
+selenium==4.5.0
 PyMySQL==1.0.2
 pandas==1.1.5
 requests==2.18.4
diff --git a/src/acquire_fund_base.py b/src/acquire_fund_base.py
index e27a928..00fe8b1 100644
--- a/src/acquire_fund_base.py
+++ b/src/acquire_fund_base.py
@@ -8,15 +8,13 @@ Author: luxuemin2108@gmail.com
 Copyright (c) 2020 Camel Lu
 '''
 from threading import Lock
-
+from utils.login import login_morning_star
+from utils.driver import create_chrome_driver
+from utils.index import bootstrap_thread
 from fund_info.crawler import FundSpider
 from lib.mysnowflake import IdWorker
-from sql_model.fund_insert import FundInsert
 from sql_model.fund_query import FundQuery
-from utils.driver import create_chrome_driver
-from utils.index import bootstrap_thread
-from utils.login import login_morning_star
-
+from sql_model.fund_insert import FundInsert
 
 def acquire_fund_base():
     lock = Lock()
@@ -45,16 +43,16 @@ def acquire_fund_base():
             for record in results:
                 each_fund = FundSpider(
                     record[0], record[1], record[2], chrome_driver)
-                # 从晨星网上更新信息
-                is_normal = each_fund.go_fund_url()
-                if is_normal == False:
+                # Check whether the fund detail page could be opened
+                is_error_page = each_fund.go_fund_url()
+                if is_error_page == True:
                     lock.acquire()
                     error_funds.append(each_fund.fund_code)
                     lock.release()
                     continue
                 each_fund.get_fund_base_info()
                 # 去掉没有成立时间的
-                if each_fund.found_date == '-':
+                if each_fund.found_date == '-' or each_fund.found_date is None:
                     lock.acquire()
                     error_funds.append(each_fund.fund_code)
                     lock.release()
diff --git a/src/crud/query.py b/src/crud/query.py
index 806c98a..b6f68a0 100644
--- a/src/crud/query.py
+++ b/src/crud/query.py
@@ -49,8 +49,34 @@ def query_all_fund():
         }
     return all_fund_dict
 
+
+def query_empty_company_and_found_date_fund(start, size):
+    """Return un-archived funds that have neither company nor found date.
+
+    Args:
+        start: Number of matching rows to skip.
+        size: Maximum number of rows to return.
+
+    Returns:
+        A list of FundBase rows ordered by fund code.
+    """
+    return (
+        session.query(FundBase)
+        .where(
+            FundBase.company.is_(None),
+            FundBase.found_date.is_(None),
+            FundBase.is_archive == 0,
+        )
+        # OFFSET/LIMIT only pages reliably over a stable ordering.
+        .order_by(FundBase.fund_code)
+        .offset(start)
+        .limit(size)
+        .all()
+    )
+
 
 if __name__ == '__main__':
     quarter_index = '2022-Q2'
-    fund_list = query_high_score_funds(quarter_index)
+    # fund_list = query_high_score_funds(quarter_index)
+    query_empty_company_and_found_date_fund(2, 10)
     # print("fund_list",fund_list)
diff --git a/src/fund_info/supplement.py b/src/fund_info/supplement.py
index 14da574..b03c4ce 100644
--- a/src/fund_info/supplement.py
+++ b/src/fund_info/supplement.py
@@ -8,17 +8,18 @@ Author: luxuemin2108@gmail.com
 Copyright (c) 2021 Camel Lu
 '''
 
-from utils.index import get_last_quarter_str
+from fund_info.api import FundApier
 from sql_model.fund_query import FundQuery
 from sql_model.fund_update import FundUpdate
-from fund_info.api import FundApier
+from utils.index import get_last_quarter_str
+
 
 class FundSupplement:
     def __init__(self, code=None):
         self.fund_code = code
         # 动态计算季度信息
         self.quarter_index = get_last_quarter_str()
 
     def update_archive_status(self):
         fund_query = FundQuery()
         each_fund_update = FundUpdate()
diff --git a/src/sync_fund_base.py b/src/sync_fund_base.py
new file mode 100644
index 0000000..022e072
--- /dev/null
+++ b/src/sync_fund_base.py
@@ -0,0 +1,180 @@
+'''
+Desc: Sync fund base information from the Morningstar fund selector.
+File: /sync_fund_base.py
+File Created: Sunday, 30th October 2022 2:53:56 pm
+Author: luxuemin2108@gmail.com
+-----
+Copyright (c) 2022 Camel Lu
+'''
+import math
+import re
+from time import sleep
+
+from bs4 import BeautifulSoup
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+
+from crud.query import query_all_fund, query_empty_company_and_found_date_fund
+from models.fund import FundBase
+from fund_info.crawler import FundSpider
+from utils.index import bootstrap_thread
+from utils.driver import create_chrome_driver, text_to_be_present_in_element
+from utils.login import login_morning_star
+
+MORNING_FUND_SELECTOR_URL = "https://www.morningstar.cn/fundselect/default.aspx"
+MORNING_LOGIN_URL = 'https://www.morningstar.cn/membership/signin.aspx'
+# Morningstar's selector uses a fixed page size.
+PAGE_SIZE = 25
+# Pager group width; presumably the number of page links shown at once.
+PAGER_GROUP_SIZE = 10
+PAGER_XPATH = '/html/body/form/div[8]/div/div[4]/div[3]'
+FUND_TOTAL_XPATH = PAGER_XPATH + '/div[2]/span'
+NEXT_PAGE_XPATH = PAGER_XPATH + '/div[3]/div[1]/a[%s]'
+CURRENT_PAGE_XPATH = (
+    PAGER_XPATH + "/div[3]/div[1]"
+    "/span[@style='margin-right:5px;font-weight:Bold;color:red;']")
+MORNING_STAR_CODE_RE = re.compile(r'(?<=/quicktake/)(\w+)$')
+
+
+def _sync_fund_row(tr, all_fund_dict):
+    """Insert or update the fund described by one selector table row.
+
+    Args:
+        tr: BeautifulSoup ``tr`` element of a fund row.
+        all_fund_dict: Known funds keyed by fund code, as returned by
+            query_all_fund().
+    """
+    tds_text = tr.find_all('td', {'class': "msDataText"})
+    code_a_element = tds_text[0].find_all('a')[0]
+    cur_fund_code = code_a_element.string
+    matched = MORNING_STAR_CODE_RE.search(code_a_element.get('href') or '')
+    if not cur_fund_code or matched is None:
+        # A row without both codes cannot be keyed; skip it rather than
+        # abort the whole sync.
+        print('skip row, fund_code:', cur_fund_code)
+        return
+    latest = {
+        'morning_star_code': matched.group(1),
+        'fund_name': tds_text[1].find_all('a')[0].string,
+        'fund_cat': tds_text[2].string,
+    }
+    if cur_fund_code in all_fund_dict:
+        exist_fund = all_fund_dict[cur_fund_code]
+        # Only touch the database when one of the synced fields changed.
+        if any(exist_fund[key] != value for key, value in latest.items()):
+            FundBase(**{**exist_fund, **latest}).upsert()
+        return
+    fund_base = FundBase(fund_code=cur_fund_code, **latest)
+    print('fund_name:', latest['fund_name'], 'fund_code:', cur_fund_code,
+          'morning_star_code:', latest['morning_star_code'])
+    fund_base.upsert()
+
+
+def sync_fund_base(page_index):
+    """Walk the fund selector pages and upsert every listed fund.
+
+    Funds not stored yet are inserted; stored funds are updated only when
+    their Morningstar code, name or category differs from the page.
+
+    Args:
+        page_index: Page number to start from.
+    """
+    chrome_driver = create_chrome_driver()
+    try:
+        login_morning_star(chrome_driver, MORNING_FUND_SELECTOR_URL)
+        fund_total = int(
+            chrome_driver.find_element(By.XPATH, FUND_TOTAL_XPATH).text)
+        page_total = math.ceil(fund_total / PAGE_SIZE)
+        all_fund_dict = query_all_fund()
+        # Pages in the last, partial pager group (0 when it is full).
+        remainder = page_total % PAGER_GROUP_SIZE
+        while page_index <= page_total:
+            # The next-page anchor index is the group's page count plus 2,
+            # so it shrinks in the last, partial group.
+            in_last_group = page_index > (page_total - remainder)
+            num = (remainder if in_last_group else PAGER_GROUP_SIZE) + 2
+            xpath_str = NEXT_PAGE_XPATH % num
+            print('page_index', page_index)
+            # Wait until the highlighted pager entry shows page_index.
+            WebDriverWait(chrome_driver, timeout=600).until(
+                text_to_be_present_in_element(
+                    CURRENT_PAGE_XPATH, str(page_index), xpath_str))
+            sleep(1)
+            bs = BeautifulSoup(chrome_driver.page_source, 'lxml')
+            # Fund rows carry one of these two CSS classes.
+            for row_class in ('gridItem', 'gridAlternateItem'):
+                for tr in bs.find_all('tr', {'class': row_class}):
+                    _sync_fund_row(tr, all_fund_dict)
+            # There is nothing to click after the last page.
+            if page_index < page_total:
+                chrome_driver.find_element(By.XPATH, xpath_str).click()
+                sleep(3)
+            page_index += 1
+    finally:
+        # quit() also ends the driver process; it must run on errors too.
+        chrome_driver.quit()
+    print('end')
+
+
+def _complete_fund(record, chrome_driver, error_funds):
+    """Crawl one fund's detail page and store its company and found date.
+
+    Args:
+        record: FundBase row to complete.
+        chrome_driver: Logged-in browser used by FundSpider.
+        error_funds: List collecting the codes of funds that were skipped.
+    """
+    each_fund = FundSpider(
+        record.fund_code, record.morning_star_code, record.fund_name,
+        chrome_driver)
+    # go_fund_url() reports whether the detail page failed to open.
+    is_error_page = each_fund.go_fund_url()
+    if is_error_page == True:
+        error_funds.append(each_fund.fund_code)
+        return
+    each_fund.get_fund_base_info()
+    # Funds without a found date are not stored.
+    if each_fund.found_date in ('-', None):
+        error_funds.append(each_fund.fund_code)
+        return
+    FundBase(
+        fund_code=record.fund_code,
+        morning_star_code=record.morning_star_code,
+        fund_name=each_fund.fund_name,
+        fund_cat=each_fund.fund_cat,
+        company=each_fund.company,
+        found_date=each_fund.found_date,
+    ).upsert()
+
+
+def further_complete_base_info():
+    """Fill in company and found date for funds that still miss both.
+
+    Up to 10000 matching funds are loaded once; bootstrap_thread is then
+    expected to call crawlData(start, end) with index ranges into that
+    list (presumably one range per worker thread - TODO confirm).
+    """
+    all_funds = query_empty_company_and_found_date_fund(0, 10000)
+    # NOTE(review): filled by the workers but never reported - confirm intent.
+    error_funds = []
+    page_limit = 10
+
+    def crawlData(start, end):
+        chrome_driver = create_chrome_driver()
+        try:
+            login_morning_star(chrome_driver, MORNING_LOGIN_URL)
+            for page_start in range(start, end, page_limit):
+                # Clamp to end so a worker never crawls past its own range.
+                page_end = min(page_start + page_limit, end)
+                for record in all_funds[page_start:page_end]:
+                    _complete_fund(record, chrome_driver, error_funds)
+                print('page_start', page_end)
+        finally:
+            chrome_driver.quit()
+
+    bootstrap_thread(crawlData, len(all_funds), 3)
+
+
+if __name__ == '__main__':
+    # Run sync_fund_base(1) first when the fund list itself needs a refresh.
+    further_complete_base_info()