Merge pull request #6 from jackluson/feature/2021-q3

Feature/2021 q3
Branch: main
Committed by camel-lu via GitHub, 3 years ago
Commit 84d9bba046

main.py (new file)
@@ -0,0 +1,45 @@
'''
Desc:
File: /main.py
Project: fund-morning-star-crawler
File Created: Thursday, 28th October 2021 10:51:07 pm
Author: luxuemin2108@gmail.com
-----
Copyright (c) 2021 Camel Lu
'''
import logging
import sys
sys.path.append('./src')
from src.acquire_fund_snapshot import get_fund_list
from src.acquire_fund_base import acquire_fund_base
from src.fund_info.supplement import FundSupplement
from src.acquire_fund_quarter import acquire_fund_quarter

def main():
    # Menu (prompt shown in Chinese): 1. snapshot, 2. add new funds to the DB,
    # 3. quarterly info, 4. archive fund status
    input_value = input("请输入下列序号执行操作:\n \
        1.快照 \n \
        2.新基入库\n \
        3.季度信息\n \
        4.基金状态归档\n \
        输入")
    if input_value == '1' or input_value == '快照':
        page_index = 1
        get_fund_list(page_index)  # crawl the fund list snapshot into the database
    elif input_value == '2' or input_value == '新基入库':
        acquire_fund_base()  # add newly listed funds to the database
    elif input_value == '3' or input_value == "季度信息":
        acquire_fund_quarter()  # crawl quarterly fund details
    elif input_value == '4' or input_value == "基金状态归档":
        fund_supplement = FundSupplement()
        # supplement the fund's liquidation/archive status info
        fund_supplement.update_archive_status()

if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s',
                        filename='log/crawler.log', filemode='a', level=logging.INFO)
    main()
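Note: logging.basicConfig is pointed at log/crawler.log, and Python's FileHandler raises FileNotFoundError when the log/ directory does not exist. A minimal guard, assuming main.py runs from the repo root (the makedirs call is a suggestion, not part of this commit):

    import os

    # Hypothetical guard: make sure log/ exists before logging is configured.
    os.makedirs('log', exist_ok=True)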

File diff suppressed because it is too large

src/acquire_fund_base.py
@@ -7,7 +7,6 @@ Author: luxuemin2108@gmail.com
 -----
 Copyright (c) 2020 Camel Lu
 '''
-from time import sleep
 from threading import Lock
 from utils.login import login_morning_star
 from utils.index import bootstrap_thread
@@ -16,21 +15,20 @@ from lib.mysnowflake import IdWorker
 from sql_model.fund_query import FundQuery
 from sql_model.fund_insert import FundInsert

-if __name__ == '__main__':
+def acquire_fund_base():
     lock = Lock()
     each_fund_query = FundQuery()
     each_fund_insert = FundInsert()
     record_total = each_fund_query.get_fund_count_from_snapshot_no_exist()  # get the record count
-    IdWorker = IdWorker()
+    idWorker = IdWorker()
     print('record_total', record_total)
     error_funds = []  # record the fund code when a detail page turns out to be abnormal
     def crawlData(start, end):
         login_url = 'https://www.morningstar.cn/membership/signin.aspx'
-        chrome_driver = login_morning_star(login_url, True)
+        chrome_driver = login_morning_star(login_url, False)
         page_start = start
         page_limit = 10
         # iterate over each fund from the fund list
@@ -56,7 +54,7 @@ if __name__ == '__main__':
                 continue
             # assemble the data the SQL insert needs
             lock.acquire()
-            snow_flake_id = IdWorker.get_id()
+            snow_flake_id = idWorker.get_id()
             lock.release()
             base_dict = {
                 'id': snow_flake_id,
@@ -71,5 +69,9 @@ if __name__ == '__main__':
             page_start = page_start + page_limit
             print('page_start', page_start)
         chrome_driver.close()
-    bootstrap_thread(crawlData, record_total, 2)
+    bootstrap_thread(crawlData, record_total, 4)
     print('error_funds', error_funds)
+
+
+if __name__ == '__main__':
+    acquire_fund_base()
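Note: the crawlers serialize idWorker.get_id() behind a shared Lock because the snowflake generator keeps mutable timestamp/sequence state across calls. A minimal sketch of the pattern, with IdWorker's interface assumed from its use above:

    from threading import Lock

    from lib.mysnowflake import IdWorker

    lock = Lock()
    id_worker = IdWorker()

    def next_snowflake_id():
        # Serialize access so two crawler threads can never mint the same id.
        with lock:
            return id_worker.get_id()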

src/acquire_fund_quarter.py
@@ -9,9 +9,8 @@ Author: luxuemin2108@gmail.com
 Copyright (c) 2020 Camel Lu
 '''
-import math
-from threading import Thread, Lock, current_thread
-from time import sleep, time
+from threading import Lock, current_thread
+from time import sleep
 from pprint import pprint
 from fund_info.crawler import FundSpider
 from fund_info.api import FundApier
@@ -36,13 +35,12 @@ def get_total_asset(fund_code, platform):
     total_asset = each_fund.get_total_asset()
     return total_asset

-if __name__ == '__main__':
+def acquire_fund_quarter():
     lock = Lock()
     each_fund_query = FundQuery()
     record_total = each_fund_query.get_crawler_quarter_fund_total()  # get the record count
     print('record_total', record_total)
-    IdWorker = IdWorker()
+    idWorker = IdWorker()
     result_dir = './output/'
     fund_csv = FundCSV(result_dir)
     fund_csv.write_season_catch_fund(True)
@@ -50,7 +48,7 @@ if __name__ == '__main__':
     def crawlData(start, end):
         login_url = 'https://www.morningstar.cn/membership/signin.aspx'
-        chrome_driver = login_morning_star(login_url, True)
+        chrome_driver = login_morning_star(login_url, False)
         page_start = start
         page_limit = 10
         while(page_start < end):
@@ -96,7 +94,7 @@ if __name__ == '__main__':
             fund_csv.write_season_catch_fund(False, output_line)
             # insert into the database
             lock.acquire()
-            snow_flake_id = IdWorker.get_id()
+            snow_flake_id = idWorker.get_id()
             lock.release()
             # start saving the data
             fund_insert = FundInsert()
@@ -189,3 +187,6 @@ if __name__ == '__main__':
     bootstrap_thread(crawlData, record_total, 4)
     exit()
+
+if __name__ == '__main__':
+    acquire_fund_quarter()
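Note: bootstrap_thread(crawlData, record_total, 4) fans the [0, record_total) range out across four worker threads. A sketch of what utils.index.bootstrap_thread is presumed to do, based only on how it is called here, not on the repo's actual implementation:

    from threading import Thread

    def bootstrap_thread(target, total, thread_count):
        # Presumed behaviour: split [0, total) into near-equal chunks and run
        # target(start, end) on its own thread for each chunk.
        step = max(total // thread_count, 1)
        threads = []
        for i in range(thread_count):
            start = i * step
            end = total if i == thread_count - 1 else start + step
            thread = Thread(target=target, args=(start, end))
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()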

src/acquire_fund_snapshot.py
@@ -8,17 +8,22 @@ Author: luxuemin2108@gmail.com
 Copyright (c) 2020 Camel Lu
 '''
-import re
 import math
 import os
+import re
+import sys
+
+sys.path.append(os.getcwd() + '/src')
 from time import sleep
-from bs4 import BeautifulSoup
 import pandas as pd
+from bs4 import BeautifulSoup
 from selenium.webdriver.support.ui import WebDriverWait
+from db.connect import connect
 from lib.mysnowflake import IdWorker
+from utils.index import get_star_count, bootstrap_thread
 from utils.login import login_morning_star
-from utils.index import get_star_count
-from db.connect import connect

 connect_instance = connect()
 cursor = connect_instance.cursor()
@@ -51,13 +56,12 @@ def text_to_be_present_in_element(locator, text, next_page_locator):
     return _predicate

-def get_fund_list():
+def get_fund_list(page_index):
     morning_fund_selector_url = "https://www.morningstar.cn/fundselect/default.aspx"
     chrome_driver = login_morning_star(morning_fund_selector_url, False)
     # define the starting page number
-    page_num = 443
-    page_count = 25
-    page_num_total = math.ceil(int(chrome_driver.find_element_by_xpath(
+    page_count = 25  # Morningstar's fixed page size
+    page_total = math.ceil(int(chrome_driver.find_element_by_xpath(
         '/html/body/form/div[8]/div/div[4]/div[3]/div[2]/span').text) / page_count)
     result_dir = './output/'
@@ -66,22 +70,22 @@ def get_fund_list():
     env_snapshot_table_name = os.getenv('snapshot_table_name')
     output_file_name = env_snapshot_table_name + ".csv"
     # write the CSV header
-    if page_num == 1:
+    if page_index == 1:
         with open(result_dir + output_file_name, 'w+') as csv_file:
             csv_file.write(output_head)
-    while page_num <= page_num_total:
+    while page_index <= page_total:
         # take the remainder
-        remainder = page_num_total % 10
+        remainder = page_total % 10
         # check whether this is the last page
-        num = (remainder +
-               2) if page_num > (page_num_total - remainder) else 12
+        num = (remainder +
+               2) if page_index > (page_total - remainder) else 12
         xpath_str = '/html/body/form/div[8]/div/div[4]/div[3]/div[3]/div[1]/a[%s]' % (
             num)
-        print('page_num', page_num)
+        print('page_index', page_index)
         # wait until the current-page indicator equals the page number
         WebDriverWait(chrome_driver, timeout=600).until(text_to_be_present_in_element(
-            "/html/body/form/div[8]/div/div[4]/div[3]/div[3]/div[1]/span[@style='margin-right:5px;font-weight:Bold;color:red;']", str(page_num), xpath_str))
+            "/html/body/form/div[8]/div/div[4]/div[3]/div[3]/div[1]/span[@style='margin-right:5px;font-weight:Bold;color:red;']", str(page_index), xpath_str))
         sleep(1)
         # lists to store the crawled data
         id_list = []  # snowflake ids
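Note: the a[%s] index selects which pager link to click. The pager shows ten numbered links plus first/previous/next/last controls, so "next" sits at a[12] on full rows and at a[remainder + 2] once fewer than ten page numbers remain. A worked example, with page_total = 443 assumed for illustration:

    page_total = 443              # assumed value, for illustration only
    remainder = page_total % 10   # 3 page links left on the pager's final stretch
    for page_index in (1, 440, 441, 443):
        num = (remainder + 2) if page_index > (page_total - remainder) else 12
        print(page_index, '-> a[%s]' % num)
    # prints: 1 -> a[12], 440 -> a[12], 441 -> a[5], 443 -> a[5]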
@@ -138,6 +142,7 @@ def get_fund_list():
         cursor.executemany(sql_insert, fund_list)
         connect_instance.commit()
         # print('fund_list', fund_list)
+        # write the rows out to the CSV file
         with open(result_dir + output_file_name, 'a') as csv_file:
             for fund_item in fund_list:
                 output_line = ', '.join(str(x) for x in fund_item) + '\n'
@@ -149,11 +154,12 @@ def get_fund_list():
         # click through to the next page
         next_page.click()
         sleep(3)
-        page_num += 1
+        page_index += 1
     chrome_driver.close()
     print('end')
     # chrome_driver.close()

 if __name__ == "__main__":
-    fund_list = get_fund_list()
+    page_index = 1
+    fund_list = get_fund_list(page_index)

@@ -1,6 +1,7 @@
-import pymysql
 import os
+
+import pymysql
 from dotenv import load_dotenv

src/sql_model/fund_query.py
@@ -78,13 +78,12 @@ class FundQuery(BaseQuery):
         # filter out funds that hold no stocks
         sql_count = "SELECT COUNT(1) FROM fund_morning_base as a \
             WHERE a.fund_cat NOT LIKE '%%货币%%' \
-            AND a.fund_cat NOT LIKE '%%纯债基金%%' \
-            AND a.fund_cat NOT LIKE '目标日期' \
             AND a.is_archive = 0 \
             AND a.found_date <= %s \
             AND a.fund_name NOT LIKE '%%C' \
             AND a.fund_name NOT LIKE '%%B' \
-            AND a.fund_cat NOT LIKE '%%短债基金%%' \
+            AND a.fund_cat NOT IN ('利率债', '短债基金', '短债型', '短债基金(封闭)', '纯债基金', '纯债基金(封闭)', \
+            '普通债券型', '普通债券型基金', '普通债券型基金(封闭)', '信用债', '信用债(封闭)', '目标日期') \
             AND a.fund_code NOT IN( SELECT fund_code FROM fund_morning_quarter as b \
             WHERE b.quarter_index = %s);"
         self.cursor.execute(sql_count, [self.quarter_date, self.quarter_index])
@@ -98,9 +97,8 @@ class FundQuery(BaseQuery):
             t.morning_star_code, t.fund_name, t.fund_cat \
             FROM fund_morning_base as t \
             WHERE t.fund_cat NOT LIKE '%%货币%%' \
-            AND t.fund_cat NOT LIKE '%%纯债基金%%' \
-            AND t.fund_cat NOT LIKE '目标日期' \
-            AND t.fund_cat NOT LIKE '%%短债基金%%' \
+            AND t.fund_cat NOT IN ('利率债', '短债基金', '短债型', '短债基金(封闭)', '纯债基金', '纯债基金(封闭)', \
+            '普通债券型', '普通债券型基金', '普通债券型基金(封闭)', '信用债', '信用债(封闭)', '目标日期') \
             AND t.found_date <= %s \
             AND t.is_archive = 0 \
             AND t.fund_name NOT LIKE '%%C' \
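Note: replacing the stacked NOT LIKE filters with one NOT IN list makes the excluded bond and target-date categories explicit, but the same tuple is now embedded in both queries. A hypothetical refactor (not part of this commit) that keeps the blacklist in one place and passes it through the parameter list:

    # Hypothetical shared constant; FundQuery currently inlines this list twice.
    EXCLUDED_FUND_CATS = (
        '利率债', '短债基金', '短债型', '短债基金(封闭)',
        '纯债基金', '纯债基金(封闭)', '普通债券型', '普通债券型基金',
        '普通债券型基金(封闭)', '信用债', '信用债(封闭)', '目标日期',
    )
    placeholders = ', '.join(['%s'] * len(EXCLUDED_FUND_CATS))
    sql = "... AND a.fund_cat NOT IN (%s) ..." % placeholders
    # cursor.execute(sql, [*other_params, *EXCLUDED_FUND_CATS])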
