feat:add base to main

main
jackluson 3 years ago
parent 9bf86a3d4e
commit 475941b6de

@ -9,25 +9,23 @@ Copyright (c) 2021 Camel Lu
''' '''
import logging import logging
import os
import sys import sys
sys.path.append('./src') sys.path.append('./src')
from src.acquire_fund_snapshot import get_fund_list from src.acquire_fund_snapshot import get_fund_list
from src.acquire_fund_base import acquire_fund_base
def main(): def main():
input_value = input("请输入下列序号执行操作:\n \ input_value = input("请输入下列序号执行操作:\n \
1.快照 \n \ 1.快照 \n \
2.行业个股\n \ 2.新基入库\n \
3.股票日更\n \
4.个股+日更\n \
5.财务指标\n \
6.A股估值\n \
输入") 输入")
if input_value == '1' or input_value == '快照': if input_value == '1' or input_value == '快照':
get_fund_list() # 执行申万行业信息入库 page_index = 1
get_fund_list(page_index) # 执行申万行业信息入库
elif input_value == '2' or input_value == '新基入库':
acquire_fund_base() # 执行行业股票信息入库
if __name__ == '__main__': if __name__ == '__main__':

File diff suppressed because it is too large Load Diff

@ -7,7 +7,6 @@ Author: luxuemin2108@gmail.com
----- -----
Copyright (c) 2020 Camel Lu Copyright (c) 2020 Camel Lu
''' '''
from time import sleep
from threading import Lock from threading import Lock
from utils.login import login_morning_star from utils.login import login_morning_star
from utils.index import bootstrap_thread from utils.index import bootstrap_thread
@ -16,21 +15,20 @@ from lib.mysnowflake import IdWorker
from sql_model.fund_query import FundQuery from sql_model.fund_query import FundQuery
from sql_model.fund_insert import FundInsert from sql_model.fund_insert import FundInsert
def acquire_fund_base():
if __name__ == '__main__':
lock = Lock() lock = Lock()
each_fund_query = FundQuery() each_fund_query = FundQuery()
each_fund_insert = FundInsert() each_fund_insert = FundInsert()
record_total = each_fund_query.get_fund_count_from_snapshot_no_exist() # 获取记录条数 record_total = each_fund_query.get_fund_count_from_snapshot_no_exist() # 获取记录条数
IdWorker = IdWorker() idWorker = IdWorker()
print('record_total', record_total) print('record_total', record_total)
error_funds = [] # 一些异常的基金详情页如果发现记录该基金的code error_funds = [] # 一些异常的基金详情页如果发现记录该基金的code
def crawlData(start, end): def crawlData(start, end):
login_url = 'https://www.morningstar.cn/membership/signin.aspx' login_url = 'https://www.morningstar.cn/membership/signin.aspx'
chrome_driver = login_morning_star(login_url, True) chrome_driver = login_morning_star(login_url, False)
page_start = start page_start = start
page_limit = 10 page_limit = 10
# 遍历从基金列表的单支基金 # 遍历从基金列表的单支基金
@ -56,7 +54,7 @@ if __name__ == '__main__':
continue continue
# 拼接sql需要的数据 # 拼接sql需要的数据
lock.acquire() lock.acquire()
snow_flake_id = IdWorker.get_id() snow_flake_id = idWorker.get_id()
lock.release() lock.release()
base_dict = { base_dict = {
'id': snow_flake_id, 'id': snow_flake_id,
@ -71,5 +69,9 @@ if __name__ == '__main__':
page_start = page_start + page_limit page_start = page_start + page_limit
print('page_start', page_start) print('page_start', page_start)
chrome_driver.close() chrome_driver.close()
bootstrap_thread(crawlData, record_total, 2)
bootstrap_thread(crawlData, record_total, 4)
print('error_funds', error_funds) print('error_funds', error_funds)
if __name__ == '__main__':
acquire_fund_base()

@ -22,7 +22,7 @@ from selenium.webdriver.support.ui import WebDriverWait
from db.connect import connect from db.connect import connect
from lib.mysnowflake import IdWorker from lib.mysnowflake import IdWorker
from utils.index import get_star_count from utils.index import get_star_count, bootstrap_thread
from utils.login import login_morning_star from utils.login import login_morning_star
connect_instance = connect() connect_instance = connect()
@ -56,13 +56,12 @@ def text_to_be_present_in_element(locator, text, next_page_locator):
return _predicate return _predicate
def get_fund_list(): def get_fund_list(page_index):
morning_fund_selector_url = "https://www.morningstar.cn/fundselect/default.aspx" morning_fund_selector_url = "https://www.morningstar.cn/fundselect/default.aspx"
chrome_driver = login_morning_star(morning_fund_selector_url, False) chrome_driver = login_morning_star(morning_fund_selector_url, False)
# 定义起始页码 # 定义起始页码
page_num = 9
page_count = 25 # 晨星固定分页数 page_count = 25 # 晨星固定分页数
page_num_total = math.ceil(int(chrome_driver.find_element_by_xpath( page_total = math.ceil(int(chrome_driver.find_element_by_xpath(
'/html/body/form/div[8]/div/div[4]/div[3]/div[2]/span').text) / page_count) '/html/body/form/div[8]/div/div[4]/div[3]/div[2]/span').text) / page_count)
result_dir = './output/' result_dir = './output/'
@ -71,22 +70,22 @@ def get_fund_list():
env_snapshot_table_name = os.getenv('snapshot_table_name') env_snapshot_table_name = os.getenv('snapshot_table_name')
output_file_name = env_snapshot_table_name + ".csv" output_file_name = env_snapshot_table_name + ".csv"
# 设置表头 # 设置表头
if page_num == 1: if page_index == 1:
with open(result_dir + output_file_name, 'w+') as csv_file: with open(result_dir + output_file_name, 'w+') as csv_file:
csv_file.write(output_head) csv_file.write(output_head)
while page_num <= page_num_total: while page_index <= page_total:
# 求余 # 求余
remainder = page_num_total % 10 remainder = page_total % 10
# 判断是否最后一页 # 判断是否最后一页
num = (remainder + num = (remainder +
2) if page_num > (page_num_total - remainder) else 12 2) if page_index > (page_total - remainder) else 12
xpath_str = '/html/body/form/div[8]/div/div[4]/div[3]/div[3]/div[1]/a[%s]' % ( xpath_str = '/html/body/form/div[8]/div/div[4]/div[3]/div[3]/div[1]/a[%s]' % (
num) num)
print('page_num', page_num) print('page_index', page_index)
# 等待直到当前页样式判断等于page_num # 等待直到当前页样式判断等于page_num
WebDriverWait(chrome_driver, timeout=600).until(text_to_be_present_in_element( WebDriverWait(chrome_driver, timeout=600).until(text_to_be_present_in_element(
"/html/body/form/div[8]/div/div[4]/div[3]/div[3]/div[1]/span[@style='margin-right:5px;font-weight:Bold;color:red;']", str(page_num), xpath_str)) "/html/body/form/div[8]/div/div[4]/div[3]/div[3]/div[1]/span[@style='margin-right:5px;font-weight:Bold;color:red;']", str(page_index), xpath_str))
sleep(1) sleep(1)
# 列表用于存放爬取的数据 # 列表用于存放爬取的数据
id_list = [] # 雪花id id_list = [] # 雪花id
@ -155,11 +154,12 @@ def get_fund_list():
# 点击下一页 # 点击下一页
next_page.click() next_page.click()
sleep(3) sleep(3)
page_num += 1 page_index += 1
chrome_driver.close() chrome_driver.close()
print('end') print('end')
# chrome_driver.close() # chrome_driver.close()
if __name__ == "__main__": if __name__ == "__main__":
fund_list = get_fund_list() page_index = 1
fund_list = get_fund_list(page_index)

Loading…
Cancel
Save