feat: add base to main

main
jackluson 3 years ago
parent 9bf86a3d4e
commit 475941b6de

@@ -9,25 +9,23 @@ Copyright (c) 2021 Camel Lu
 '''
 import logging
 import os
 import sys

 sys.path.append('./src')
 from src.acquire_fund_snapshot import get_fund_list
 from src.acquire_fund_base import acquire_fund_base


 def main():
     input_value = input("Enter one of the numbers below to run an action:\n \
         1. Snapshot \n \
-        2. Industry stocks\n \
-        3. Daily stock updates\n \
-        4. Stocks + daily updates\n \
-        5. Financial indicators\n \
-        6. A-share valuation\n \
+        2. New fund import\n \
         Input: ")
     if input_value == '1' or input_value == 'snapshot':
-        get_fund_list()  # fetch the fund snapshot list
+        page_index = 1
+        get_fund_list(page_index)  # fetch the fund snapshot list
+    elif input_value == '2' or input_value == 'new fund import':
+        acquire_fund_base()  # import base info for newly listed funds

 if __name__ == '__main__':
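
A side effect of the new entry point taking page_index as an argument (instead of the hard-coded page_num = 9 this commit removes from the snapshot crawler below) is that an interrupted snapshot crawl can be resumed from an arbitrary page. A minimal sketch, assuming get_fund_list(page_index) crawls from the given page through the last one:

    import sys
    sys.path.append('./src')
    from src.acquire_fund_snapshot import get_fund_list

    # hypothetical resume call, not part of this commit:
    get_fund_list(42)  # pick a crashed crawl back up at page 42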

File diff suppressed because it is too large.

@@ -7,7 +7,6 @@ Author: luxuemin2108@gmail.com
 -----
 Copyright (c) 2020 Camel Lu
 '''
-from time import sleep
 from threading import Lock

 from utils.login import login_morning_star
 from utils.index import bootstrap_thread
@@ -16,21 +15,20 @@ from lib.mysnowflake import IdWorker
 from sql_model.fund_query import FundQuery
 from sql_model.fund_insert import FundInsert


-if __name__ == '__main__':
+def acquire_fund_base():
     lock = Lock()
     each_fund_query = FundQuery()
     each_fund_insert = FundInsert()
     record_total = each_fund_query.get_fund_count_from_snapshot_no_exist()  # number of records still to process
-    IdWorker = IdWorker()
+    idWorker = IdWorker()
     print('record_total', record_total)
     error_funds = []  # record a fund's code if its detail page turns out to be abnormal

     def crawlData(start, end):
         login_url = 'https://www.morningstar.cn/membership/signin.aspx'
-        chrome_driver = login_morning_star(login_url, True)
+        chrome_driver = login_morning_star(login_url, False)
         page_start = start
         page_limit = 10
         # iterate over each fund in the fund list
@@ -56,7 +54,7 @@ if __name__ == '__main__':
                 continue
             # assemble the data needed for the SQL insert
             lock.acquire()
-            snow_flake_id = IdWorker.get_id()
+            snow_flake_id = idWorker.get_id()
             lock.release()
             base_dict = {
                 'id': snow_flake_id,
@@ -71,5 +69,9 @@ if __name__ == '__main__':
             page_start = page_start + page_limit
             print('page_start', page_start)
         chrome_driver.close()
-    bootstrap_thread(crawlData, record_total, 2)
+    bootstrap_thread(crawlData, record_total, 4)
     print('error_funds', error_funds)
+
+
+if __name__ == '__main__':
+    acquire_fund_base()
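
bootstrap_thread itself is not shown in this diff. A plausible reading of bootstrap_thread(crawlData, record_total, 4) is that it slices the range [0, record_total) into four contiguous chunks and runs crawlData(start, end) for each chunk on its own thread; that would also explain why crawlData takes a (start, end) pair and why idWorker.get_id() is wrapped in a Lock (all four threads share one snowflake generator). A sketch of such a helper, as an assumption about utils/index.py rather than its actual contents:

    import math
    from threading import Thread

    def bootstrap_thread(worker, total, thread_count):
        # split [0, total) into thread_count contiguous chunks and
        # run worker(start, end) for each chunk on its own thread
        chunk = math.ceil(total / thread_count)
        threads = []
        for i in range(thread_count):
            start = i * chunk
            end = min(start + chunk, total)
            if start >= end:
                break  # fewer records than threads
            thread = Thread(target=worker, args=(start, end))
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()  # return only after every chunk is processed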

@@ -22,7 +22,7 @@ from selenium.webdriver.support.ui import WebDriverWait
 from db.connect import connect
 from lib.mysnowflake import IdWorker
-from utils.index import get_star_count
+from utils.index import get_star_count, bootstrap_thread
 from utils.login import login_morning_star

 connect_instance = connect()
@@ -56,13 +56,12 @@ def text_to_be_present_in_element(locator, text, next_page_locator):
     return _predicate


-def get_fund_list():
+def get_fund_list(page_index):
     morning_fund_selector_url = "https://www.morningstar.cn/fundselect/default.aspx"
     chrome_driver = login_morning_star(morning_fund_selector_url, False)
     # starting page number
-    page_num = 9
     page_count = 25  # Morningstar's fixed page size
-    page_num_total = math.ceil(int(chrome_driver.find_element_by_xpath(
+    page_total = math.ceil(int(chrome_driver.find_element_by_xpath(
         '/html/body/form/div[8]/div/div[4]/div[3]/div[2]/span').text) / page_count)
     result_dir = './output/'
@@ -71,22 +70,22 @@ def get_fund_list():
     env_snapshot_table_name = os.getenv('snapshot_table_name')
     output_file_name = env_snapshot_table_name + ".csv"
     # write the header row
-    if page_num == 1:
+    if page_index == 1:
         with open(result_dir + output_file_name, 'w+') as csv_file:
             csv_file.write(output_head)
-    while page_num <= page_num_total:
+    while page_index <= page_total:
         # remainder of the total page count divided by 10
-        remainder = page_num_total % 10
+        remainder = page_total % 10
         # check whether we are in the last block of pages
         num = (remainder +
-               2) if page_num > (page_num_total - remainder) else 12
+               2) if page_index > (page_total - remainder) else 12
         xpath_str = '/html/body/form/div[8]/div/div[4]/div[3]/div[3]/div[1]/a[%s]' % (
             num)
-        print('page_num', page_num)
+        print('page_index', page_index)
         # wait until the pager's current-page indicator equals page_num
         WebDriverWait(chrome_driver, timeout=600).until(text_to_be_present_in_element(
-            "/html/body/form/div[8]/div/div[4]/div[3]/div[3]/div[1]/span[@style='margin-right:5px;font-weight:Bold;color:red;']", str(page_num), xpath_str))
+            "/html/body/form/div[8]/div/div[4]/div[3]/div[3]/div[1]/span[@style='margin-right:5px;font-weight:Bold;color:red;']", str(page_index), xpath_str))
         sleep(1)
         # lists to hold the scraped data
         id_list = []  # snowflake ids
@@ -155,11 +154,12 @@ def get_fund_list():
             # click the next-page link
             next_page.click()
             sleep(3)
-            page_num += 1
+            page_index += 1
-    chrome_driver.close()
     print('end')
+    # chrome_driver.close()


 if __name__ == "__main__":
-    fund_list = get_fund_list()
+    page_index = 1
+    fund_list = get_fund_list(page_index)
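
The num arithmetic in the loop above is the least obvious part of the snapshot crawler: Morningstar's pager renders a window of ten page links plus navigation anchors, and the XPath a[num] has to land on the next-page anchor, whose position shifts once the crawl reaches the final, partial window of pages. A worked example of the same expression, with hypothetical numbers:

    page_total = 137             # e.g. ceil(3414 funds / 25 per page)
    remainder = page_total % 10  # 7 pages left in the final pager window

    for page_index in (5, 131):
        # outside the last window the next-page anchor sits at the fixed
        # position 12; inside it only `remainder` page links remain, so
        # the anchor moves up to position remainder + 2
        num = (remainder +
               2) if page_index > (page_total - remainder) else 12
        print(page_index, '->', 'a[%s]' % num)
    # prints: 5 -> a[12]
    #         131 -> a[9]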
