fix: 🐛 handle empty match lists when parsing stock/bond positions

main
jackluson 4 years ago
parent 4cb5f77e74
commit 6a390c5774

@ -10,7 +10,7 @@ Copyright (c) 2020 Camel Lu
import math
from utils import parse_cookiestr, set_cookies, login_site
from fund_info_crawler import FundInfo
from fund_info_crawler import FundSpider
from lib.mysnowflake import IdWorker
import pymysql
connect = pymysql.connect(host='127.0.0.1', user='root',
@ -56,7 +56,7 @@ if __name__ == '__main__':
sql, [page_start, page_limit]) # 执行sql语句
results = cursor.fetchall() # 获取查询的所有记录
for record in results:
each_fund = FundInfo(
each_fund = FundSpider(
record[0], record[1], record[2], chrome_driver, morning_cookies)
# 从晨星网上更新信息
is_normal = each_fund.go_fund_url()

@ -12,7 +12,7 @@ Copyright (c) 2020 Camel Lu
import math
from threading import Thread, Lock, current_thread
from utils import parse_cookiestr, set_cookies, login_site
from fund_info_crawler import FundInfo
from fund_info_crawler import FundSpider
from lib.mysnowflake import IdWorker
from time import sleep, time
import pymysql
@ -57,6 +57,18 @@ if __name__ == '__main__':
record_total = count[0]
page_start = 0
error_funds = []
output_catch_head = '代码' + ',' + '晨星专属号' + ',' + '名称' + ',' + \
'类型' + '股票总仓位' + '页码' + '备注' + '\n'
# 设置表头
result_dir = './output/'
if page_start == 0:
with open(result_dir + 'fund_morning_season_catch.csv', 'w+') as csv_file:
csv_file.write(output_catch_head)
output_catch_error = '代码' + ',' + '晨星专属号' + ',' + '名称' + ',' + \
'类型' + '页码' + '备注' + '\n'
if page_start == 0:
with open(result_dir + 'fund_morning_season_error.csv', 'w+') as csv_file:
csv_file.write(output_catch_error)
def crawlData(start, end):
chrome_driver = login()
@ -84,20 +96,38 @@ if __name__ == '__main__':
for record in results:
sleep(1)
print(current_thread().getName(), 'record-->', record)
each_fund = FundInfo(
each_fund = FundSpider(
record[0], record[1], record[2], chrome_driver, morning_cookies)
is_normal = each_fund.go_fund_url()
if is_normal == False:
lock.acquire()
error_funds.append(each_fund.fund_code)
fund_infos = [each_fund.fund_code, each_fund.morning_star_code,
each_fund.fund_name, each_fund.fund_cat, page_start, '页面跳转有问题']
with open(result_dir + 'fund_morning_season_error.csv', 'a') as csv_file:
output_line = ', '.join(str(x)
for x in fund_infos) + '\n'
csv_file.write(output_line)
lock.release()
continue
each_fund.get_fund_manager_info()
each_fund.get_fund_season_info()
if each_fund._is_trigger_catch == True:
lock.acquire()
fund_infos = [each_fund.fund_code, each_fund.morning_star_code,
each_fund.fund_name, each_fund.fund_cat,
each_fund.stock_position['stock_total_position'],
page_start, each_fund._catch_detail]
with open(result_dir + 'fund_morning_season_catch.csv', 'a') as csv_file:
output_line = ', '.join(str(x)
for x in fund_infos) + '\n'
csv_file.write(output_line)
lock.release()
fundDict = dict((name, getattr(each_fund, name))
for name in vars(each_fund)
if not (name.startswith('_') or getattr(each_fund, name) == None))
print(current_thread().getName(), fundDict)
continue
# print(current_thread().getName(), fundDict)
page_start = page_start + page_limit
print(current_thread().getName(), 'page_start', page_start)
sleep(3)
@ -107,7 +137,8 @@ if __name__ == '__main__':
step_num = 2500
for i in range(3):
print(i * step_num, (i+1) * step_num)
t = Thread(target=crawlData, args=(i * step_num, (i+1) * step_num))
t = Thread(target=crawlData, args=(
i * step_num, (i+1) * step_num))
t.setDaemon(True)
threaders.append(t)
t.start()

@ -15,7 +15,7 @@ from utils import parse_cookiestr, set_cookies, login_site
from selenium.common.exceptions import NoSuchElementException
class FundInfo:
class FundSpider:
# 初始化定义,利用基金代码、基金名称进行唯一化
def __init__(self, code, namecode, name, chrome_driver, morning_cookies):
self.season_number = '2021-1s'
@ -25,6 +25,8 @@ class FundInfo:
self._morning_cookies = morning_cookies or None
self._chrome_driver = chrome_driver or None
self._is_trigger_catch = False
self._catch_detail = None
# 基本信息
self.fund_cat = None # 基金分类
@ -87,9 +89,9 @@ class FundInfo:
# 判断是否页面出错,重定向,如果是的话跳过
if self._chrome_driver.current_url == 'https://www.morningstar.cn/errors/defaulterror.html':
return False
if self._chrome_driver.page_source == None:
while self._chrome_driver.page_source == None:
self._chrome_driver.refresh()
print('fund_code', self.fund_code)
print('wait:fund_code', self.fund_code)
sleep(9)
# self._chrome_driver.execute_script('location.reload()')
@ -99,6 +101,8 @@ class FundInfo:
parent_id).find_element_by_class_name(class_name).text
return text if text != '-' else None
except NoSuchElementException:
self._is_trigger_catch = True
self._catch_detail = parent_id + '-' + class_name
print('error_fund_info:', self.fund_code,
'-', self.morning_star_code, self.stock_position["stock_total_position"])
file_name = './abnormal/' + self.fund_code + \
@ -114,6 +118,8 @@ class FundInfo:
id).text
return text if text != '-' else None
except NoSuchElementException:
self._is_trigger_catch = True
self._catch_detail = id
print('error_fund_info:', self.fund_code,
'-', self.morning_star_code, self.stock_position["stock_total_position"])
file_name = './abnormal/' + '-' + id + self.fund_code + "-no_such_element.png"
@ -132,6 +138,8 @@ class FundInfo:
text = parent_el.find_element_by_xpath(xpath).text
return text if text != '-' else None
except NoSuchElementException:
self._is_trigger_catch = True
self._catch_detail = xpath
print('error_fund_info:', self.fund_code,
'-', self.morning_star_code, self.stock_position["stock_total_position"])
file_name = './abnormal/' + \
@ -171,6 +179,7 @@ class FundInfo:
self.manager['start_date'] = manager_start_date
self.manager['brife'] = manager_brife
except NoSuchElementException:
self._is_trigger_catch = True
print('error_fund_info:', self.fund_code,
'-', self.morning_star_code)
file_name = './abnormal/manager-' + self.fund_code + "-no_such_element.png"
@ -200,17 +209,21 @@ class FundInfo:
# 十大股票仓位
ten_stock_position = None
ten_stock_position_text = self.get_element_text_by_id("qt_stocktab")
if ten_stock_position_text != None:
ten_stock_position = re.findall(
r"\d+\.?\d*", ten_stock_position_text).pop(0)
if ten_stock_position_text != None or ten_stock_position_text != '-':
ten_stock_position_list = re.findall(
r"\d+\.?\d*", ten_stock_position_text)
if len(ten_stock_position_list) > 0:
ten_stock_position = ten_stock_position_list.pop(0)
self.stock_position["ten_stock_position"] = ten_stock_position
# 五大债券仓位
five_bond_position = None
five_bond_position_text = self.get_element_text_by_id("qt_bondstab")
if five_bond_position_text != None:
five_bond_position = re.findall(
r"\d+\.?\d*", five_bond_position_text).pop(0)
if five_bond_position_text != None or five_bond_position_text != '-':
five_bond_position_list = re.findall(
r"\d+\.?\d*", five_bond_position_text)
if len(five_bond_position_list) > 0:
five_bond_position = five_bond_position_list.pop(0)
self.bond_position["five_bond_position"] = five_bond_position
# 获取标准差

Loading…
Cancel
Save