fix: 🐛 handle empty match lists when parsing stock/bond positions

main
jackluson 4 years ago
parent 4cb5f77e74
commit 6a390c5774

@ -10,7 +10,7 @@ Copyright (c) 2020 Camel Lu
import math
from utils import parse_cookiestr, set_cookies, login_site
from fund_info_crawler import FundInfo
from fund_info_crawler import FundSpider
from lib.mysnowflake import IdWorker
import pymysql
connect = pymysql.connect(host='127.0.0.1', user='root',
@ -56,7 +56,7 @@ if __name__ == '__main__':
sql, [page_start, page_limit]) # 执行sql语句
results = cursor.fetchall() # 获取查询的所有记录
for record in results:
each_fund = FundInfo(
each_fund = FundSpider(
record[0], record[1], record[2], chrome_driver, morning_cookies)
# 从晨星网上更新信息
is_normal = each_fund.go_fund_url()

@ -12,7 +12,7 @@ Copyright (c) 2020 Camel Lu
import math
from threading import Thread, Lock, current_thread
from utils import parse_cookiestr, set_cookies, login_site
from fund_info_crawler import FundInfo
from fund_info_crawler import FundSpider
from lib.mysnowflake import IdWorker
from time import sleep, time
import pymysql
@ -57,6 +57,18 @@ if __name__ == '__main__':
record_total = count[0]
page_start = 0
error_funds = []
output_catch_head = '代码' + ',' + '晨星专属号' + ',' + '名称' + ',' + \
'类型' + '股票总仓位' + '页码' + '备注' + '\n'
# 设置表头
result_dir = './output/'
if page_start == 0:
with open(result_dir + 'fund_morning_season_catch.csv', 'w+') as csv_file:
csv_file.write(output_catch_head)
output_catch_error = '代码' + ',' + '晨星专属号' + ',' + '名称' + ',' + \
'类型' + '页码' + '备注' + '\n'
if page_start == 0:
with open(result_dir + 'fund_morning_season_error.csv', 'w+') as csv_file:
csv_file.write(output_catch_error)
def crawlData(start, end):
chrome_driver = login()
@ -84,20 +96,38 @@ if __name__ == '__main__':
for record in results:
sleep(1)
print(current_thread().getName(), 'record-->', record)
each_fund = FundInfo(
each_fund = FundSpider(
record[0], record[1], record[2], chrome_driver, morning_cookies)
is_normal = each_fund.go_fund_url()
if is_normal == False:
lock.acquire()
error_funds.append(each_fund.fund_code)
fund_infos = [each_fund.fund_code, each_fund.morning_star_code,
each_fund.fund_name, each_fund.fund_cat, page_start, '页面跳转有问题']
with open(result_dir + 'fund_morning_season_error.csv', 'a') as csv_file:
output_line = ', '.join(str(x)
for x in fund_infos) + '\n'
csv_file.write(output_line)
lock.release()
continue
each_fund.get_fund_manager_info()
each_fund.get_fund_season_info()
if each_fund._is_trigger_catch == True:
lock.acquire()
fund_infos = [each_fund.fund_code, each_fund.morning_star_code,
each_fund.fund_name, each_fund.fund_cat,
each_fund.stock_position['stock_total_position'],
page_start, each_fund._catch_detail]
with open(result_dir + 'fund_morning_season_catch.csv', 'a') as csv_file:
output_line = ', '.join(str(x)
for x in fund_infos) + '\n'
csv_file.write(output_line)
lock.release()
fundDict = dict((name, getattr(each_fund, name))
for name in vars(each_fund)
if not (name.startswith('_') or getattr(each_fund, name) == None))
print(current_thread().getName(), fundDict)
continue
# print(current_thread().getName(), fundDict)
page_start = page_start + page_limit
print(current_thread().getName(), 'page_start', page_start)
sleep(3)
@ -107,7 +137,8 @@ if __name__ == '__main__':
step_num = 2500
for i in range(3):
print(i * step_num, (i+1) * step_num)
t = Thread(target=crawlData, args=(i * step_num, (i+1) * step_num))
t = Thread(target=crawlData, args=(
i * step_num, (i+1) * step_num))
t.setDaemon(True)
threaders.append(t)
t.start()

@ -15,7 +15,7 @@ from utils import parse_cookiestr, set_cookies, login_site
from selenium.common.exceptions import NoSuchElementException
class FundInfo:
class FundSpider:
# 初始化定义,利用基金代码、基金名称进行唯一化
def __init__(self, code, namecode, name, chrome_driver, morning_cookies):
self.season_number = '2021-1s'
@ -25,6 +25,8 @@ class FundInfo:
self._morning_cookies = morning_cookies or None
self._chrome_driver = chrome_driver or None
self._is_trigger_catch = False
self._catch_detail = None
# 基本信息
self.fund_cat = None # 基金分类
@ -87,9 +89,9 @@ class FundInfo:
# 判断是否页面出错,重定向,如果是的话跳过
if self._chrome_driver.current_url == 'https://www.morningstar.cn/errors/defaulterror.html':
return False
if self._chrome_driver.page_source == None:
while self._chrome_driver.page_source == None:
self._chrome_driver.refresh()
print('fund_code', self.fund_code)
print('wait:fund_code', self.fund_code)
sleep(9)
# self._chrome_driver.execute_script('location.reload()')
@ -99,6 +101,8 @@ class FundInfo:
parent_id).find_element_by_class_name(class_name).text
return text if text != '-' else None
except NoSuchElementException:
self._is_trigger_catch = True
self._catch_detail = parent_id + '-' + class_name
print('error_fund_info:', self.fund_code,
'-', self.morning_star_code, self.stock_position["stock_total_position"])
file_name = './abnormal/' + self.fund_code + \
@ -114,6 +118,8 @@ class FundInfo:
id).text
return text if text != '-' else None
except NoSuchElementException:
self._is_trigger_catch = True
self._catch_detail = id
print('error_fund_info:', self.fund_code,
'-', self.morning_star_code, self.stock_position["stock_total_position"])
file_name = './abnormal/' + '-' + id + self.fund_code + "-no_such_element.png"
@ -132,6 +138,8 @@ class FundInfo:
text = parent_el.find_element_by_xpath(xpath).text
return text if text != '-' else None
except NoSuchElementException:
self._is_trigger_catch = True
self._catch_detail = xpath
print('error_fund_info:', self.fund_code,
'-', self.morning_star_code, self.stock_position["stock_total_position"])
file_name = './abnormal/' + \
@ -171,6 +179,7 @@ class FundInfo:
self.manager['start_date'] = manager_start_date
self.manager['brife'] = manager_brife
except NoSuchElementException:
self._is_trigger_catch = True
print('error_fund_info:', self.fund_code,
'-', self.morning_star_code)
file_name = './abnormal/manager-' + self.fund_code + "-no_such_element.png"
@ -200,17 +209,21 @@ class FundInfo:
# 十大股票仓位
ten_stock_position = None
ten_stock_position_text = self.get_element_text_by_id("qt_stocktab")
if ten_stock_position_text != None:
ten_stock_position = re.findall(
r"\d+\.?\d*", ten_stock_position_text).pop(0)
if ten_stock_position_text != None or ten_stock_position_text != '-':
ten_stock_position_list = re.findall(
r"\d+\.?\d*", ten_stock_position_text)
if len(ten_stock_position_list) > 0:
ten_stock_position = ten_stock_position_list.pop(0)
self.stock_position["ten_stock_position"] = ten_stock_position
# 五大债券仓位
five_bond_position = None
five_bond_position_text = self.get_element_text_by_id("qt_bondstab")
if five_bond_position_text != None:
five_bond_position = re.findall(
r"\d+\.?\d*", five_bond_position_text).pop(0)
if five_bond_position_text != None or five_bond_position_text != '-':
five_bond_position_list = re.findall(
r"\d+\.?\d*", five_bond_position_text)
if len(five_bond_position_list) > 0:
five_bond_position = five_bond_position_list.pop(0)
self.bond_position["five_bond_position"] = five_bond_position
# 获取标准差

Loading…
Cancel
Save