chore: fix connect & base data

main
jackluson 3 years ago
parent 5328792bb0
commit 6f71cf9b83

@ -1,3 +1,12 @@
morning_star_username="xxxx"
morning_star_password="xxx"
snapshot_table_name="xxx"
# about database
snapshot_table_name="fund_morning_snapshot_2021_q1"
db_host="127.0.0.1"
db_name="fund_work"
db_user="xxx"
db_password="xxxx"

1
.gitignore vendored

@ -8,6 +8,7 @@ code-record/*
*fund_morning_star.csv
abnormal
demo/
output/
.env
# C extensions

@ -7,7 +7,7 @@ Author: luxuemin2108@gmail.com
-----
Copyright (c) 2020 Camel Lu
'''
import os
import math
from utils import parse_cookiestr, set_cookies, login_site
from fund_info_crawler import FundSpider
@ -15,7 +15,8 @@ from lib.mysnowflake import IdWorker
import pymysql
from db.connect import connect
cursor = connect.cursor()
connect_instance = connect()
cursor = connect_instance.cursor()
def login():
@ -36,22 +37,26 @@ def login():
if __name__ == '__main__':
chrome_driver = login()
morning_cookies = chrome_driver.get_cookies()
# 获取数据库的基金列表
sql_count = "SELECT count(*) FROM fund_morning_snapshot WHERE fund_code IS NOT NULL AND morning_star_code IS NOT NULL"
env_snapshot_table_name = os.getenv('snapshot_table_name')
sql_count = "SELECT count(*) FROM " + env_snapshot_table_name + \
" WHERE fund_code NOT IN (SELECT fund_code FROM fund_morning_base);"
cursor.execute(sql_count)
count = cursor.fetchone() # 获取记录条数
print('count', count[0])
chrome_driver = login()
morning_cookies = chrome_driver.get_cookies()
IdWorker = IdWorker()
page_limit = 10
record_total = count[0]
page_start = 0
error_funds = ['005086'] # 一些异常的基金详情页如果发现记录该基金的code
error_funds = [] # 一些异常的基金详情页如果发现记录该基金的code
# 遍历从基金列表的单支基金
while(page_start < record_total):
sql = "SELECT fund_code, morning_star_code, fund_name FROM fund_morning_snapshot WHERE fund_code IS NOT NULL AND morning_star_code IS NOT NULL ORDER BY fund_code LIMIT %s, %s"
# 从快照表(表名取自环境变量 snapshot_table_name)中查出 fund_morning_base 里尚不存在的基金
sql = "SELECT fund_code, morning_star_code, fund_name FROM " + env_snapshot_table_name + \
" WHERE fund_code NOT IN (SELECT fund_code FROM fund_morning_base) ORDER BY fund_code LIMIT %s, %s"
cursor.execute(
sql, [page_start, page_limit]) # 执行sql语句
results = cursor.fetchall() # 获取查询的所有记录
@ -64,6 +69,7 @@ if __name__ == '__main__':
error_funds.append(each_fund.fund_code)
continue
each_fund.get_fund_base_info()
# 去掉没有成立时间的
if each_fund.found_date == '-':
error_funds.append(each_fund.fund_code)
continue
@ -86,14 +92,14 @@ if __name__ == '__main__':
continue
update_values = update_values + '{0}=VALUES({0}),'.format(key)
# 入库,不存在则创建,存在则更新
base_sql_insert = "INSERT INTO {table} ({keys}) VALUES ({values}) ON DUPLICATE KEY UPDATE {update_values}; ".format(
base_sql_insert = "INSERT INTO {table} ({keys}) VALUES ({values}) ON DUPLICATE KEY UPDATE {update_values}; ".format(
table='fund_morning_base',
keys=keys,
values=values,
update_values=update_values[0:-1]
)
cursor.execute(base_sql_insert, tuple(base_dict.values()))
connect.commit()
connect_instance.commit()
page_start = page_start + page_limit
print('page_start', page_start)
chrome_driver.close()

@ -19,7 +19,9 @@ from time import sleep, time
from pprint import pprint
import pandas
cursor = connect.cursor()
connect_instance = connect()
cursor = connect_instance.cursor()
lock = Lock()
@ -171,7 +173,7 @@ if __name__ == '__main__':
lock.acquire()
cursor.execute(manager_sql_insert,
tuple(manager_dict.values()))
connect.commit()
connect_instance.commit()
lock.release()
# 季度信息 TODO: 对比数据更新时间field
season_dict = {
@ -207,7 +209,7 @@ if __name__ == '__main__':
lock.acquire()
cursor.execute(season_sql_insert,
tuple(season_dict.values()))
connect.commit()
connect_instance.commit()
lock.release()
# 入库十大股票持仓
stock_position_total = each_fund.stock_position.get(
@ -236,7 +238,7 @@ if __name__ == '__main__':
# print('stock_sql_insert', stock_sql_insert)
cursor.execute(stock_sql_insert,
tuple(stock_dict.values()))
connect.commit()
connect_instance.commit()
lock.release()
# pprint(fundDict)
page_start = page_start + page_limit

@ -19,7 +19,8 @@ from lib.mysnowflake import IdWorker
from utils import parse_cookiestr, set_cookies, login_site, get_star_count
from db.connect import connect
cursor = connect.cursor()
connect_instance = connect()
cursor = connect_instance.cursor()
'''
判断当前页是否一致,不一致的话执行上一页/下一页的切换操作
@ -161,7 +162,7 @@ def get_fund_list(cookie_str=None):
# print('fund_df', fund_df)
fund_list = fund_df.values.tolist()
cursor.executemany(sql_insert, fund_list)
connect.commit()
connect_instance.commit()
# print('fund_list', fund_list)
with open(result_dir + output_file_name, 'a') as csv_file:
for fund_item in fund_list:

@ -156,14 +156,14 @@ class FundSpider:
def get_fund_base_info(self):
# 基金分类
self.fund_cat = self._chrome_driver.find_element_by_id(
'qt_base').find_element_by_class_name("category").text
self.fund_cat = self.get_element_text_by_class_name(
"category", 'qt_base')
# 成立时间
self.found_date = self._chrome_driver.find_element_by_id(
'qt_base').find_element_by_class_name("inception").text
self.found_date = self.get_element_text_by_class_name(
"inception", 'qt_base')
# 基金公司
self.company = self._chrome_driver.find_element_by_id(
'qt_management').find_element_by_xpath("//ul[@id='qt_management']/li[4]/span[@class='col2 comp']/a").text
self.company = self.get_element_text_by_xpath(
"//ul[@id='qt_management']/li[4]/span[@class='col2 comp']/a", 'qt_management')
# 获取基金经理信息
def get_fund_manager_info(self):

@ -13,7 +13,8 @@ import pymysql
from pprint import pprint
from db.connect import connect
cursor = connect.cursor()
cursor = connect().cursor()
if __name__ == '__main__':
page_start = 0
page_limit = 10000

Loading…
Cancel
Save