import datetime
import http.client
import json
import re
import time
import urllib.request
from urllib import parse
from urllib import request

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from bs4 import BeautifulSoup
from matplotlib.ticker import MultipleLocator, FormatStrFormatter
-
# Module-level state shared by the scraping loop further down the script.
HISTORY_ALL = []  # every parsed history row, accumulated across all pages
div_list = []  # scratch list holding the cell texts of the row being parsed
# Column names, in the order the cell texts of a row are mapped onto them.
infos = ['日期', '收盘价', '开盘价', '最高价', '最低价', '成交量']
PER_PAGE_DAYS = 25  # number of days requested per page fetch

cyc_val = 17.502  # cost price, drawn as a flat reference line on the chart
-
def getHtml(url, timeout=5, encoding='gbk', max_retries=None):
    """Download a page and return its body decoded as text.

    A failed request prints a retry notice and is attempted again.

    :param url: address to fetch
    :param timeout: per-attempt timeout in seconds passed to ``urlopen``
    :param encoding: codec used to decode the response body
    :param max_retries: number of retries allowed after a failed attempt;
        ``None`` (the default) keeps retrying until a request succeeds
    :return: the decoded page body
    :raises OSError: the last network error once ``max_retries`` is
        exhausted (``http.client.HTTPException`` for protocol-level failures)
    """
    failures = 0
    while True:
        try:
            # The context manager closes the connection even if read() fails.
            with urllib.request.urlopen(url, timeout=timeout) as response:
                raw = response.read()
            break
        # Only network/protocol failures are retried, so Ctrl-C and
        # programming errors (e.g. a malformed URL) are no longer swallowed.
        except (OSError, http.client.HTTPException):
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
            print("超时重试")
    return raw.decode(encoding)
def time_long(time1, time2, type="day"):
    """Return the absolute number of calendar days between two dates.

    The difference is computed with calendar-date arithmetic, so it is not
    affected by the local time zone or daylight-saving transitions.

    :param time1: one date, as a ``YYYY-MM-DD`` string or any object whose
        ``str()`` has that form (e.g. ``datetime.date``)
    :param time2: the other date, same accepted forms; order does not matter
    :param type: unit of the result; only ``'day'`` is supported
    :return: the non-negative number of days between the two dates
    :raises ValueError: if a date does not match ``%Y-%m-%d`` or ``type`` is
        not ``'day'``
    """
    first = datetime.datetime.strptime(str(time1), '%Y-%m-%d').date()
    second = datetime.datetime.strptime(str(time2), '%Y-%m-%d').date()

    if type != 'day':
        # Fail loudly instead of silently returning None to the caller.
        raise ValueError("unsupported result type: %r" % (type,))

    return abs((second - first).days)
def get_end_point(start_time, days):
    """Return the date that lies ``days`` days after ``start_time``.

    :param start_time: start date as a ``YYYY-MM-DD`` string, or any object
        whose ``str()`` has that form (e.g. ``datetime.date``)
    :param days: number of days to add; passed to ``datetime.timedelta``
    :return: the resulting ``datetime.date``
    :raises ValueError: if ``start_time`` is not three ``-``-separated
        integers that form a valid date
    """
    year, month, day = (int(part) for part in str(start_time).split("-"))
    return datetime.date(year, month, day) + datetime.timedelta(days)
def get_current_USDCNY(timeout=10):
    """Fetch the latest USD/CNY quote from the Bank of China search page.

    Posts the search form, reads the first data row of the result table,
    prints its first seven cells and returns the quote from the fourth cell.

    :param timeout: seconds to wait for the HTTP response
    :return: tuple ``(rate, published)``: the fourth cell as a float divided
        by 100, and the text of the seventh cell
    :raises RuntimeError: if the page lacks the expected table or data row
    """
    # Request the search page.
    url = "http://srh.bankofchina.com/search/whpj/search.jsp"
    # NOTE(review): '1316' is presumably the site's code for USD - confirm.
    form_data = {'erectDate': '', 'nothing': '', 'pjname': '1316'}
    payload = parse.urlencode(form_data).encode('utf-8')
    with request.urlopen(url, payload, timeout=timeout) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')

    # Locate the result table; fail with a clear message if the layout changed.
    container = soup.find('div', attrs={'class': 'BOC_main publish'})
    table = container.find('table') if container is not None else None
    if table is None:
        raise RuntimeError("exchange-rate table not found in the response")

    rows = table.find_all('tr')
    # Row 0 is skipped, presumably because it is the header row.
    cells = rows[1].find_all('td') if len(rows) > 1 else []
    if len(cells) < 7:
        raise RuntimeError("exchange-rate row is missing or incomplete")

    texts = [cell.get_text() for cell in cells[:7]]
    print(*texts)
    # NOTE(review): the quote is presumably given per 100 USD, hence the
    # division, and the seventh cell presumably the publish time - confirm.
    return float(texts[3]) / 100, texts[6]
-
# Fetch the live rate once; the scraped USD prices are converted with it.
USDCNY, current_time = get_current_USDCNY()
print("实时汇率 " + current_time + " USDCNY:" + str(USDCNY))
-
# Read the start and end dates, re-prompting until both parse as YYYY-MM-DD
# and the start date is strictly earlier than the end date.
while True:
    print("----------------Please input the start time and the end time----------------")
    start_time = input().strip()
    end_time = input().strip()
    print("----------------The start time ----------------" + start_time)
    print("----------------The end time ----------------" + end_time)

    try:
        start_date = datetime.datetime.strptime(start_time, '%Y-%m-%d').date()
        end_date = datetime.datetime.strptime(end_time, '%Y-%m-%d').date()
    except ValueError:
        # Malformed input is treated like an invalid range: ask again.
        start_date = end_date = None

    # Compare real dates, not raw strings: lexicographic order is wrong for
    # input that is not zero-padded (e.g. "2018-8-9" vs "2018-10-06").
    if start_date is None or start_date >= end_date:
        print("----------------Input date invalided.Please try again ----------------")
    else:
        # Keep the canonical zero-padded form for the URL and file name.
        start_time = start_date.isoformat()
        end_time = end_date.isoformat()
        break
-
#DEBUG
# start_time = "2018-08-09"
# end_time = "2018-10-06"

# Total length of the requested range in days.
daysLen = time_long(start_time, end_time)
print(daysLen)

# Step between visible x-axis tick labels when plotting (at least 1), so that
# roughly 20 labels are shown regardless of the range length.
ylabel_dis = max(1, daysLen // 20)

# Split the range into pages: `loop` pages of `per_loop_len` days each, plus
# one final shorter page of `left` days when the range does not divide evenly.
if daysLen <= PER_PAGE_DAYS:
    per_loop_len = daysLen
    loop = 1
    left = 0
else:
    per_loop_len = PER_PAGE_DAYS
    loop, left = divmod(daysLen, PER_PAGE_DAYS)

print("爬取循环次数" + str(loop))
# This line reports the remainder, so it gets its own label.
print("剩余天数" + str(left))
-
# Paging state: each page starts where the previous one ended.
ended_point = start_time
cnt = 0  # number of full-size pages fetched so far
end_flag = 0  # set to 1 once the final, shorter page has been scheduled
total_ok = 0  # running count of rows accepted for plotting

# Series handed to the plotting code.
xlabel = []
date = []
maxVal = []
minVal = []
PERP_PAGE_INFO = []  # rows of the page currently being processed

while True:
    # ---- Date window for this page.
    print("=============================PAGE" + str(cnt) + "=============================")
    start_point = ended_point
    ended_point = get_end_point(str(start_point), per_loop_len)
    print("start_point:" + str(start_point))
    print("ended_point:" + str(ended_point))
    # NOTE(review): consecutive pages share their boundary date; whether the
    # site returns that day on both pages depends on how it treats start/end.

    # ---- Build the URL and fetch the page.
    Url = 'https://vip.stock.finance.sina.com.cn/q/view/vFutures_History.php?jys=NYME&pz=NG&hy=&breed=NG&type=global&start=' + str(start_point) + '&end=' + str(ended_point)
    html = getHtml(Url)
    soup = BeautifulSoup(html, 'lxml')

    # ---- Extract the history table of this page.
    table = soup.find('div', attrs={'class': 'historyList'})
    if table is None:
        raise RuntimeError("history table not found for " + str(start_point) + " .. " + str(ended_point))
    for tr in table.find_all('tr'):
        div_list.clear()
        for td in tr.find_all('td'):
            for div in td.find_all('div', attrs={'align': 'center'}):
                div_list.append(div.get_text())
        # Map cell texts onto the column names; rows without matching cells
        # become empty dicts, and surplus cells are ignored.
        PERP_PAGE_INFO.append(dict(zip(infos, div_list)))

    # ---- Consume the rows from second-to-last up to the third one, i.e. in
    # reverse page order while skipping the first two rows and the last row.
    # NOTE(review): presumably the skipped rows are headers/boundary rows and
    # the page lists newest first - verify against the live page.
    for idx in range(len(PERP_PAGE_INFO) - 2, 1, -1):
        row = PERP_PAGE_INFO[idx]
        total_ok += 1
        xlabel.append(total_ok)
        date.append(row.get("日期"))
        USD_maxVal = float(row.get("最高价"))
        maxVal.append(USD_maxVal * USDCNY)
        USD_minVal = float(row.get("最低价"))
        if USD_minVal == 0.020:
            # One-off correction for the abnormal low recorded on 2016-02-24.
            USD_minVal = 1.7 + 0.020
        minVal.append(USD_minVal * USDCNY)
        HISTORY_ALL.append(row)
    # Reports the cumulative number of rows collected so far.
    print("the length of per page : --->" + str(len(xlabel)))

    # ---- Decide whether another page is needed.
    if end_flag == 1:
        print("-----------------END-------------------")
        break
    cnt += 1
    if cnt == loop:
        if left == 0:
            print("-----------------END-------------------")
            break
        # One more, shorter page covers the remaining days.
        per_loop_len = left
        left = 0
        end_flag = 1
    PERP_PAGE_INFO.clear()
print(HISTORY_ALL)
-
# Flat cost-price reference line: one point per plotted day.
length = len(xlabel)
CYC = [cyc_val] * length

# --------------------------------------- Plot ---------------------------------------
# The dpi setting is read when the figure is created, so it has to be set
# before plt.subplots() to take effect.
plt.rcParams['figure.dpi'] = 300
fig, ax = plt.subplots()

# Plot the daily high, daily low and the cost line against the dates.
ax.plot(date, maxVal, 'b', label='maxVal', linewidth=0.4)
ax.plot(date, minVal, 'r', label='minVal', linewidth=0.4)
ax.plot(date, CYC, 'c', label='CYC', linewidth=0.3)

# Avoid an overcrowded x axis: show only every ylabel_dis-th tick label.
for index, label in enumerate(ax.get_xticklabels()):
    label.set_visible(index % ylabel_dis == 0)

# Rotate and shrink the x tick labels (fontsize alone determines the size).
plt.xticks(rotation=90, fontsize=5)

plt.title('NG Historical Data')  # chart title
plt.ylabel('Price/RMB')  # y-axis title
plt.xlabel('Date')  # x-axis title
plt.legend()  # show the legend
# Save the chart at 300 dpi, named after the requested date range.
plt.savefig('NG-' + start_time + "-" + end_time + '.png', dpi=300)
# Show the figure.
plt.show()
使用方式:运行程序。运行环境:PyCharm、Python 3.6.5、Anaconda3
输入起始时间:如 2016-08-01
输入结束时间:如 2019-09-06
日期格式必须为 YYYY-MM-DD,且起始时间必须早于结束时间;输入后等待程序爬取数据。
运行过程如下图