2022年 11月 8日

【Python】从新浪抓取美国天然气NG历史交易数据并进行可视化

  1. import urllib.request
  2. import re
  3. import datetime
  4. from bs4 import BeautifulSoup
  5. import matplotlib.pyplot as plt
  6. from matplotlib.ticker import MultipleLocator, FormatStrFormatter
  7. import time
  8. import matplotlib.ticker as ticker
  9. import re
  10. import json
  11. from urllib import request
  12. from urllib import parse
  13. # 设置全局变量,用于保存提取的数据
  14. HISTORY_ALL = []#存放所有历史数据
  15. div_list = []#存放网页中的div表
  16. infos = ['日期','收盘价', '开盘价','最高价', '最低价','成交量']
  17. PER_PAGE_DAYS = 25 #单个网页爬取的天数
  18. cyc_val = 17.502 #设置成本价
  19. def getHtml(url):
  20. """
  21. 打开网页
  22. :param url:
  23. :return:
  24. """
  25. while True:
  26. try:
  27. html = urllib.request.urlopen(url, timeout=5).read()
  28. break
  29. except:
  30. print("超时重试")
  31. html = html.decode('gbk')
  32. return html
  33. def time_long(time1, time2, type="day"):
  34. """
  35. 计算日期之间的时间差
  36. :param time1: 较小的时间(datetime类型)
  37. :param time2: 较大的时间(datetime类型)
  38. :param type: 返回结果的时间类型(暂时就是返回相差天数)
  39. :return: 相差的天数
  40. """
  41. day1 = time.strptime(str(time1), '%Y-%m-%d')
  42. day2 = time.strptime(str(time2), '%Y-%m-%d')
  43. if type == 'day':
  44. day_num = (int(time.mktime(day2)) - int(time.mktime(day1))) / (
  45. 24 * 60 * 60)
  46. return abs(int(day_num))
  47. def get_end_point(start_time,days):
  48. (year, month, day) = start_time.split("-")
  49. day1 = datetime.date(int(year), int(month), int(day))
  50. ended_point = day1 + datetime.timedelta(days)
  51. return ended_point
  52. def get_current_USDCNY():
  53. """
  54. 获取美元兑人民币实时汇率
  55. :return:
  56. """
  57. # 爬取网页
  58. url = "http://srh.bankofchina.com/search/whpj/search.jsp"
  59. Form_Data = {}
  60. Form_Data['erectDate'] = ''
  61. Form_Data['nothing'] = ''
  62. Form_Data['pjname'] = '1316'
  63. data = parse.urlencode(Form_Data).encode('utf-8')
  64. html = request.urlopen(url, data).read()
  65. soup = BeautifulSoup(html, 'html.parser')
  66. # 解析数据
  67. div = soup.find('div', attrs={'class': 'BOC_main publish'})
  68. table = div.find('table')
  69. tr = table.find_all('tr')
  70. td = tr[1].find_all('td')
  71. print(td[0].get_text(), td[1].get_text(), td[2].get_text(),td[3].get_text(), td[4].get_text(), td[5].get_text(), td[6].get_text())
  72. return float(td[3].get_text())/100,td[6].get_text()
  73. USDCNY ,current_time= get_current_USDCNY()
  74. print("实时汇率 "+current_time+" USDCNY:"+str(USDCNY))
  75. #输入起止时间
  76. while 1:
  77. print("----------------Please input the start time and the end time----------------")
  78. start_time = input()
  79. end_time = input()
  80. print("----------------The start time ----------------"+start_time)
  81. print("----------------The end time ----------------"+end_time)
  82. if start_time>=end_time:
  83. print("----------------Input date invalided.Please try again ----------------")
  84. else:
  85. break
  86. #DEBUG
  87. # start_time = "2018-08-09"
  88. # end_time = "2018-10-06"
  89. daysLen=time_long(end_time,start_time)
  90. print(daysLen)
  91. if daysLen < 20:
  92. ylabel_dis = 1
  93. else:
  94. ylabel_dis = int(daysLen/20) #用于控制绘图时横轴的标注
  95. per_loop_len = 0 #用于保存每次爬取的长度,为实际爬取的长度
  96. if(daysLen <= PER_PAGE_DAYS):
  97. per_loop_len = daysLen
  98. loop = 1
  99. left = 0
  100. else:
  101. per_loop_len = PER_PAGE_DAYS
  102. loop = int(daysLen/PER_PAGE_DAYS)
  103. left = daysLen%PER_PAGE_DAYS
  104. print("爬取循环次数"+str(loop))
  105. print("爬取循环次数"+str(left))
  106. ended_point = start_time
  107. cnt = 0
  108. end_flag = 0
  109. total_ok = 0
  110. xlabel = []
  111. date = []
  112. maxVal = []
  113. minVal = []
  114. PERP_PAGE_INFO = []
  115. while 1:
  116. #===========================日期处理部分,按每页25天的大小爬取网页
  117. print("=============================PAGE"+str(cnt)+"=============================")
  118. start_point = ended_point
  119. ended_point = get_end_point(str(start_point), per_loop_len);
  120. # print(str(cnt) + ":" + str(left) + ":" + str(per_loop_len))
  121. print("start_point:" + str(start_point))
  122. print("ended_point:" + str(ended_point))
  123. #===========================生成URL
  124. Url = 'https://vip.stock.finance.sina.com.cn/q/view/vFutures_History.php?jys=NYME&pz=NG&hy=&breed=NG&type=global&start=' + str(start_point) + '&end=' + str(ended_point)
  125. html = getHtml(Url)
  126. soup = BeautifulSoup(html, 'lxml')
  127. #============================得到本页表格
  128. table = soup.find('div', attrs={'class': 'historyList'})
  129. trs = table.find_all('tr')
  130. for tr in trs:
  131. tds = tr.find_all('td')
  132. div_list.clear()
  133. for td in tds:
  134. divs = td.find_all('div', attrs={'align': 'center'})
  135. for div in divs:
  136. val = div.get_text()
  137. div_list.append(val)
  138. data = {}
  139. for i in range(0, len(div_list)):
  140. data.update({infos[i]: div_list[i]})
  141. # print(div_list[i])
  142. PERP_PAGE_INFO.append(data)
  143. for i in range(1, len(PERP_PAGE_INFO)-2):
  144. total_ok = total_ok+1
  145. xlabel.append(total_ok)
  146. date.append(PERP_PAGE_INFO[len(PERP_PAGE_INFO)-1-i].get("日期"))
  147. USD_maxVal = float(PERP_PAGE_INFO[len(PERP_PAGE_INFO)-1-i].get("最高价"))
  148. maxVal.append(USD_maxVal*USDCNY)
  149. USD_minVal = float(PERP_PAGE_INFO[len(PERP_PAGE_INFO)-1-i].get("最低价"))
  150. if USD_minVal == 0.020:#仅仅针对2016-02-24的异常最小值进行修正
  151. USD_minVal = 1.7+0.020
  152. minVal.append(USD_minVal*USDCNY)
  153. HISTORY_ALL.append(PERP_PAGE_INFO[len(PERP_PAGE_INFO)-1-i])
  154. print("the length of per page : --->"+str(len(xlabel)))
  155. # print(xlabel)
  156. # print(date)
  157. # print(maxVal)
  158. # print(minVal)
  159. if end_flag == 1:
  160. print("-----------------END-------------------")
  161. break
  162. cnt = cnt+1
  163. if cnt == loop:
  164. if left == 0:
  165. print("-----------------END-------------------")
  166. break
  167. else :
  168. per_loop_len = left
  169. left = 0
  170. end_flag = 1
  171. PERP_PAGE_INFO.clear()
  172. print(HISTORY_ALL)
  173. # 成本均价,绘制成本线
  174. length = len(xlabel)
  175. CYC = []
  176. for i in range(0,length):
  177. CYC.append(cyc_val)
  178. # ---------------------------------------画图---------------------------------------
  179. fig,ax = plt.subplots()
  180. plt.rcParams['figure.dpi'] = 300 #分辨率
  181. # plot data
  182. ax.plot(date, maxVal, 'b', label='maxVal', linewidth=0.4)
  183. ax.plot(date, minVal, 'r', label='minVal', linewidth=0.4)
  184. ax.plot(date, CYC, 'c', label='CYC', linewidth=0.3)
  185. #解决横轴过密的问题1
  186. for label in ax.get_xticklabels():
  187. label.set_visible(False)
  188. for label in ax.get_xticklabels()[::ylabel_dis]:
  189. label.set_visible(True)
  190. #设置x轴标签文字的大小(size),倾斜角度(rotation),字体大小(fontsize)
  191. plt.xticks(size='small',rotation=90,fontsize=5)
  192. plt.title('NG Historical Data') # 添加图表标题
  193. plt.ylabel('Price/RMB') # 添加 y 轴标题
  194. plt.xlabel('Date') # 添加 x 轴标题
  195. plt.legend ()#设置图例
  196. #以分辨率 300 来保存图片
  197. plt.savefig('NG-'+start_time+"-"+end_time+'.png', dpi=300) #指定分辨率保存
  198. # show the figure
  199. plt.show()

使用方式:运行程序。运行环境:PyCharm、Python 3.6.5、Anaconda3

输入起始时间:如 2016-08-01

输入结束时间:如2019-09-06

格式要正确,等待爬取数据。

运行过程如下图