Skip to content

Commit 6146efc

Browse files
committed
CME_Zip.py - Data from zip files
Get the data from the zip files and reorder it by station name. This is the data we bought before we started getting it manually.
1 parent 5d37c12 commit 6146efc

1 file changed

Lines changed: 158 additions & 0 deletions

File tree

CME_Zip.py

Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
import os
2+
import re
3+
import zipfile
4+
import pdb
5+
import sys
6+
from datetime import datetime
7+
8+
# Name of the zip archive to read. Its members are expected to look like
# cme_futures_hist/<date>/<station>/data.csv (plus folders and metadata.json
# entries, which are ignored later on).
input_zip_file = 'cme_futures_hist.zip'

# Running count of data rows read per station, filled in while the archive is
# processed. It ends up looking like {'D0': 10, 'H1': 20}, meaning 10 data rows
# were read for station 'D0' and 20 for station 'H1'.
station_hash = {}

# Folder that receives one sub-folder per station, each holding a data.csv.
output_folder = 'cme_futures_hist_by_station'

# Refuse to run when the output folder already exists: the per-station files
# are built up incrementally, so leftovers from a previous run would end up
# mixed into the new output. The folder is NOT removed automatically.
if os.path.exists(output_folder):
    print(output_folder + ' exists! Please delete the folder and try again.')
    # non-zero status so a calling shell or scheduler sees this as a failure
    sys.exit(1)

os.mkdir(output_folder)
29+
30+
def _reformat_rows(csv_text, dt):
    """Turn the text of one data.csv into rows for the per-station file.

    The first line (the column header) is dropped and blank lines are skipped.
    For every remaining line the first field is rewritten from YYYYMM to
    YYYY-MM, the second field is kept unchanged and ``dt`` is appended as a
    third field. Any further fields on the line are ignored.

    Returns a list of newline-terminated strings.
    Raises ValueError when a first field is not in YYYYMM form, and IndexError
    when a non-blank line contains no comma.
    """
    rows = []
    # splitlines() handles "\n" as well as "\r\n", so a stray "\r" can never
    # end up inside the value column or be mistaken for a data line
    for line in csv_text.splitlines()[1:]:
        if not line.strip():
            continue
        parts = line.split(',')
        forecasted_dt = datetime.strptime(parts[0], '%Y%m').strftime('%Y-%m')
        rows.append(forecasted_dt + ',' + parts[1] + ',' + dt + "\n")
    return rows


# open the input zip file
with zipfile.ZipFile(input_zip_file, 'r') as input_zip_ref:

    # namelist() returns every member of the archive: folders, metadata.json
    # files and the data.csv files we are interested in, for example
    # cme_futures_hist/2022-07-13/K5K/data.csv
    for file_name in input_zip_ref.namelist():

        # zip member names always use '/' as the separator
        subfolders_list = file_name.split('/')

        # only handle <root>/<date>/<station>/data.csv; comparing the last
        # path component exactly (instead of a loose regex search) ignores
        # folders, metadata.json and any name that merely contains "data.csv"
        if len(subfolders_list) < 4 or subfolders_list[-1] != 'data.csv':
            continue

        # in cme_futures_hist/2022-07-13/K5K/data.csv index 1 is the date
        # folder and index 2 is the station name (K5K)
        dt = subfolders_list[1]
        station_name = subfolders_list[2]

        # members are read as bytes; the data is expected to be plain ASCII
        with input_zip_ref.open(file_name) as input_csv_file_ref:
            csv_text = input_csv_file_ref.read().decode('ascii')

        input_lines = _reformat_rows(csv_text, dt)

        # keep track of how many data rows (header excluded) were read for
        # this station so far
        station_hash[station_name] = station_hash.get(station_name, 0) + len(input_lines)

        # every station gets a single file <output_folder>/<station>/data.csv
        station_folder = os.path.join(output_folder, station_name)
        output_csv_file = os.path.join(station_folder, 'data.csv')

        if not os.path.exists(output_csv_file):
            # first time this station is seen: create its folder and start
            # the file with the column header
            os.mkdir(station_folder)
            input_lines.insert(0, "forecasted_dt,SETT,dt\n")

        # append, because the same station shows up once per date folder
        with open(output_csv_file, 'a', encoding='ascii') as output_csv_file_ref:
            output_csv_file_ref.writelines(input_lines)
125+
126+
127+
# output zip file name
output_zip_file = 'cme_futures_hist_by_station.zip'

with zipfile.ZipFile(output_zip_file, 'w', zipfile.ZIP_DEFLATED) as output_zip_file_ref:
    # visit every file below the output folder, e.g.
    # root = 'cme_futures_hist_by_station/D0', name = 'data.csv'
    for root, _dirs, files in os.walk(output_folder):
        for name in files:
            data_csv_file = os.path.join(root, name)

            with open(data_csv_file, 'r', encoding='ascii') as data_csv_file_ref:
                lines = data_csv_file_ref.read().splitlines()

            # The first line is the column header written when the file was
            # created. Keep it out of the sort so that it stays on top by
            # construction rather than by how 'dt' happens to compare with
            # the date strings.
            header = lines[:1]
            rows = [line.split(',') for line in lines[1:] if line]

            # newest first by the third column (dt); values are compared as
            # strings, so this assumes dt sorts correctly as text (e.g.
            # 2022-07-13). The sort is stable: rows sharing the same dt keep
            # their original relative order.
            rows.sort(key=lambda parts: parts[2], reverse=True)

            # rewrite the file in place with the sorted rows
            with open(data_csv_file, 'w', encoding='ascii') as data_csv_file_ref:
                data_csv_file_ref.writelines(line + "\n" for line in header)
                data_csv_file_ref.writelines(','.join(parts) + "\n" for parts in rows)

            # store the file in the zip under its path relative to the
            # output folder (D0/data.csv)
            output_zip_file_ref.write(data_csv_file, os.path.relpath(data_csv_file, output_folder))

0 commit comments

Comments
 (0)