1+ import os
2+ import re
3+ import zipfile
4+ import pdb
5+ import sys
6+ from datetime import datetime
7+
# Name of the input zip archive, resolved relative to the current working
# directory.
input_zip_file = 'cme_futures_hist.zip'

# Running count of data rows read per station, keyed by station name.
# For example {'D0': 10, 'H1': 20} means that 10 data rows were read for the
# station 'D0' and 20 data rows for the station 'H1'.
station_hash = {}

# Folder that receives one sub-folder (holding a single data.csv) per station.
output_folder = 'cme_futures_hist_by_station'

# Refuse to run when the output folder already exists. The folder is NOT
# removed automatically: the per-station files are opened in append mode, so
# leftovers from an earlier run would be mixed into the new output.
if os.path.exists(output_folder):
    print(output_folder + ' exists! Please delete the folder and try again.')
    # Non-zero status so a calling shell/script can tell that nothing was done.
    sys.exit(1)

os.mkdir(output_folder)
# Split every <root>/<date>/<station>/data.csv found in the input archive into
# one combined <output_folder>/<station>/data.csv per station.
with zipfile.ZipFile(input_zip_file, 'r') as input_zip_ref:

    # namelist() returns folder entries, data.csv files and metadata.json
    # files alike, for example:
    #   cme_futures_hist/2022-07-13/K5K/
    #   cme_futures_hist/2022-07-13/K5K/data.csv
    #   cme_futures_hist/2022-07-13/metadata.json
    for file_name in input_zip_ref.namelist():

        # Index 1 of the split path is the date folder, index 2 the station.
        subfolders_list = file_name.split('/')

        # Keep only entries literally named data.csv that sit deep enough to
        # have both a date and a station component. An unanchored pattern
        # such as r'data.csv' would also accept names like 'xdata_csv.json'.
        if len(subfolders_list) < 4 or subfolders_list[-1] != 'data.csv':
            continue

        dt = subfolders_list[1]
        station_name = subfolders_list[2]

        # The archive member is read as bytes and decoded as ASCII text.
        with input_zip_ref.open(file_name) as input_csv_file_ref:
            ascii_data = input_csv_file_ref.read().decode('ascii')

        # splitlines() copes with both "\n" and "\r\n" line endings, so no
        # stray "\r" ends up in the value column. The first line is the
        # column header (dt,value) and is dropped.
        input_lines = []
        for line in ascii_data.splitlines()[1:]:
            # Ignore blank lines (typically the one after the last record).
            if not line.strip():
                continue
            parts = line.split(',')
            # Re-format the first column from YYYYMM to YYYY-MM.
            forecasted_dt = datetime.strptime(parts[0], '%Y%m').strftime('%Y-%m')
            # Output row: forecasted month, value, date folder of the source.
            input_lines.append(forecasted_dt + ',' + parts[1] + ',' + dt + "\n")

        # Add the number of data rows (header excluded) to the station total.
        count = len(input_lines)
        station_hash[station_name] = station_hash.get(station_name, 0) + count

        station_folder = os.path.join(output_folder, station_name)
        output_csv_file = os.path.join(station_folder, 'data.csv')

        if not os.path.exists(output_csv_file):
            # First file seen for this station: create its sub-folder and
            # start the combined file with the column header.
            os.mkdir(station_folder)
            input_lines.insert(0, "forecasted_dt,SETT,dt\n")

        # Append this file's rows to the station's combined data.csv.
        with open(output_csv_file, 'a', encoding='ascii') as output_csv_file_ref:
            output_csv_file_ref.writelines(input_lines)
# Name of the output zip archive, created in the current working directory.
output_zip_file = 'cme_futures_hist_by_station.zip'

# Sort every per-station data.csv in place, then add it to the output archive.
with zipfile.ZipFile(output_zip_file, 'w', zipfile.ZIP_DEFLATED) as output_zip_file_ref:
    for root, _dirs, files in os.walk(output_folder):
        for csv_name in files:
            # e.g. root = 'cme_futures_hist_by_station/D0', csv_name = 'data.csv'
            data_csv_file = os.path.join(root, csv_name)

            with open(data_csv_file, 'r', encoding='ascii') as data_csv_file_ref:
                lines = data_csv_file_ref.read().splitlines()

            # Keep the header (first line) out of the sort instead of relying
            # on the text 'dt' happening to compare greater than every date.
            header = lines[:1]
            # Blank lines carry no third column and are dropped.
            rows = [line for line in lines[1:] if line]

            # Newest source date (third column) first. The sort is stable, so
            # rows sharing a date keep their original relative order.
            rows.sort(key=lambda row: row.split(',')[2], reverse=True)

            with open(data_csv_file, 'w', encoding='ascii') as data_csv_file_ref:
                data_csv_file_ref.writelines(row + "\n" for row in header + rows)

            # Store the file under its path relative to the output folder,
            # e.g. 'D0/data.csv'.
            output_zip_file_ref.write(data_csv_file, os.path.relpath(data_csv_file, output_folder))
0 commit comments