CME Updated Files

MaanaAragula · MaanaAragula · commit dfdd314994f3 · 2023-08-18T14:53:22.000-05:00
CME Rollup, Merge, Zip, Json
diff --git a/CME_Json.py b/CME_Json.py
@@ -0,0 +1,145 @@
+"""Generate stations.json and metadata.json for the CSV files in CME_DDIF.
+
+Inputs, all relative to the current working directory:
+  * CME_Futures.json       - description of the variables "0", "1" and "2"
+  * cme_weather_specs.xlsx - table with 'Commodity Code' and 'Contract Name'
+  * CME_DDIF/<code>.csv    - one file per commodity code with a 'dt' column
+
+Outputs:
+  * CME_DDIF/stations.json - one feature per CSV file with a known code
+  * CME_DDIF/metadata.json - dataset metadata and the overall date range
+"""
+import pandas as pd
+import json
+import os
+
+DATA_FOLDER = 'CME_DDIF'
+
+# Fields copied from CME_Futures.json for every variable, in output order
+VARIABLE_FIELDS = (
+    "column name",
+    "plain text description",
+    "unit of measurement",
+    "precision",
+    "na value",
+)
+
+# Load cme_futures.json file
+with open('CME_Futures.json') as f:
+    variables_data = json.load(f)
+
+
+def build_data_dictionary():
+    """Return the description of the variables "0", "1" and "2".
+
+    The same structure is written for every station and for the dataset
+    metadata. Unlike the keys of CME_Futures.json, the output keys end with
+    a colon ("0:", "1:", "2:").
+    NOTE(review): confirm that the trailing colon is intended.
+    """
+    return {
+        f"{index}:": {
+            field: variables_data[index][field] for field in VARIABLE_FIELDS
+        }
+        for index in ("0", "1", "2")
+    }
+
+
+# Read cme_weather_specs.xlsx file
+specs_data = pd.read_excel('cme_weather_specs.xlsx')
+# Contracts appended by hand to the ones read from the spreadsheet
+missing_stations_data = [
+    {'Commodity Code': 'KRK', 'Contract Name': 'CME Seasonal Strip Degree Days Index Futures - Houston CDD May'},
+    {'Commodity Code': 'K6', 'Contract Name': 'CME Degree Days Index Futures - Philadelphia CDD'},
+    {'Commodity Code': 'KW', 'Contract Name': 'CME Degree Days Index Futures - Boston CDD'}
+]
+missing_data_df = pd.DataFrame(missing_stations_data)
+specs_data = pd.concat([specs_data, missing_data_df], ignore_index=True)
+
+# Generate stations.json and find the earliest and most recent dates
+stations = {
+    "type": "FeatureCollection",
+    "features": []
+}
+earliest_date = None
+most_recent_date = None
+
+# Get a list of CSV files directly under the CME_DDIF folder. os.listdir()
+# returns names in arbitrary order, so sort them for a reproducible output.
+csv_files = sorted(
+    file for file in os.listdir(DATA_FOLDER) if file.endswith('.csv')
+)
+
+for csv_file in csv_files:
+    csv_file_path = os.path.join(DATA_FOLDER, csv_file)
+    station_name = os.path.splitext(csv_file)[0]
+    code = station_name  # Assuming station name corresponds to the code
+
+    # Skip files whose code does not exist in the specs_data DataFrame
+    matching_names = specs_data.loc[
+        specs_data['Commodity Code'] == code, 'Contract Name'
+    ].values
+    if len(matching_names) == 0:
+        continue
+
+    # The description is the contract name of the first matching row
+    description = matching_names[0]
+
+    merged_data = pd.read_csv(csv_file_path)
+    first_date = merged_data['dt'].min()
+    last_date = merged_data['dt'].max()
+
+    # Widen the overall date range. A file without any date gives NaN for
+    # min()/max(); it is ignored here, because NaN would otherwise stick as
+    # the overall bound (every later comparison against NaN is False).
+    if not pd.isna(first_date):
+        if earliest_date is None or first_date < earliest_date:
+            earliest_date = first_date
+    if not pd.isna(last_date):
+        if most_recent_date is None or last_date > most_recent_date:
+            most_recent_date = last_date
+
+    stations["features"].append({
+        "type": "Feature",
+        "properties": {
+            "file name": csv_file,
+            "station name": station_name,
+            "description": [description],
+            "variables": build_data_dictionary(),
+            "date range": [str(first_date), str(last_date)]
+        }
+    })
+
+# Save stations.json
+stations_file_path = os.path.join(DATA_FOLDER, 'stations.json')
+with open(stations_file_path, 'w') as outfile:
+    json.dump(stations, outfile, indent=4)
+
+# Generate metadata.json
+metadata = {
+    "compression": None,
+    "name": "cme_ddif",
+    "documentation": "https://www.cmegroup.com/content/dam/cmegroup/rulebook/CME/IV/400/403/403.pdf",
+    "description": "HDD, CDD, and CAT futures settlement data from the Chicago Mercantile Exchange (CME) by city",
+    "publisher": "Chicago Mercantile Exchange",
+    "source data url": "ftp.cmegroup.com",
+    "tags": [
+        "temperature",
+        "Europe",
+        "U.S",
+        "CME"
+    ],
+    "date range": [
+        str(earliest_date),
+        str(most_recent_date)
+    ],
+    "station metadata": "stations.json",
+    "previous hash": None,
+    "time generated": "",
+    "data dictionary": build_data_dictionary()
+}
+
+# Save metadata.json
+metadata_file_path = os.path.join(DATA_FOLDER, 'metadata.json')
+with open(metadata_file_path, 'w') as outfile:
+    json.dump(metadata, outfile, indent=4)
diff --git a/CME_Merged.py b/CME_Merged.py
@@ -11,7 +11,7 @@
 input_zip_file = '/Users/maana/Documents/GitHub/dWeather-Python-Client/cme_futures_hist_by_station.zip'
 
 # Output folder for merged data
-output_folder = 'Merge'
+output_folder = 'CME_DDIF'
 
 # Create a dictionary to store the station data
 station_data = {}
@@ -22,58 +22,67 @@
         if file.endswith('.csv'):
             station_name = os.path.splitext(file)[0]
             file_path = os.path.join(root, file)
-            
+
             # Read the station CSV file and store the data in the dictionary
             station_data[station_name] = pd.read_csv(file_path)
 
 # Process the zip files
 with zipfile.ZipFile(input_zip_file, 'r') as input_zip_ref:
     file_list = input_zip_ref.namelist()
-    with open('station_data.txt','w',encoding='ascii') as station_data_ref:
+    with open('station_data.txt', 'w', encoding='ascii') as station_data_ref:
         keys = station_data.keys()
         for key in keys:
-            station_data_ref.write('!'+key+'!'+"\n")
-    
+            station_data_ref.write('!' + key + '!' + "\n")
+
     for file_name in file_list:
         # Check if the file is a CSV file
         if not file_name.endswith('.csv'):
             continue
-        
+
         # Extract the station name from the file name
         station_name = re.match(r'(.+)/data.csv', file_name).group(1)
-        with open('station_name.txt','a',encoding='ascii') as station_name_ref:
-            station_name_ref.write('!'+station_name+'!'+"\n")
+        with open('station_name.txt', 'a', encoding='ascii') as station_name_ref:
+            station_name_ref.write('!' + station_name + '!' + "\n")
         if station_name is None:
             continue
-        
-    
-        
+
         # Check if the station has corresponding data in the station data dictionary
-        if station_name not in station_data:
-            continue
-        
-        # Read the CSV data from the zip file
-        with input_zip_ref.open(file_name) as input_csv_file_ref:
-            binary_data = input_csv_file_ref.read()
-            ascii_data = binary_data.decode('ascii')
-            zip_data = pd.read_csv(StringIO(ascii_data))
-        
-        # Merge the station data with the zip data
-        merged_data = pd.concat([station_data[station_name], zip_data], ignore_index=True)
-        merged_data.drop_duplicates(subset=['dt', 'SETT'], keep='last', inplace=True)
-        
-        # Save the merged data as a new CSV file for the station
-        station_folder = os.path.join(output_folder, station_name)
-        os.makedirs(station_folder, exist_ok=True)
-        output_csv_file = os.path.join(station_folder, 'merged_data.csv')
-        merged_data.to_csv(output_csv_file, index=False)
-
-# Create a zip file containing the merged data for each station
-output_zip_file = 'merged_data_by_station.zip'
-
-with zipfile.ZipFile(output_zip_file, 'w', zipfile.ZIP_DEFLATED) as output_zip_file_ref:
-    # Traverse the output folder and add each merged CSV file to the zip file
-    for root, dirs, files in os.walk(output_folder):
-        for file in files:
-            file_path = os.path.join(root, file)
-            output_zip_file_ref.write(file_path, os.path.relpath(file_path, output_folder))
+        # Read the CSV data for this station from the zip file. It is needed
+        # whether or not a local station file exists, so it is read once here.
+        with input_zip_ref.open(file_name) as input_csv_file_ref:
+            binary_data = input_csv_file_ref.read()
+        ascii_data = binary_data.decode('ascii')
+        zip_data = pd.read_csv(StringIO(ascii_data))
+
+        # pop() fetches the local data (if any) and at the same time marks the
+        # station as processed, so that only local stations without a match in
+        # the zip archive are left in station_data after this loop.
+        local_data = station_data.pop(station_name, None)
+        if local_data is None:
+            # No local file for this station: the zip data is written as-is
+            output_data = zip_data
+        else:
+            # Append the zip rows to the local rows; among rows sharing the
+            # same 'dt' and 'SETT' only the last occurrence is kept.
+            output_data = pd.concat([local_data, zip_data], ignore_index=True)
+            output_data.drop_duplicates(
+                subset=['dt', 'SETT'], keep='last', inplace=True
+            )
+
+        # Save the result as <station_name>.csv directly in the output folder
+        os.makedirs(output_folder, exist_ok=True)
+        output_file_name = f"{station_name}.csv"
+        output_csv_file = os.path.join(output_folder, output_file_name)
+        output_data.to_csv(output_csv_file, index=False)
+
+    # Add the files from the "station" folder that are not in
+    # "cme_futures_hist_by_station" to the "CME_DDIF" folder. Those stations
+    # are exactly the entries still left in station_data.
+    # The folder is created here too, because the loop above may not have
+    # written any file (e.g. when the archive contains no CSV files).
+    os.makedirs(output_folder, exist_ok=True)
+    for station_name, station_csv_data in station_data.items():
+        output_file_name = f"{station_name}.csv"
+        output_csv_file = os.path.join(output_folder, output_file_name)
+        station_csv_data.to_csv(output_csv_file, index=False)
+
diff --git a/CME_Rollup.py b/CME_Rollup.py
@@ -85,6 +85,7 @@
     station_df['dt'] = pd.to_datetime(station_df['dt']).dt.strftime("%Y-%m-%d")
     print(station_df)"""
     import ipdb;ipdb.set_trace()
+    
 
 
     
diff --git a/CME_Zip.py b/CME_Zip.py
@@ -155,4 +155,5 @@
 					data_csv_file_ref.write(line)
 
 			# write the file to the zip file
-			output_zip_file_ref.write(data_csv_file, os.path.relpath(data_csv_file, output_folder))
+			output_zip_file_ref.write(data_csv_file, os.path.relpath(data_csv_file, output_folder))
+