Mar-22-2021, 10:10 AM
The problem is that when I scraping data from multiple .xlsm file,in ("DataSource": fn) it is taking the whole path of the file location "C:\Users\ShantanuGupta\Desktop\Test Python\202009 - September - Diamond Plod Day & Night MKY025.xlsm". But I wanted only the last portion "202009 - September - Diamond Plod Day & Night MKY025.xlsm" not the whole file location. Also attach picture.I have mentioned in the code with comment where the problem.
Any help????
![[Image: view?usp=sharing]](https://drive.google.com/file/d/1eRpqlHeJagq-_2ss6xNsDQ3lAxCZNY62/view?usp=sharing)
Any help????
from pathlib import Path
import time
import parser
import argparse
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
parser = argparse.ArgumentParser(description="Process some integers.")
parser.add_argument("path", help="define the directory to folder/file")
parser.add_argument("--verbose", help="display processing information")
start = time.time()
def main(path_xlsm, verbose):
if (".xlsm" in str(path_xlsm).lower()) and path_xlsm.is_file():
xlsm_files = [Path(path_xlsm)]
else:
xlsm_files = list(Path(path_xlsm).glob("*.xlsm"))
df = pd.DataFrame()
for fn in xlsm_files:
all_dfs = pd.read_excel(fn, sheet_name=None, header=None, engine="openpyxl")
list_data = all_dfs.keys()
all_dfs.pop("Date", None)
all_dfs.pop("Ops Report", None)
all_dfs.pop("Fuel Report", None)
all_dfs.pop("Bit Report", None)
all_dfs.pop("Plod Example", None)
all_dfs.pop("Plod Definitions", None)
all_dfs.pop("Consumables", None)
all_dfs.pop("Tables", None)
for ws in list_data: # Looping for excel sheet
df1 = all_dfs[ws]
df2 = pd.DataFrame()
if df1.iloc[41, 1] == "Drillers Comments":
row = 42
elif df1.iloc[44, 1] == "Drillers Comments":
row = 45
new_row = {
"PlodDate": df1.iloc[4, 3],
"PlodShift": df1.iloc[5, 3],
"RigNo": df1.iloc[2, 9],
"Location": df1.iloc[3, 3],
"DrillersComments": df1.iloc[row, 1],
"GeologistComments": df1.iloc[row, 14],
"MaintenanceComments": df1.iloc[row, 26],
"TravelInName": df1.iloc[2, 36],
"TravelInHours": df1.iloc[2, 45],
"TravelOutName": df1.iloc[3, 36],
"TravelOutHours": df1.iloc[3, 45],
"DataSource": fn, #Problem Problem Problem
}
df2 = df2.append(new_row, ignore_index=True)
df2 = df2[
[
"PlodDate",
"PlodShift",
"RigNo",
"Location",
"DrillersComments",
"GeologistComments",
"MaintenanceComments",
"TravelInName",
"TravelInHours",
"TravelOutName",
"TravelOutHours",
"DataSource",
]
]
cols = [
"Location",
"DrillersComments",
"GeologistComments",
"MaintenanceComments",
"TravelInName",
"TravelInHours",
"TravelOutName",
"TravelOutHours",
]
df2[cols].replace("", np.nan, inplace=True)
df2 = df2.dropna(subset=cols, how="all")
df2 = df2.replace(",", ";", regex=True)
df2 = df2.replace("\n", " ", regex=True)
df2 = df2.replace("\r", " ", regex=True)
df2 = df2.replace("\t", " ", regex=True)
df = df.append(df2)
df.to_csv("McKayPlod-1.csv", index=False)
if __name__ == "__main__":
start = time.time()
args = parser.parse_args()
path = Path(args.path)
verbose = args.verbose
main(path, verbose) #Calling Main Function
print("Processed time:", time.time() - start) #Total TimeImage
