Hi,
I am a beginner in python. I've read the Fluent in Python book, and I am trying to dive deeper.
Here is a file based merge sort. I have a file that contains around 10 million numbers.Each number is in their own line. The program will read 1 million numbers at a time, sort them, and store them in temporary files. Finally, a merge process will merge all temporary files into one output file.
I am not very confident about what I coded. Can you guys help make this little program more pythonic? Is there anything I missed that could save more memory or cpu time? Any help is appreciated.
I am a beginner in python. I've read the Fluent in Python book, and I am trying to dive deeper.
Here is a file based merge sort. I have a file that contains around 10 million numbers.Each number is in their own line. The program will read 1 million numbers at a time, sort them, and store them in temporary files. Finally, a merge process will merge all temporary files into one output file.
I am not very confident about what I coded. Can you guys help make this little program more pythonic? Is there anything I missed that could save more memory or cpu time? Any help is appreciated.
from itertools import zip_longest
import glob
def grouper(iterable, n, fillvalue=None):
"""Collect data into fixed-length chunks or blocks"""
# grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx"
args = [iter(iterable)] * n
return zip_longest(*args, fillvalue=fillvalue)
class MergeSort:
# temp folder
TEMP_FOLDER = "temp/"
# number of lines to read at a time
N = 1000000
def __init__(self, input_file_name, output_file_name):
self.input_file_name = input_file_name
self.output_file_name = output_file_name
def __divide(self):
"""read numbers to sort, divide into chunks, sort each chunk and write to temp files"""
with open(self.input_file_name, "r") as fin:
i = 1 # only used for logging purpose
for lines in grouper(fin, self.N):
print("Sorting batch " + str(i) + "...")
data = sorted(lines, key=int)
with open(self.TEMP_FOLDER + str(i) + ".text", "w+") as temp:
print("Writing batch " + str(i) + "...")
temp.writelines(data)
i += 1
def __merge(self):
"""merge sorted temp files into one"""
files = []
for f in glob.glob(self.TEMP_FOLDER + "*.text"):
print(f)
fin = open(f, "r")
files.append(fin)
with open(self.output_file_name, "w+") as fout:
values = []
for f in files:
val = int(f.readline())
values.append(val)
while files:
minval = min(values)
index = values.index(minval)
print(minval)
fout.write(str(minval) + "\n")
# read next line
data = files[index].readline()
if data:
del values[index] # values.pop(index)
values.insert(index, int(data))
else:
files[index].close() # close temp file
del files[index] # files.pop(index)
del values[index] # values.pop(index)
def mergesort(self):
self.__divide()
self.__merge()
if __name__ == '__main__':
# file to sort
INPUT_FILE_NAME = "numbers_to_sort.text"
# sorted file
OUTPUT_FILE_NAME = "sorted_numbers.text"
MergeSort(INPUT_FILE_NAME, OUTPUT_FILE_NAME).mergesort()
