Aug-12-2020, 03:35 PM
Hi
I have two large input files (>10 GB each, N×4 values). The task is to sort each file by its second column as fast as possible. Right now I read each file in chunks, sort every chunk, save the sorted chunks as text files, and then merge them (code below). It works, but I need better speed!
Is there a faster way of doing this? I later have to read the sorted files in chunks; how can this be done with the PyTables or h5py modules? Any other suggestions are welcome.
I have two large input files (>10 GB each, N×4 values). The task is to sort each file by its second column as fast as possible. Right now I read each file in chunks, sort every chunk, save the sorted chunks as text files, and then merge them (code below). It works, but I need better speed!
Is there a faster way of doing this? I later have to read the sorted files in chunks; how can this be done with the PyTables or h5py modules? Any other suggestions are welcome.
import os
from contextlib import ExitStack
from heapq import merge
from itertools import islice

filename = ['Input-1.txt', 'Input-2.txt']
savename = ['Sort-1.txt', 'Sort-2.txt']
chunksize = 10_000_000  # number of lines sorted in memory per chunk


def _sort_key(line):
    """Return the second comma-separated field of *line* as a float."""
    return float(line.split(',')[1])


def _write_sorted_chunks(src_path, chunk_dir, chunk_prefix, lines_per_chunk):
    """Split *src_path* into individually sorted chunk files.

    Chunk files are named ``<chunk_prefix>_<n>.tsv`` (n starting at 1) inside
    *chunk_dir*. Returns the chunk paths in the order they were written.
    """
    chunk_paths = []
    with open(src_path, 'r') as f_in:
        while True:
            lines = list(islice(f_in, lines_per_chunk))
            if not lines:
                break
            # Only the very last line of the file can lack a newline; without
            # one it would be glued to its successor once sorting moves it.
            if not lines[-1].endswith('\n'):
                lines[-1] += '\n'
            lines.sort(key=_sort_key)
            chunk_path = os.path.join(
                chunk_dir, f'{chunk_prefix}_{len(chunk_paths) + 1}.tsv'
            )
            with open(chunk_path, 'w') as f_out:
                f_out.writelines(lines)
            chunk_paths.append(chunk_path)
    return chunk_paths


def _merge_chunks(chunk_paths, dst_path):
    """K-way merge the already-sorted chunk files into *dst_path*."""
    # ExitStack closes every chunk handle, even if the merge raises.
    with ExitStack() as stack:
        chunks = [stack.enter_context(open(path, 'r')) for path in chunk_paths]
        with open(dst_path, 'w') as f_out:
            f_out.writelines(merge(*chunks, key=_sort_key))


def sort_file(src_path, dst_path, chunk_dir, chunk_prefix, lines_per_chunk=None):
    """Sort the lines of *src_path* by their second column into *dst_path*.

    This is an external merge sort: the input is read *lines_per_chunk* lines
    at a time, each chunk is sorted in memory and written to *chunk_dir*, and
    the chunk files are then merged lazily into the output. Lines with equal
    keys keep their input order. The chunk files are left on disk afterwards.

    Args:
        src_path: Comma-separated text file to sort.
        dst_path: File that receives the sorted lines.
        chunk_dir: Directory for the intermediate chunk files; created if
            it does not exist.
        chunk_prefix: File-name prefix of the chunk files.
        lines_per_chunk: Lines held in memory at once; defaults to the
            module-level ``chunksize``.

    Raises:
        ValueError: If *lines_per_chunk* is smaller than 1, or if the second
            field of a line is not a number.
        IndexError: If a line has fewer than two comma-separated fields.
    """
    if lines_per_chunk is None:
        lines_per_chunk = chunksize
    if lines_per_chunk < 1:
        raise ValueError('lines_per_chunk must be at least 1')

    os.makedirs(chunk_dir, exist_ok=True)
    # Merge exactly the chunks written by this run instead of globbing the
    # directory, so stale chunk files from an earlier run cannot leak into
    # the output and the merge order does not depend on directory listing.
    chunk_paths = _write_sorted_chunks(
        src_path, chunk_dir, chunk_prefix, lines_per_chunk
    )
    print(f'==> Writing {dst_path}')
    _merge_chunks(chunk_paths, dst_path)


def main():
    """Sort every file in ``filename`` into the matching ``savename`` entry."""
    # Chunk files are stored in dump/ next to this script.
    dump_dir = os.path.join(os.path.dirname(__file__), 'dump')
    for index, (src_path, dst_path) in enumerate(zip(filename, savename), 1):
        sort_file(src_path, dst_path, dump_dir, f'chunk{index}')


if __name__ == '__main__':
    main()
