Question :
I would like to split a large text file around size of 50GB into multiple files.
Data in the files are like this-[x= any integer between 0-9]
xxx.xxx.xxx.xxx
xxx.xxx.xxx.xxx
xxx.xxx.xxx.xxx
xxx.xxx.xxx.xxx
...............
...............
There might be few billions of lines in the file and i would like write for example 30/40 millions per file.
I guess the steps would be-
- I’ve to open the file
- then using readline() have to read the file line by line and write at the same time to a new file
- and as soon as it hits the maximum number of lines it will create another file and
starts writing again.
I’m wondering, how to put all these steps together in a memory efficient and faster way. I’ve seen some examples in stack but none of them totally helping what i exactly need. I would really appreciate if anyone could help me out.
Answer #1:
This working solution uses split
command available in shell. Since the author has already accepted a possibility of a non-python solution, please do not downvote.
First, I created a test file with 1000M entries (15 GB) with
awk 'BEGIN{for (i = 0; i < 1000000000; i++) {print "123.123.123.123"} }' > t.txt
Then I used split
:
split --lines=30000000 --numeric-suffixes --suffix-length=2 t.txt t
It took 5 min to produce a set of 34 small files with names t00
–t33
. 33 files are 458 MB each and the last t33
is 153 MB.
Answer #2:
from itertools import chain, islice
def chunks(iterable, n):
"chunks(ABCDE,2) => AB CD E"
iterable = iter(iterable)
while True:
# store one line in memory,
# chain it to an iterator on the rest of the chunk
yield chain([next(iterable)], islice(iterable, n-1))
l = 30*10**6
file_large = 'large_file.txt'
with open(file_large) as bigfile:
for i, lines in enumerate(chunks(bigfile, l)):
file_split = '{}.{}'.format(file_large, i)
with open(file_split, 'w') as f:
f.writelines(lines)
Answer #3:
I would use the Unix utility split, if it is available to you and your only task is to split the file. Here is however a pure Python solution:
import contextlib
file_large = 'large_file.txt'
l = 30*10**6 # lines per split file
with contextlib.ExitStack() as stack:
fd_in = stack.enter_context(open(file_large))
for i, line in enumerate(fd_in):
if not i % l:
file_split = '{}.{}'.format(file_large, i//l)
fd_out = stack.enter_context(open(file_split, 'w'))
fd_out.write('{}n'.format(line))
If all of your lines have 4 3-digit numbers on them and you have multiple cores available, then you can exploit file seek and run multiple processes.
Answer #4:
This class may solve your problem.
I’ve tested it on Linux and Windows operating system, and it’s worked perfectly on both of them.
Also, I’ve tested binary and text file with different sizes each time and it was great.
Enjoy 🙂
import os
import math
class FileSpliter:
# If file type is text then CHUNK_SIZE is count of chars
# If file type is binary then CHUNK_SIZE is count of bytes
def __init__(self, InputFile, FileType="b", CHUNK_SIZE=524288, OutFile="outFile"):
self.CHUNK_SIZE = CHUNK_SIZE # byte or char
self.InputFile = InputFile
self.FileType = FileType # b: binary, t: text
self.OutFile = OutFile
self.FileSize = 0
self.Parts = None
self.CurrentPartNo = 0
self.Progress = 0.0
def Prepare(self):
if not(os.path.isfile(self.InputFile) and os.path.getsize(self.InputFile) > 0):
print("ERROR: The file is not exists or empty!")
return False
self.FileSize = os.path.getsize(self.InputFile)
if self.CHUNK_SIZE >= self.FileSize:
self.Parts = 1
else:
self.Parts = math.ceil(self.FileSize / self.CHUNK_SIZE)
return True
def Split(self):
if self.FileSize == 0 or self.Parts == None:
print("ERROR: File is not prepared for split!")
return False
with open(self.InputFile, "r" + self.FileType) as f:
while True:
if self.FileType == "b":
buf = bytearray(f.read(self.CHUNK_SIZE))
elif self.FileType == "t":
buf = f.read(self.CHUNK_SIZE)
else:
print("ERROR: File type error!")
if not buf:
# we've read the entire file in, so we're done.
break
of = self.OutFile + str(self.CurrentPartNo)
outFile = open(of, "w" + self.FileType)
outFile.write(buf)
outFile.close()
self.CurrentPartNo += 1
self.ProgressBar()
return True
def Rebuild(self):
self.CurrentPartNo = 0
if self.Parts == None:
return False
with open(self.OutFile, "w" + self.FileType) as f:
while self.CurrentPartNo < self.Parts:
If = self.OutFile + str(self.CurrentPartNo)
if not(os.path.isfile(If) and os.path.getsize(If) > 0):
print("ERROR: The file [" + If + "] is not exists or empty!")
return False
InputFile = open(If, "r" + self.FileType)
buf = InputFile.read()
if not buf:
# we've read the entire file in, so we're done.
break
f.write(buf)
InputFile.close()
os.remove(If)
self.CurrentPartNo += 1
self.ProgressBar()
return True
def ProgressBar(self, BarLength=20, ProgressIcon="#", BarIcon="-"):
try:
# You can't have a progress bar with zero or negative length.
if BarLength <1:
BarLength = 20
# Use status variable for going to the next line after progress completion.
Status = ""
# Calcuting progress between 0 and 1 for percentage.
self.Progress = float(self.CurrentPartNo) / float(self.Parts)
# Doing this conditions at final progressing.
if self.Progress >= 1.:
self.Progress = 1
Status = "rn" # Going to the next line
# Calculating how many places should be filled
Block = int(round(BarLength * self.Progress))
# Show this
Bar = "r[{}] {:.0f}% {}".format(ProgressIcon * Block + BarIcon * (BarLength - Block), round(self.Progress * 100, 0), Status)
print(Bar, end="")
except:
print("rERROR")
def main():
fp = FileSpliter(InputFile="inFile", FileType="b") #, CHUNK_SIZE=300000)
if fp.Prepare():
# Spliting ...
print("Spliting ...")
sr = fp.Split()
if sr == True:
print("The file splited successfully.")
print()
# Rebuilding ...
print("Rebuilding ...")
rr = fp.Rebuild()
if rr == True:
print("The file rebuilded successfully.")
if __name__ == "__main__":
main()
Answer #5:
I am writing a Python3 code solution which I usually use to split files having size in MBs.
However, I have not yet tried for files having size in GBs.
TextFileSplitter.py
import traceback
#get a file name to be read
fileToRead = input("Enter file name : ")
# max lines you want to write in a single file
fileLineCount = 2000
lineCount = 0
fileCount = 1
try:
print('Start splitting...')
#read a file
fileReader = open(fileToRead)
line = fileReader.readline()
fileWriter = open(str(fileCount)+".txt","a")
while line != '':#empty is EOF
if lineCount == 0:
#create a file in append mode
fileWriter = open(str(fileCount)+".txt","a")
#increment file count, use it for new file name
fileCount += 1
#write a line
fileWriter.write(line+"n")
lineCount += 1
if lineCount == fileLineCount:
lineCount = 0
fileWriter.close()
#read a line
line = fileReader.readline()
fileWriter.close()
except Exception as e:
#print the exception if any
print(e.__traceback__)
traceback.print_exc()
finally:
#close the file reader
fileReader.close()
o/p will look like, files, each having fileLineCount(i.e. 2000) lines, created in a same directory as :
1.txt
2.txt
3.txt
.
.
.
.
n.txt