#!/usr/bin/env python3

# Copyright (C) 2013 Intra2net AG
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published
# by the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see
# <http://www.gnu.org/licenses/>

import argparse

# Number of bytes read from the input per iteration.
BUFSIZE = 16 * 1024


def split_file(separator, prefix, input_file, new_file_func=None):
    '''
    Split a file into numbered pieces at every occurrence of a separator.

    The separator is a literal byte string (not a regular expression). Each
    occurrence starts a new output and is written as the first bytes of that
    output. Two exceptions, both kept from the historical behaviour:

    - a separator at the very beginning of the input stays in output 0
      (no empty leading output is produced);
    - a separator that immediately follows another separator stays in the
      same output (no output containing only a separator is produced).

    The input is processed in BUFSIZE pieces, so the whole file is never held
    in memory. Separators that straddle two reads, or that start exactly at a
    read boundary, are detected like any other.

    separator     -- non-empty bytes object to split on
    prefix        -- passed to new_file_func; with the default factory it is
                     the path prefix of the outputs (prefix + "0", "1", ...)
    input_file    -- path of the file to split
    new_file_func -- optional callable (prefix, index) returning an object
                     with write(bytes) and close(); defaults to opening
                     prefix + str(index) for binary writing

    Raises ValueError (a subclass of Exception) if separator is empty.
    '''
    sep_len = len(separator)
    if sep_len == 0:
        raise ValueError("empty separator")

    if new_file_func is None:
        def new_file_func(prefix, i):
            return open(prefix + str(i), 'wb')

    # At most sep_len - 1 trailing bytes can be the beginning of a separator
    # that is completed by the next read, so that many bytes are held back.
    keep = sep_len - 1

    i = 0
    buf = b""
    # True while the current output holds nothing but separators (or nothing
    # at all). A separator found at the very front of the buffer only starts
    # a new output when this is False, i.e. when data was already flushed.
    fresh = True

    output = new_file_func(prefix, i)
    try:
        with open(input_file, 'rb') as f:
            while True:
                chunk = f.read(BUFSIZE)
                at_eof = not chunk
                buf += chunk

                # split on every complete separator currently in the buffer
                start = 0
                idx = buf.find(separator, start)
                while idx != -1:
                    if idx > start or not fresh:
                        output.write(buf[start:idx])
                        output.close()
                        i += 1
                        output = new_file_func(prefix, i)
                    output.write(separator)
                    fresh = True
                    start = idx + sep_len
                    idx = buf.find(separator, start)
                buf = buf[start:]

                # Flush what cannot be part of a straddling separator; at
                # end of input nothing more can arrive, so flush everything.
                cut = len(buf) if at_eof else max(len(buf) - keep, 0)
                if cut:
                    output.write(buf[:cut])
                    buf = buf[cut:]
                    fresh = False

                if at_eof:
                    break
    finally:
        # close the current output even if opening/reading the input failed
        output.close()


def chunk_file(input_file, output_file, from_pos, to_pos):
    '''
    Copy the byte range [from_pos, to_pos) of input_file into output_file.

    The copy is done in BUFSIZE pieces so that large ranges are not loaded
    into memory at once. If the input ends before to_pos, the copy stops at
    end of file. If to_pos is smaller than from_pos, everything from from_pos
    to the end of the input is copied; this mirrors what a read() with a
    negative size does and is kept for backward compatibility.

    output_file is created or truncated.
    '''
    remaining = to_pos - from_pos
    with open(input_file, 'rb') as ifl, open(output_file, 'wb') as ofl:
        ifl.seek(from_pos)
        while remaining != 0:
            # negative remaining means "until end of file"
            size = BUFSIZE if remaining < 0 else min(BUFSIZE, remaining)
            data = ifl.read(size)
            if not data:
                break
            ofl.write(data)
            if remaining > 0:
                remaining -= len(data)


def main(argv=None):
    '''
    Command line entry point.

    With both --from-pos and --to-pos given (>= 0), extract that byte range
    of the input into --output. Otherwise split the input at --separator into
    files named --prefix followed by a running number.

    argv -- argument list to parse; None means sys.argv[1:]
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--separator", help="string for the separator")
    parser.add_argument("-p", "--prefix", help="prefix for split files")
    parser.add_argument("input_file", help="input file")
    parser.add_argument("-f", "--from-pos", type=int, default=-1)
    parser.add_argument("-t", "--to-pos", type=int, default=-1)
    parser.add_argument("-o", "--output", default=None)
    args = parser.parse_args(argv)

    if args.from_pos > -1 and args.to_pos > -1:
        if args.output is None:
            parser.error("-o/--output is required together with "
                         "-f/--from-pos and -t/--to-pos")
        chunk_file(input_file=args.input_file,
                   output_file=args.output,
                   from_pos=args.from_pos,
                   to_pos=args.to_pos)
    else:
        # report missing options as usage errors instead of crashing on None
        if not args.separator or args.prefix is None:
            parser.error("a non-empty -s/--separator and -p/--prefix are "
                         "required to split a file")
        split_file(separator=args.separator.encode('UTF-8',
                                                   errors="surrogateescape"),
                   prefix=args.prefix,
                   input_file=args.input_file)


if __name__ == "__main__":
    main()