| 1 | #!/usr/bin/env python3 |
| 2 | |
| 3 | # Copyright (C) 2013 Intra2net AG |
| 4 | # |
| 5 | # This program is free software; you can redistribute it and/or modify |
| 6 | # it under the terms of the GNU Lesser General Public License as published |
| 7 | # by the Free Software Foundation; either version 3 of the License, or |
| 8 | # (at your option) any later version. |
| 9 | # |
| 10 | # This program is distributed in the hope that it will be useful, |
| 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 13 | # GNU Lesser General Public License for more details. |
| 14 | # |
| 15 | # You should have received a copy of the GNU General Public License |
| 16 | # along with this program. If not, see |
| 17 | # <http://www.gnu.org/licenses/lgpl-3.0.html> |
| 18 | |
| 19 | |
| 20 | import argparse |
| 21 | |
# Default number of bytes read from the input file per iteration by
# split_file() when the caller does not pass an explicit ``bufsize``.
BUFSIZE = 16 * 1024


def split_file(separator, prefix, input_file, new_file_func=None,
               bufsize=None):
    '''
    Split a file into several outputs at every occurrence of a separator.

    The separator is a literal byte string (not a regular expression). Each
    occurrence ends the current output and starts a new one, and the
    separator itself is written at the beginning of the new output. A
    separator that is not preceded by any data in the current output (for
    instance at the very start of the input, or directly after another
    separator) does not start a new output; it is appended to the current one.

    The input is processed in chunks of ``bufsize`` bytes, so the whole file
    is never held in memory. The result does not depend on where the chunk
    boundaries fall relative to the separators.

    Parameters:
      separator      non-empty bytes object to search for
      prefix         passed as first argument to ``new_file_func``; with the
                     default factory the outputs are named prefix + "0",
                     prefix + "1", ...
      input_file     path of the file to split
      new_file_func  optional callable ``(prefix, index)`` returning an object
                     with ``write()`` and ``close()``; defaults to opening
                     ``prefix + str(index)`` for binary writing
      bufsize        optional read size in bytes; defaults to the module level
                     BUFSIZE

    Raises ValueError if the separator is empty or bufsize is not positive.
    '''
    sep_len = len(separator)
    if sep_len == 0:
        raise ValueError("empty separator")

    if bufsize is None:
        bufsize = BUFSIZE
    if bufsize <= 0:
        raise ValueError("bufsize must be a positive integer")

    if new_file_func is None:
        def new_file_func(prefix, i):
            return open(prefix + str(i), 'wb')

    i = 0
    buf = b""
    # The longest proper prefix of the separator that may sit at the end of
    # the buffer while the rest of it is still unread.
    keep = sep_len - 1
    # True once non-separator data has been flushed to the current output in a
    # previous iteration; a separator found at offset 0 must then still split.
    has_data = False

    output = new_file_func(prefix, i)
    try:
        # buffered search: we avoid having the whole input file in memory, as
        # it is not needed
        with open(input_file, 'rb') as f:
            while True:
                chunk = f.read(bufsize)
                at_eof = not chunk
                buf += chunk

                # split at every complete separator currently in the buffer
                idx = buf.find(separator)
                while idx != -1:
                    if idx > 0 or has_data:
                        if idx > 0:
                            output.write(buf[:idx])
                        output.close()
                        i += 1
                        output = new_file_func(prefix, i)
                        has_data = False
                    output.write(separator)
                    buf = buf[idx + sep_len:]
                    idx = buf.find(separator)

                if at_eof:
                    # nothing more can complete a partial separator
                    if buf:
                        output.write(buf)
                    break

                # Flush everything except the last sep_len - 1 bytes: those
                # could be the beginning of a separator that is completed by
                # the next read, so they stay in the buffer.
                flush_len = len(buf) - keep
                if flush_len > 0:
                    output.write(buf[:flush_len])
                    buf = buf[flush_len:]
                    has_data = True
    finally:
        output.close()
| 77 | |
def chunk_file(input_file, output_file, from_pos, to_pos):
    '''
    Copy the byte range [from_pos, to_pos) of input_file into output_file.

    output_file is created or truncated. If the range extends past the end of
    the input, only the bytes that exist are copied.

    Raises ValueError if from_pos is negative or to_pos is smaller than
    from_pos. The check happens before output_file is opened, so an invalid
    range never truncates an existing file.
    '''
    if from_pos < 0 or to_pos < from_pos:
        # A negative length handed to read() would silently copy everything
        # up to the end of the file instead of the requested range.
        raise ValueError("invalid range: from_pos=%d, to_pos=%d"
                         % (from_pos, to_pos))

    remaining = to_pos - from_pos
    step = 16 * 1024  # copy in bounded pieces instead of one huge read

    with open(input_file, 'rb') as ifl, open(output_file, 'wb') as ofl:
        ifl.seek(from_pos)
        while remaining > 0:
            data = ifl.read(min(step, remaining))
            if not data:
                # reached the end of the input before to_pos
                break
            ofl.write(data)
            remaining -= len(data)
| 86 | |
if __name__ == "__main__":
    # Two modes of operation:
    #  * chunk mode, when both --from-pos and --to-pos are non-negative:
    #    copy that byte range of the input file into --output;
    #  * split mode otherwise: split the input file at every --separator
    #    into files named after --prefix.
    parser = argparse.ArgumentParser()

    parser.add_argument("-s", "--separator", help="string for the separator")
    parser.add_argument("-p", "--prefix", help="prefix for split files")
    parser.add_argument("input_file", help="input file")
    parser.add_argument("-f", "--from-pos", type=int, default=-1)
    parser.add_argument("-t", "--to-pos", type=int, default=-1)
    parser.add_argument("-o", "--output", default=None)

    args = parser.parse_args()
    if args.from_pos > -1 and args.to_pos > -1:
        # Report missing options as usage errors instead of failing later
        # with a traceback from open(None).
        if args.output is None:
            parser.error("--output is required together with "
                         "--from-pos and --to-pos")
        chunk_file(input_file=args.input_file, output_file=args.output,
                   from_pos=args.from_pos, to_pos=args.to_pos)
    else:
        if not args.separator:
            parser.error("a non-empty --separator is required to split a file")
        if args.prefix is None:
            parser.error("--prefix is required to split a file")
        # surrogateescape lets arbitrary non-UTF-8 bytes given on the command
        # line round-trip into the bytes separator unchanged
        split_file(separator=args.separator.encode('UTF-8', errors="surrogateescape"),
                   prefix=args.prefix, input_file=args.input_file)