Commit | Line | Data |
---|---|---|
6b2fa38f | 1 | #!/usr/bin/env python3 |
3759f796 | 2 | |
866c42e6 DGM |
3 | # Copyright (C) 2013 Intra2net AG |
4 | # | |
494b38aa DGM |
5 | # This program is free software; you can redistribute it and/or modify |
6 | # it under the terms of the GNU Lesser General Public License as published | |
7 | # by the Free Software Foundation; either version 3 of the License, or | |
866c42e6 DGM |
8 | # (at your option) any later version. |
9 | # | |
10 | # This program is distributed in the hope that it will be useful, | |
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
494b38aa | 13 | # GNU Lesser General Public License for more details. |
866c42e6 DGM |
14 | # |
15 | # You should have received a copy of the GNU General Public License | |
494b38aa DGM |
16 | # along with this program. If not, see |
17 | # <http://www.gnu.org/licenses/lgpl-3.0.html> | |
866c42e6 DGM |
18 | |
19 | ||
3759f796 ERE |
20 | import argparse |
21 | ||
# Number of bytes read from the input file per iteration.
BUFSIZE = 16 * 1024


def split_file(separator, prefix, input_file, new_file_func=None):
    '''
    Split input_file into numbered pieces at each occurrence of separator.

    The separator is a literal byte string (not a regular expression). Each
    occurrence starts a new output file and is written at the beginning of
    that file. Two occurrences do not start a new file: a separator at the
    very beginning of the input, and a separator that immediately follows
    another separator. Both are written to the current output instead, so
    no empty piece is ever produced.

    The input is read BUFSIZE bytes at a time; the result does not depend on
    where the read boundaries fall relative to the separators.

    separator:     non-empty bytes object to split on
    prefix:        passed to new_file_func; with the default new_file_func the
                   pieces are named prefix + "0", prefix + "1", ...
    input_file:    path of the file to split
    new_file_func: optional callable (prefix, index) returning a writable
                   binary object with write() and close()

    Raises ValueError (a subclass of Exception) if separator is empty.
    '''
    sep_len = len(separator)
    if sep_len == 0:
        raise ValueError("empty separator")

    if new_file_func is None:
        def open_numbered(prefix, i):
            return open(prefix + str(i), 'wb')
        new_file_func = open_numbered

    i = 0
    # unwritten bytes carried over from the previous read; never longer than
    # sep_len - 1, so it can only hold the beginning of a separator
    tail = b""
    # True while nothing but a separator (or nothing at all) has been written
    # since the last split decision; a separator found in that state does not
    # start a new file
    after_separator = True

    output = new_file_func(prefix, i)
    try:
        with open(input_file, 'rb') as f:
            while True:
                chunk = f.read(BUFSIZE)
                buf = tail + chunk
                start = 0

                idx = buf.find(separator)
                while idx != -1:
                    if idx > start or not after_separator:
                        if idx > start:
                            output.write(buf[start:idx])
                        output.close()
                        # avoid closing the old output twice if opening the
                        # next one fails
                        output = None
                        i += 1
                        output = new_file_func(prefix, i)
                    output.write(separator)
                    after_separator = True
                    start = idx + sep_len
                    idx = buf.find(separator, start)

                if not chunk:
                    # end of input: whatever is left cannot be completed into
                    # a separator any more, so flush it
                    if start < len(buf):
                        output.write(buf[start:])
                    break

                # keep the last sep_len - 1 bytes back: a separator may be
                # split between this read and the next one
                flush_end = max(start, len(buf) - (sep_len - 1))
                if flush_end > start:
                    output.write(buf[start:flush_end])
                    after_separator = False
                tail = buf[flush_end:]
    finally:
        if output is not None:
            output.close()
77 | ||
def chunk_file(input_file, output_file, from_pos, to_pos):
    '''
    Copy the bytes [from_pos, to_pos) of input_file into output_file.

    output_file is created or truncated. If the input ends before to_pos,
    only the available bytes are copied. If to_pos is smaller than from_pos,
    everything from from_pos up to the end of the input is copied (this is
    what a read() with a negative size does, and it is kept for
    compatibility).

    input_file:  path of the file to read from
    output_file: path of the file to write to
    from_pos:    offset of the first byte to copy
    to_pos:      offset just past the last byte to copy
    '''
    # copy in bounded pieces so a large range is never held in memory at once
    block_size = 64 * 1024
    remaining = to_pos - from_pos

    with open(input_file, 'rb') as ifl, open(output_file, 'wb') as ofl:
        ifl.seek(from_pos)
        while remaining != 0:
            if remaining < 0:
                # negative length: copy until the end of the input
                want = block_size
            else:
                want = min(block_size, remaining)
            data = ifl.read(want)
            if not data:
                break
            ofl.write(data)
            if remaining > 0:
                remaining -= len(data)
86 | ||
3759f796 ERE |
if __name__ == "__main__":
    # Two modes of operation:
    #  - chunk mode, when both --from-pos and --to-pos are given: copy that
    #    byte range of the input file into --output
    #  - split mode otherwise: split the input file at every --separator into
    #    files named <prefix>0, <prefix>1, ...
    parser = argparse.ArgumentParser()

    parser.add_argument("-s", "--separator", help="string for the separator")
    parser.add_argument("-p", "--prefix", help="prefix for split files")
    parser.add_argument("input_file", help="input file")
    parser.add_argument("-f", "--from-pos", type=int, default=-1)
    parser.add_argument("-t", "--to-pos", type=int, default=-1)
    parser.add_argument("-o", "--output", default=None)

    args = parser.parse_args()
    if args.from_pos > -1 and args.to_pos > -1:
        # report missing options as usage errors instead of failing with a
        # traceback inside open()
        if args.output is None:
            parser.error("--output is required together with "
                         "--from-pos and --to-pos")
        chunk_file(input_file=args.input_file, output_file=args.output,
                   from_pos=args.from_pos, to_pos=args.to_pos)
    else:
        if not args.separator:
            parser.error("a non-empty --separator is required to split a file")
        if args.prefix is None:
            parser.error("--prefix is required to split a file")
        # surrogateescape lets separators containing arbitrary bytes be
        # passed on the command line and round-trip unchanged
        split_file(separator=args.separator.encode('UTF-8', errors="surrogateescape"),
                   prefix=args.prefix, input_file=args.input_file)