Commit | Line | Data |
---|---|---|
3759f796 ERE |
1 | #!/usr/bin/env python |
2 | ||
866c42e6 DGM |
3 | # Copyright (C) 2013 Intra2net AG |
4 | # | |
5 | # This program is free software: you can redistribute it and/or modify | |
6 | # it under the terms of the GNU General Public License as published by | |
7 | # the Free Software Foundation, either version 3 of the License, or | |
8 | # (at your option) any later version. | |
9 | # | |
10 | # This program is distributed in the hope that it will be useful, | |
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 | # GNU General Public License for more details. | |
14 | # | |
15 | # You should have received a copy of the GNU General Public License | |
16 | # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
17 | ||
18 | ||
3759f796 ERE |
19 | import argparse |
20 | ||
# Default number of characters read from the input file per iteration.
BUFSIZE = 16 * 1024


def split_file(separator, prefix, input_file, new_file_func=None,
               bufsize=None):
    '''
    Split input_file into numbered pieces at each occurrence of separator.

    The separator is a plain string (not a regular expression). Every piece
    except possibly the first one begins with the separator itself. A new
    piece is started whenever a separator is found after some non-separator
    data has gone into the current piece; consequently a separator at the
    very beginning of the input, or one that immediately follows another
    separator, stays in the current piece instead of producing an empty one.

    separator     -- non-empty string marking the start of a new piece
    prefix        -- passed to new_file_func together with the piece number
    input_file    -- path of the file to read (opened in text mode)
    new_file_func -- callable(prefix, i) returning a writable object with
                     write() and close(); defaults to opening the file
                     named prefix + str(i) for writing
    bufsize       -- number of characters to read per iteration; defaults
                     to the module level BUFSIZE

    Raises Exception if separator is empty and ValueError if bufsize is not
    positive.
    '''
    sep_len = len(separator)
    if sep_len == 0:
        raise Exception("empty separator")

    if bufsize is None:
        bufsize = BUFSIZE
    if bufsize < 1:
        raise ValueError("bufsize must be positive")

    if new_file_func is None:
        new_file_func = lambda prefix, i: open(prefix + str(i), 'w')

    # A separator can straddle two reads, so the last sep_len - 1 characters
    # of every non-final buffer are held back until more data is available.
    keep = sep_len - 1

    i = 0
    buf = ""
    # True once the current piece holds data other than leading separators.
    # Needed because a separator found at index 0 of the buffer may still be
    # preceded by data that was flushed during an earlier iteration.
    has_data = False

    output = new_file_func(prefix, i)
    try:
        # Buffered search: the whole input file is never held in memory.
        with open(input_file, 'r') as f:
            while True:
                chunk = f.read(bufsize)
                at_eof = not chunk
                buf += chunk

                idx = buf.find(separator)
                while idx != -1:
                    if idx > 0 or has_data:
                        # Finish the current piece and start the next one.
                        output.write(buf[:idx])
                        output.close()
                        i += 1
                        output = new_file_func(prefix, i)
                        has_data = False
                    output.write(separator)
                    buf = buf[idx + sep_len:]
                    idx = buf.find(separator)

                if at_eof:
                    # Nothing more can complete a partial separator, so the
                    # remainder belongs to the current piece.
                    output.write(buf)
                    break

                flush_len = len(buf) - keep
                if flush_len > 0:
                    output.write(buf[:flush_len])
                    buf = buf[flush_len:]
                    has_data = True
    finally:
        # Close the current piece even when reading or writing failed.
        output.close()
76 | ||
if __name__ == "__main__":
    # Command line entry point: both options are mandatory and the input
    # file is given as the single positional argument.
    cli = argparse.ArgumentParser()

    required_options = (
        ("-s", "--separator", "string for the separator"),
        ("-p", "--prefix", "prefix for split files"),
    )
    for short_flag, long_flag, description in required_options:
        cli.add_argument(short_flag, long_flag, required=True,
                         help=description)
    cli.add_argument("input_file", help="input file")

    # The parsed namespace holds exactly separator, prefix and input_file,
    # which are forwarded to split_file as keyword arguments.
    options = vars(cli.parse_args())
    split_file(**options)