Commit | Line | Data |
---|---|---|
3759f796 ERE |
1 | #!/usr/bin/env python |
2 | ||
866c42e6 DGM |
3 | # Copyright (C) 2013 Intra2net AG |
4 | # | |
494b38aa DGM |
5 | # This program is free software; you can redistribute it and/or modify |
6 | # it under the terms of the GNU Lesser General Public License as published | |
7 | # by the Free Software Foundation; either version 3 of the License, or | |
866c42e6 DGM |
8 | # (at your option) any later version. |
9 | # | |
10 | # This program is distributed in the hope that it will be useful, | |
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
494b38aa | 13 | # GNU Lesser General Public License for more details. |
866c42e6 DGM |
14 | # |
15 | # You should have received a copy of the GNU Lesser General Public License | |
494b38aa DGM |
16 | # along with this program. If not, see |
17 | # <http://www.gnu.org/licenses/lgpl-3.0.html> | |
866c42e6 DGM |
18 | |
19 | ||
3759f796 ERE |
20 | import argparse |
21 | ||
# Number of characters requested from the input file per read() call.
BUFSIZE = 16 * 1024


def split_file(separator, prefix, input_file, new_file_func=None):
    '''
    Split a file into several output files at every occurrence of a
    separator string.

    Each occurrence of the separator starts a new output file and is written
    at the beginning of that file. The only exception is a separator found
    while the current output file is still empty (i.e. the input itself
    starts with the separator): it is written to that file instead of
    leaving an empty file behind.

    The input is read in chunks of BUFSIZE so the whole file is never held
    in memory.

    Parameters:
      separator      non-empty string marking where a new file starts
      prefix         passed as first argument to new_file_func; with the
                     default factory the outputs are named prefix + "0",
                     prefix + "1", ...
      input_file     path of the file to split (opened in text mode)
      new_file_func  optional callable (prefix, index) returning an object
                     with write() and close(); defaults to opening
                     prefix + str(index) for writing

    Raises ValueError if the separator is empty.
    '''
    sep_len = len(separator)
    if sep_len == 0:
        raise ValueError("empty separator")

    if new_file_func is None:
        new_file_func = lambda prefix, i: open(prefix + str(i), 'w')

    # A separator cut in half by a chunk boundary leaves at most
    # sep_len - 1 of its characters at the end of the buffer, so that is
    # all that has to be held back between two reads.
    tail_len = sep_len - 1

    i = 0
    buf = ""
    output = new_file_func(prefix, i)
    # True once something has been written to the current output file.
    has_data = False

    try:
        with open(input_file, 'r') as f:
            while True:
                chunk = f.read(BUFSIZE)
                buf += chunk

                # split on every complete separator currently in the buffer
                idx = buf.find(separator)
                while idx >= 0:
                    if idx > 0:
                        output.write(buf[:idx])
                        has_data = True

                    # Start a new file unless the current one is still
                    # empty. This must not depend on idx alone: a separator
                    # at index 0 can also follow data flushed by an earlier
                    # iteration.
                    if has_data:
                        output.close()
                        i += 1
                        output = new_file_func(prefix, i)

                    output.write(separator)
                    has_data = True
                    buf = buf[idx + sep_len:]
                    idx = buf.find(separator)

                if not chunk:
                    # end of input: whatever is left cannot contain a
                    # separator any more
                    if buf:
                        output.write(buf)
                    break

                # Flush everything except the tail that could be the start
                # of a separator completed by the next chunk.
                safe_len = len(buf) - tail_len
                if safe_len > 0:
                    output.write(buf[:safe_len])
                    has_data = True
                    buf = buf[safe_len:]
    finally:
        # close the current output even if reading or writing failed
        output.close()
77 | ||
if __name__ == "__main__":
    # Command-line entry point: both options are mandatory and are handed to
    # split_file() together with the positional input file.
    cli = argparse.ArgumentParser()

    required_options = (
        (("-s", "--separator"), "string for the separator"),
        (("-p", "--prefix"), "prefix for split files"),
    )
    for flags, description in required_options:
        cli.add_argument(*flags, required=True, help=description)
    cli.add_argument("input_file", help="input file")

    options = cli.parse_args()
    split_file(options.separator, options.prefix, options.input_file)