Commit | Line | Data |
---|---|---|
3759f796 ERE |
1 | #!/usr/bin/env python |
2 | ||
3 | import argparse | |
4 | ||
BUFSIZE = 16 * 1024


def split_file(separator, prefix, input_file, bufsize=None):
    """
    Split a text file into numbered pieces at each occurrence of a separator.

    The input is read in chunks, so the whole file is never held in memory.
    Pieces are written to ``prefix + "0"``, ``prefix + "1"``, ... A separator
    that is preceded by content closes the current piece and is written at
    the beginning of the next one. A separator found at the very start of
    the input, or immediately after another separator, is appended to the
    current piece instead, so no piece ever consists of nothing but the
    text before a leading separator.

    Args:
        separator: literal (non-regex) string to split on; must be non-empty.
        prefix: path prefix of the output files; the piece index is appended.
        input_file: path of the file to read.
        bufsize: number of characters to read per chunk; defaults to BUFSIZE.

    Raises:
        ValueError: if ``separator`` is empty or ``bufsize`` is not positive.
    """
    if not separator:
        raise ValueError("empty separator")
    if bufsize is None:
        bufsize = BUFSIZE
    if bufsize < 1:
        raise ValueError("bufsize must be positive")

    sep_len = len(separator)
    # A separator may straddle two chunks. The last sep_len - 1 characters of
    # the buffer can never hold a complete separator, but they may hold the
    # start of one, so they are kept back until the next chunk arrives.
    keep = sep_len - 1

    index = 0
    buf = ""
    # True once the current piece holds something other than separators.
    # This must be tracked across chunks: looking only at the position of the
    # separator inside the buffer misses a split when earlier content has
    # already been flushed and the separator starts the next buffer.
    piece_has_content = False

    output = open(prefix + str(index), 'w')
    try:
        with open(input_file, 'r') as f:
            while True:
                chunk = f.read(bufsize)
                at_eof = not chunk
                buf += chunk

                # Consume every complete separator currently in the buffer.
                idx = buf.find(separator)
                while idx != -1:
                    if idx > 0 or piece_has_content:
                        output.write(buf[:idx])
                        output.close()
                        index += 1
                        output = open(prefix + str(index), 'w')
                        piece_has_content = False
                    output.write(separator)
                    buf = buf[idx + sep_len:]
                    idx = buf.find(separator)

                if at_eof:
                    # Nothing more can complete a partial separator, so the
                    # remainder is plain content. Flushing it here (instead
                    # of holding it back again) is what guarantees the loop
                    # terminates.
                    output.write(buf)
                    break

                # Flush everything except the tail that might be the start
                # of a separator completed by the next chunk.
                cut = len(buf) - keep
                if cut > 0:
                    output.write(buf[:cut])
                    piece_has_content = True
                    buf = buf[cut:]
    finally:
        # Close the current piece even if reading or writing failed.
        output.close()
57 | ||
if __name__ == "__main__":
    # Command-line entry point: collect the separator, the output prefix and
    # the input path from the arguments, then hand them to split_file.
    cli = argparse.ArgumentParser()

    cli.add_argument(
        "-s", "--separator",
        required=True,
        help="string for the separator",
    )
    cli.add_argument(
        "-p", "--prefix",
        required=True,
        help="prefix for split files",
    )
    cli.add_argument("input_file", help="input file")

    options = cli.parse_args()
    split_file(
        separator=options.separator,
        prefix=options.prefix,
        input_file=options.input_file,
    )