Commit | Line | Data |
---|---|---|
3237d2a6 CH |
1 | # The software in this package is distributed under the GNU General |
2 | # Public License version 2 (with a special exception described below). | |
3 | # | |
4 | # A copy of GNU General Public License (GPL) is included in this distribution, | |
5 | # in the file COPYING.GPL. | |
6 | # | |
7 | # As a special exception, if other files instantiate templates or use macros | |
8 | # or inline functions from this file, or you compile this file and link it | |
9 | # with other works to produce a work based on this file, this file | |
10 | # does not by itself cause the resulting work to be covered | |
11 | # by the GNU General Public License. | |
12 | # | |
13 | # However the source code for this file must still be made available | |
14 | # in accordance with section (3) of the GNU General Public License. | |
15 | # | |
16 | # This exception does not invalidate any other reasons why a work based | |
17 | # on this file might be covered by the GNU General Public License. | |
f365f614 CH |
18 | # |
19 | # Copyright (c) 2016-2018 Intra2net AG <info@intra2net.com> | |
3237d2a6 | 20 | |
e2e13005 | 21 | """ |
e2e13005 CH |
22 | Iterative reading of log files, similar to shell command `tail -f`. |
23 | ||
24 | Copyright: Intra2net AG | |
25 | ||
7c362208 | 26 | Basic Functionality (class :py:class:`IterativeReader`): |
e7d49180 CH |
27 | Runs stat in a loop to find out whether file size has changed. Then reads the |
28 | new data and forwards that | |
29 | ||
e2e13005 | 30 | .. todo:: Want to also use lsof to find out whether file/pipe/socket was |
545f3458 | 31 | closed, so can automatically return from read loop. |
e7d49180 CH |
32 | |
33 | :py:class:`LineReader` takes output of :py:class:`IterativeReader` and returns | |
34 | it line-wise as is normal for log files | |
35 | ||
36 | :py:class:`LogParser` takes those lines and tries to parse them into fields | |
37 | like date, time, module name, urgency and message. | |
38 | ||
2713b352 | 39 | .. todo:: auto-detect log line layout |
e7d49180 | 40 | """ |
e7d49180 | 41 | import os |
215a12fd | 42 | import os.path |
445afb23 | 43 | import re |
e7d49180 | 44 | from warnings import warn |
77a07d09 | 45 | import logging |
a51235e2 | 46 | from contextlib import contextmanager |
1242b1cf | 47 | from .iter_helpers import zip_longest |
1242b1cf | 48 | from .type_helpers import is_str_or_byte, is_file_obj |
e7d49180 CH |
49 | |
50 | ||
class LogReadWarning(UserWarning):
    """Warning category used by the reader/parser classes of this module."""
54 | ||
55 | ||
e2e13005 CH |
def true_func(_):
    """Drop-in for :py:func:`check_is_used`; reports "in use" for anything."""
    return True
59 | ||
60 | ||
545f3458 CH |
def false_func(_):
    """Drop-in for :py:func:`check_is_used`; reports "unused" for anything."""
    return False
64 | ||
65 | ||
e2e13005 CH |
def check_is_used(file_handle):
    """
    Check whether file is being written to.

    Still a stub: a future implementation could e.g. query lsof.
    If beneficial could also easily supply python file object as arg.

    :param int file_handle: OS-level file descriptor
    :raises NotImplementedError: always, carrying the given descriptor
    """
    raise NotImplementedError(file_handle)
e7d49180 CH |
77 | |
78 | ||
#: running count of sources without name or descriptor, used by
#: :py:func:`create_description` to number them "unknown1", "unknown2", ...
_create_description_unknown_counter = 0


def create_description(file_obj, file_handle):
    """
    Create some description for given file-like object / file descriptor.

    Prefers the object's own `name` attribute; falls back to the numeric
    descriptor, and as a last resort invents a unique "unknownN" label.

    :param file_obj: file-like object
    :param int file_handle: os-level file descriptor
    :returns: Short description for file-like object
    :rtype: string
    """
    global _create_description_unknown_counter

    # a non-empty .name (e.g. from open()) is the best description
    name = getattr(file_obj, 'name', None)
    if name:
        return name

    if file_handle is None:
        # nothing usable at all --> hand out a unique placeholder
        _create_description_unknown_counter += 1
        return 'unknown{0}'.format(_create_description_unknown_counter)
    return 'file{0}'.format(file_handle)
106 | ||
107 | ||
#: error message for IterativeReader constructor; file *names* are rejected
#: because this module cannot guarantee closing files it did not open
_STR_ERR = 'not accepting file name "{0}" since cannot guarantee closing ' \
           'files --> use with open(file_name)!'
111 | ||
112 | ||
class IterativeReader(object):
    """
    Read continuously from a given file.

    Use `os.stat(file_obj.fileno()).st_size` as measure whether file has
    changed or not; Always reads as much data as possible.

    Does not care about closing files, so does not accept file names.

    This is the base for class :py:class:`LineReader` that just has to
    implement a different :py:meth:`prepare_result` method.
    """

    def __init__(self, sources, descs=None, keep_watching=False):
        """
        Create a reader; do some basic checks on args.

        :param sources: iterable over sources. Sources can be opened file
                        objects or read-opened os-level file descriptors.
                        Calling code has to ensure they are closed properly, so
                        best use this within a "with open(file_name) as
                        file_handle:"-context. If sources is a single file
                        obj/descriptor, both source and desc will be converted
                        to lists of length 1
        :param descs: can be anything of same length as sources. If sources is
                      a single source, then descs is also converted to a list
                      of length 1. If not given (i.e. None), will use
                      :py:func:`create_description` to guess descriptions
        :param bool keep_watching: keep watching file that is not changing in
                                   size. Need to manually tell whether file
                                   is being written to or not since auto-detect
                                   is not implemented yet.
        :raises: OSError when testing fstat on source
        """
        # reject unusable input early; strings are refused since this class
        # never closes files, so it must not be the one to open them
        if not sources:
            raise ValueError('need at least some source!')
        elif is_str_or_byte(sources):
            raise ValueError(_STR_ERR.format(sources))
        elif is_file_obj(sources) or isinstance(sources, int):
            # single source given --> normalize to 1-element lists
            source_input = [sources, ]
            desc_input = [descs, ]
        else:
            source_input = sources  # assume some iterable
            desc_input = descs

        # now divide sources into os-level file descriptors for os.fstat,
        # and file objects for read()
        self.file_objs = []
        self.file_handles = []  # file descriptOR, not descriptION
        for source in source_input:
            if is_file_obj(source):
                self.file_objs.append(source)
                self.file_handles.append(source.fileno())
            elif isinstance(source, int):
                # NOTE(review): fdopen creates a new file object around the
                # descriptor; closing responsibility stays with the caller
                self.file_objs.append(os.fdopen(source))
                self.file_handles.append(source)
            elif is_str_or_byte(source):
                raise ValueError(_STR_ERR.format(source))
            else:
                raise ValueError('source {0} is neither file obj nor file '
                                 'descriptor!')

            # try to fstat the new file descriptor just for testing
            os.fstat(self.file_handles[-1])

        # guess descriptions if not given
        if not desc_input:
            self.descriptions = [create_description(obj, file_handle)
                                 for obj, file_handle
                                 in zip(self.file_objs, self.file_handles)]
        else:
            try:
                if len(desc_input) != len(self.file_objs):
                    raise ValueError('need same number of sources and '
                                     'descriptions!')
            except TypeError:
                pass  # desc_input is generator or so

            # fill in create_description() guesses wherever descs had a None;
            # zip_longest pads with None so surplus descs are detected below
            self.descriptions = []
            for obj, file_handle, description in \
                    zip_longest(self.file_objs, self.file_handles, desc_input):
                if obj is None:
                    raise ValueError('more descriptions than sources!')
                elif description is None:
                    self.descriptions.append(create_description(obj,
                                                                file_handle))
                else:
                    self.descriptions.append(description)

        # per-source bookkeeping: last observed size and "stop watching" flag
        self.last_sizes = [0 for _ in self.file_objs]
        self.ignore = [False for _ in self.file_objs]

        if keep_watching:
            self.is_used_func = true_func
        else:
            self.is_used_func = false_func
            # use some day: self.is_used_func = check_is_used

        for obj, file_handle, description in \
                zip(self.file_objs, self.file_handles, self.descriptions):
            logging.debug('log_read initialized with file descriptor {0}, '
                          'file obj {1}, description "{2}"'
                          .format(file_handle, obj, description))

    def n_sources(self):
        """Return number of sources given to constructor."""
        return len(self.file_objs)

    def n_active_sources(self):
        """Return number of sources we are actually watching."""
        return len(self.ignore) - sum(self.ignore)

    def __iter__(self):
        """
        Continue reading from sources, yield results.

        yields result of :py:meth:`prepare_result`, which depends on what
        subclass you called this function from.

        Polls all non-ignored sources in a tight loop (no sleeping) and only
        returns once every source is flagged in :py:data:`self.ignore`.
        """
        while True:
            # all sources flagged as unused --> nothing left to watch
            if all(self.ignore):
                break

            for idx, (obj, file_handle, description, last_size, do_ignore) in \
                    enumerate(zip(self.file_objs, self.file_handles,
                                  self.descriptions, self.last_sizes,
                                  self.ignore)):
                if do_ignore:
                    continue

                # get new file size
                new_size = os.fstat(file_handle).st_size

                # compare to old size
                if new_size == last_size:
                    # unchanged: give up on this source unless it is
                    # (believed to be) still in use
                    if not self.is_used_func(file_handle):
                        self.ignore[idx] = True
                else:
                    if new_size < last_size:  # happened at start of some tests
                        warn('{0} / {1} has become smaller ({2} --> {3})! '
                             .format(obj, description, last_size, new_size)
                             + 'Maybe you are reading from a half-initialized '
                             + 'file?',
                             category=LogReadWarning)
                    try:
                        new_data = obj.read()
                    except OSError as ose:  # includes IOErrors
                        warn('io error reading from {0} / {1}: {2})'
                             .format(obj, description, ose),
                             category=LogReadWarning)
                        # forward the error text instead of data; keeps the
                        # iteration alive on transient read problems
                        new_data = str(ose)
                    except UnicodeDecodeError as ude:
                        warn('unicode error reading from {0} / {1}: {2}'
                             .format(obj, description, ude),
                             category=LogReadWarning)
                        new_data = str(ude)

                    # post-processing
                    for result in self.prepare_result(description, new_data,
                                                      idx):
                        yield result

                    # prepare next iteration
                    self.last_sizes[idx] = new_size

    def prepare_result(self, description, data, idx):
        """
        From raw new data create some yield-able results.

        Intended for overwriting in subclasses.

        This function is called from `__iter__` for each new data that becomes
        available. It has to provide results which are forwarded to caller.

        This base implementation just yields its input, so new data is yielded
        from `__iter__` as-is.

        :param str description: Description of source of lines, one of
                                :py:data:`self.descriptions`
        :param str data: Text data read from source
        :param idx: Index of data source
        :returns: nothing but yields [(description, data, idx)], same as input
        """
        yield description, data, idx
e7d49180 CH |
296 | |
297 | ||
e2e13005 | 298 | #: characters to `rstrip()` from end of complete lines |
e7d49180 CH |
299 | LINE_SPLITTERS = '\n\r' |
300 | ||
acc472a4 | 301 | |
e7d49180 | 302 | class LineReader(IterativeReader): |
e2e13005 CH |
303 | """ |
304 | An :py:class:`IterativeReader` that returns new data line-wise. | |
edd68a74 | 305 | |
e2e13005 | 306 | This means buffering partial line data. |
e7d49180 CH |
307 | """ |
308 | ||
309 | def __init__(self, *args, **kwargs): | |
e2e13005 | 310 | """Create an :py:class:`IterativeReader and buffers for sources.""" |
01fe1580 | 311 | super(LineReader, self).__init__(*args, **kwargs) |
d910eba5 | 312 | self.line_buffers = ['' for _ in range(self.n_sources())] |
e7d49180 | 313 | |
d910eba5 | 314 | def prepare_result(self, description, new_data, idx): |
e2e13005 CH |
315 | """ |
316 | Take raw new data and split it into lines. | |
e7d49180 | 317 | |
e2e13005 | 318 | If line is not complete, then buffer it. |
e7d49180 | 319 | |
e2e13005 | 320 | Args: see super class method :py:meth:`IterativeReader.prepare_result` |
df036fbe | 321 | |
aef38def CH |
322 | :returns: list of 3-tuples `(description, line, idx)` where |
323 | `description` and `idx` are same as args, and `line` is | |
e2e13005 | 324 | without trailing newline characters |
aef38def | 325 | :rtype: [(str, str, int)] |
e7d49180 | 326 | """ |
e7d49180 | 327 | all_data = self.line_buffers[idx] + new_data |
d910eba5 | 328 | self.line_buffers[idx] = '' |
e7d49180 | 329 | should_be_no_new_lines = False |
879f0150 | 330 | for line in all_data.splitlines(True): |
e7d49180 | 331 | if line[-1] in LINE_SPLITTERS: |
44353454 | 332 | yield description, line.rstrip(LINE_SPLITTERS), idx |
e7d49180 | 333 | elif should_be_no_new_lines: |
37288ebb CH |
334 | # self-check |
335 | raise ValueError('Programming error: something went wrong with ' | |
336 | 'line splitting/buffering.') | |
e7d49180 CH |
337 | else: |
338 | self.line_buffers[idx] = line | |
339 | should_be_no_new_lines = True # (this should be the last) | |
340 | ||
e7d49180 | 341 | |
445afb23 | 342 | class LogParser(LineReader): |
e2e13005 CH |
343 | """ |
344 | Takes lines from :py:class:`LineReader` and parses their contents. | |
445afb23 | 345 | |
e2e13005 | 346 | Requires a pattern for log lines, auto-detection is not implemented yet. |
445afb23 | 347 | |
545f3458 | 348 | Iteration returns :py:class:`re.match` result or -- if matching failed -- |
44353454 | 349 | None. The latest unparsed line is available as `self.last_unparsed_line`. |
df036fbe | 350 | Usage recommendation:: |
545f3458 CH |
351 | |
352 | with open(log_file_name, 'rt') as file_handle: | |
44353454 CH |
353 | parser = log_read.LogParser(file_handle, pattern=my_pattern): |
354 | for _, data, _ in parser: | |
355 | if data is None: | |
356 | print(f'Failed to parse line {parser.last_unparsed_line}') | |
545f3458 | 357 | continue |
44353454 | 358 | line_parts = data.groupdict() |
545f3458 | 359 | ...do stuff with line_parts... |
445afb23 CH |
360 | """ |
361 | ||
362 | def __init__(self, log_file, pattern=None): | |
e2e13005 CH |
363 | """ |
364 | Create a LogParser. | |
445afb23 CH |
365 | |
366 | :param str log_file: name of log file to parse (required!) | |
367 | :param pattern: regexp to split log lines; None (default) to return | |
368 | line as they are | |
e2e13005 | 369 | :type pattern: str or None (default) |
445afb23 CH |
370 | """ |
371 | super(LogParser, self).__init__(log_file) | |
372 | ||
373 | self.pattern = pattern | |
44353454 | 374 | self.last_unparsed_line = '' |
445afb23 CH |
375 | |
376 | def prepare_result(self, *args): | |
e2e13005 CH |
377 | """ |
378 | Try parsing lines. | |
379 | ||
380 | Args: see super class method :py:meth:`IterativeReader.prepare_result` | |
df036fbe | 381 | |
aef38def CH |
382 | :returns: 3-tuples `(description, line, idx)` where `description` and |
383 | `idx` are same as input args and `line` is either a | |
384 | :py:class:`re.Match` if line matched :py:data:`self.pattern` | |
385 | or just str if line did not match. | |
386 | :rtype: [(str, :py:class:`re.Match` OR str, int)] | |
e2e13005 | 387 | """ |
445afb23 | 388 | # let super class split data into lines |
aef38def CH |
389 | for description, raw_line, idx in \ |
390 | super(LogParser, self).prepare_result(*args): | |
1a537f05 CH |
391 | matches = re.match(self.pattern, raw_line) |
392 | if matches: | |
44353454 | 393 | yield description, matches, idx |
445afb23 | 394 | else: |
44353454 CH |
395 | self.last_unparsed_line = raw_line |
396 | yield description, None, idx | |
a51235e2 CH |
397 | |
398 | @classmethod | |
399 | @contextmanager | |
400 | def create_for(cls, filename, *args, **kwargs): | |
401 | """ | |
402 | Open single file, yield LogParser. Ensures file is closed afterwards. | |
403 | ||
df036fbe | 404 | This allows opening file and creation LogParser for it to one line:: |
a51235e2 CH |
405 | |
406 | with LogParser.create_for('/var/log/messages', SYS_LOG_PATTERN) as parser: | |
407 | for _, matches, _ in parser: | |
408 | try: | |
409 | print(matches.groupdict()) | |
410 | except Exception: | |
411 | print(f'UNPARSED: {parser.last_unparsed_line}') | |
412 | ||
413 | :param str filename: something that :py:meth:`open` accepts | |
414 | :param args: Forwarded to constructor | |
415 | :param kwargs: Forwarded to constructor | |
416 | """ | |
417 | with open(filename) as file_handle: | |
418 | yield cls(file_handle, *args, **kwargs) | |
0dd2fb19 CH |
419 | |
420 | ||
################################################################################
# PATTERNS FOR FREQUENT LOG FILES
################################################################################

# pattern of squid proxy access logs. group names are best guesses;
# timestamp alternative covers both human-readable and epoch formats
PROXY_LOG_PATTERN = \
    r'\s*(?P<timestamp>\d+\.\d+\.\d+\s+\d+:\d+:\d+|\d+\.\d+)\s+(?P<size1>\d+)\s+' \
    + r'(?P<ip>\d+\.\d+\.\d+\.\d+)\s+(?P<status_text>[A-Z_]+)/(?P<status_code>\d+)\s+' \
    + r'(?P<size2>\d+)\s+(?P<command>\S+)\s+(?P<url>\S+)\s+(?P<user>\S+)\s+' \
    + r'(?P<action>[A-Z_]+)/(?P<origin>\S+)\s+(?P<mimetype>\S+)\s+(?P<unknown>.*)\s*'

# pattern for linux system logs (usually "messages" or "syslog", also "maillog");
# "[pid]" after the process name is optional
SYS_LOG_PATTERN = \
    r'\s*(?P<timestamp>\w{3} +\d{1,2} \d{2}:\d{2}:\d{2}) (?P<hostname>\S+) ' \
    + r'(?P<procname>[^\[\]:]+)(?:\[(?P<pid>\d+)\])?: (?P<message>.*)'