# The software in this package is distributed under the GNU General
# Public License version 2 (with a special exception described below).
#
# A copy of GNU General Public License (GPL) is included in this distribution,
# in the file COPYING.GPL.
#
# As a special exception, if other files instantiate templates or use macros
# or inline functions from this file, or you compile this file and link it
# with other works to produce a work based on this file, this file
# does not by itself cause the resulting work to be covered
# by the GNU General Public License.
#
# However the source code for this file must still be made available
# in accordance with section (3) of the GNU General Public License.
#
# This exception does not invalidate any other reasons why a work based
# on this file might be covered by the GNU General Public License.
#
# Copyright (c) 2016-2018 Intra2net AG <info@intra2net.com>

21"""
22Iterative reading of log files, similar to shell command `tail -f`.
23
24Copyright: Intra2net AG
25
26Basic Functionality (class :py:class:`IterativeReader`):
27Runs stat in a loop to find out whether file size has changed. Then reads the
28new data and forwards that
29
30.. todo:: Want to also use lsof to find out whether file/pipe/socket was
31 closed, so can automatically return from read loop.
32
33:py:class:`LineReader` takes output of :py:class:`IterativeReader` and returns
34it line-wise as is normal for log files
35
36:py:class:`LogParser` takes those lines and tries to parse them into fields
37like date, time, module name, urgency and message.
38
39.. todo:: auto-detect log line layout
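
A minimal usage sketch (the log file path is made up; with
`keep_watching=True` the loop below runs until interrupted)::

    with open('/var/log/some.log', 'rt') as log_file:
        for description, line, idx in LineReader(log_file, keep_watching=True):
            print('{0}: {1}'.format(description, line))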
40"""
41import os
42import os.path
43import re
44from warnings import warn
45import logging
46from contextlib import contextmanager
47from .iter_helpers import zip_longest
48from .type_helpers import is_str_or_byte, is_file_obj
49
50
51class LogReadWarning(UserWarning):
52 """Warnings issued by classes in this module."""
53 pass
54
55
56def true_func(_):
57 """Replacement for :py:func:`check_is_used`. Returns `True` always."""
58 return True
59
60
61def false_func(_):
62 """Replacement for :py:func:`check_is_used`. Returns `False` always."""
63 return False
64
65
def check_is_used(file_handle):
    """
    Check whether the file is being written to.

    To be implemented, e.g. using lsof.

    If beneficial, the Python file object could also be supplied as an
    argument.

    :param int file_handle: OS-level file descriptor
    """
    raise NotImplementedError(file_handle)


#: counter for unknown sources in :py:func:`create_description`
_create_description_unknown_counter = 0


def create_description(file_obj, file_handle):
    """
    Create some description for given file-like object / file descriptor.

    :param file_obj: file-like object
    :param int file_handle: os-level file descriptor
    :returns: Short description for file-like object
    :rtype: string
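
    For a named file object, the description is simply the file name (a
    sketch; the path is made up)::

        with open('/tmp/example.log') as file_obj:
            create_description(file_obj, file_obj.fileno())
            # --> '/tmp/example.log'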
91 """
92 global _create_description_unknown_counter
93
94 try:
95 desc = file_obj.name
96 if desc:
97 return desc
98 except AttributeError:
99 pass
100
101 if file_handle is not None:
102 return 'file{0}'.format(file_handle)
103 else:
104 _create_description_unknown_counter += 1
105 return 'unknown{0}'.format(_create_description_unknown_counter)
106
107
108#: error message for IterativeReader constructor
109_STR_ERR = 'not accepting file name "{0}" since cannot guarantee closing ' \
110 'files --> use with open(file_name)!'
111
112
class IterativeReader(object):
    """
    Read continuously from a given file.

    Uses `os.stat(file_obj.fileno()).st_size` as a measure of whether the
    file has changed; always reads as much data as possible.

    Does not care about closing files, so does not accept file names.

    This is the base for class :py:class:`LineReader` that just has to
    implement a different :py:meth:`prepare_result` method.
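
    A minimal sketch of reading raw chunks directly (the file name is made
    up; subclasses like :py:class:`LineReader` are usually more convenient)::

        with open('/var/log/some.log', 'rt') as log_file:
            for description, data, idx in IterativeReader(log_file):
                print('new data from {0}: {1!r}'.format(description, data))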
124 """
125
126 def __init__(self, sources, descs=None, keep_watching=False):
127 """
128 Create a reader; do some basic checks on args.
129
130 :param sources: iterable over sources. Sources can be opened file
131 objects or read-opened os-level file descriptors.
132 Calling code has to ensure they are closed properly, so
133 best use this within a "with open(file_name) as
134 file_handle:"-context. If sources is a single file
135 obj/descriptor, both source and desc will be converted
136 to lists of length 1
137 :param descs: can be anything of same length as sources. If sources is
138 a single source, then descs is also converted to a list
139 of length 1. If not given (i.e. None), will use
140 :py:func:`create_description` to guess descriptions
141 :param bool keep_watching: keep watching file that is not changing in
142 size. Need to manually tell whether file
143 is being written to or not since auto-detect
144 is not implemented yet.
145 :raises: OSError when testing fstat on source
146 """
        if not sources:
            raise ValueError('need at least some source!')
        elif is_str_or_byte(sources):
            raise ValueError(_STR_ERR.format(sources))
        elif is_file_obj(sources) or isinstance(sources, int):
            source_input = [sources, ]
            desc_input = [descs, ]
        else:
            source_input = sources  # assume some iterable
            desc_input = descs

        # now divide sources into os-level file descriptors for os.fstat,
        # and file objects for read()
        self.file_objs = []
        self.file_handles = []  # file descriptOR, not descriptION
        for source in source_input:
            if is_file_obj(source):
                self.file_objs.append(source)
                self.file_handles.append(source.fileno())
            elif isinstance(source, int):
                self.file_objs.append(os.fdopen(source))
                self.file_handles.append(source)
            elif is_str_or_byte(source):
                raise ValueError(_STR_ERR.format(source))
            else:
                raise ValueError('source {0} is neither file obj nor file '
                                 'descriptor!'.format(source))

            # try to fstat the new file descriptor just for testing
            os.fstat(self.file_handles[-1])

        # guess descriptions if not given
        if not desc_input:
            self.descriptions = [create_description(obj, file_handle)
                                 for obj, file_handle
                                 in zip(self.file_objs, self.file_handles)]
        else:
            try:
                if len(desc_input) != len(self.file_objs):
                    raise ValueError('need same number of sources and '
                                     'descriptions!')
            except TypeError:
                pass  # desc_input is a generator or similar

            self.descriptions = []
            for obj, file_handle, description in \
                    zip_longest(self.file_objs, self.file_handles, desc_input):
                if obj is None:
                    raise ValueError('more descriptions than sources!')
                elif description is None:
                    self.descriptions.append(create_description(obj,
                                                                file_handle))
                else:
                    self.descriptions.append(description)

        self.last_sizes = [0 for _ in self.file_objs]
        self.ignore = [False for _ in self.file_objs]

        if keep_watching:
            self.is_used_func = true_func
        else:
            self.is_used_func = false_func
            # use some day: self.is_used_func = check_is_used

        for obj, file_handle, description in \
                zip(self.file_objs, self.file_handles, self.descriptions):
            logging.debug('log_read initialized with file descriptor {0}, '
                          'file obj {1}, description "{2}"'
                          .format(file_handle, obj, description))

    def n_sources(self):
        """Return the number of sources given to the constructor."""
        return len(self.file_objs)

    def n_active_sources(self):
        """Return the number of sources we are actually watching."""
        return len(self.ignore) - sum(self.ignore)

    def __iter__(self):
        """
        Continue reading from sources, yield results.

        Yields the results of :py:meth:`prepare_result`, which depend on the
        subclass this method is called from.
        """
        while True:
            if all(self.ignore):
                break

            for idx, (obj, file_handle, description, last_size, do_ignore) in \
                    enumerate(zip(self.file_objs, self.file_handles,
                                  self.descriptions, self.last_sizes,
                                  self.ignore)):
                if do_ignore:
                    continue

                # get new file size
                new_size = os.fstat(file_handle).st_size

                # compare to old size
                if new_size == last_size:
                    if not self.is_used_func(file_handle):
                        self.ignore[idx] = True
                else:
                    if new_size < last_size:  # happened at start of some tests
                        warn('{0} / {1} has become smaller ({2} --> {3})! '
                             'Maybe you are reading from a half-initialized '
                             'file?'.format(obj, description, last_size,
                                            new_size),
                             category=LogReadWarning)
                    try:
                        new_data = obj.read()
                    except OSError as ose:  # includes IOErrors
                        warn('I/O error reading from {0} / {1}: {2}'
                             .format(obj, description, ose),
                             category=LogReadWarning)
                        new_data = str(ose)
                    except UnicodeDecodeError as ude:
                        warn('unicode error reading from {0} / {1}: {2}'
                             .format(obj, description, ude),
                             category=LogReadWarning)
                        new_data = str(ude)

                    # post-processing; only reached when new data was read
                    for result in self.prepare_result(description, new_data,
                                                      idx):
                        yield result

                    # prepare next iteration
                    self.last_sizes[idx] = new_size

    def prepare_result(self, description, data, idx):
        """
        From raw new data, create some yield-able results.

        Intended for overwriting in subclasses.

        This method is called from `__iter__` whenever new data becomes
        available. It has to provide the results that are forwarded to the
        caller.

        This base implementation just yields its input, so new data is
        yielded from `__iter__` as-is.

        :param str description: description of the source of the data, one
                                of :py:data:`self.descriptions`
        :param str data: text data read from source
        :param idx: index of the data source
        :returns: nothing; yields 3-tuples `(description, data, idx)`, i.e.
                  the unmodified input
        """
        yield description, data, idx


#: characters to `rstrip()` from end of complete lines
LINE_SPLITTERS = '\n\r'


class LineReader(IterativeReader):
    """
    An :py:class:`IterativeReader` that returns new data line-wise.

    This means buffering partial line data.
    """

    def __init__(self, *args, **kwargs):
        """Create an :py:class:`IterativeReader` and buffers for sources."""
        super(LineReader, self).__init__(*args, **kwargs)
        self.line_buffers = ['' for _ in range(self.n_sources())]

    def prepare_result(self, description, new_data, idx):
        """
        Take raw new data and split it into lines.

        If the last line is not complete, buffer it until more data arrives.

        Args: see superclass method :py:meth:`IterativeReader.prepare_result`

        :returns: nothing; yields 3-tuples `(description, line, idx)` where
                  `description` and `idx` are the same as the args and `line`
                  is a complete line without trailing newline characters
        :rtype: [(str, str, int)]
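
        A sketch of the buffering behavior (`reader` stands for any
        :py:class:`LineReader` with a single source)::

            chunk = 'complete line' + os.linesep + 'incomplete'
            list(reader.prepare_result('log', chunk, 0))
            # --> [('log', 'complete line', 0)]; 'incomplete' is buffered
            list(reader.prepare_result('log', ' rest' + os.linesep, 0))
            # --> [('log', 'incomplete rest', 0)]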
326 """
327 all_data = self.line_buffers[idx] + new_data
328 self.line_buffers[idx] = ''
329 should_be_no_new_lines = False
330 for line in all_data.splitlines(True):
331 if line[-1] in LINE_SPLITTERS:
332 yield description, line.rstrip(LINE_SPLITTERS), idx
333 elif should_be_no_new_lines:
334 # self-check
335 raise ValueError('Programming error: something went wrong with '
336 'line splitting/buffering.')
337 else:
338 self.line_buffers[idx] = line
339 should_be_no_new_lines = True # (this should be the last)
340
341
class LogParser(LineReader):
    """
    Take lines from :py:class:`LineReader` and parse their contents.

    Requires a pattern for log lines; auto-detection is not implemented yet.

    Iteration yields :py:class:`re.Match` results or -- if matching failed --
    None. The latest unparsed line is available as `self.last_unparsed_line`.
    Usage recommendation::

        with open(log_file_name, 'rt') as file_handle:
            parser = log_read.LogParser(file_handle, pattern=my_pattern)
            for _, data, _ in parser:
                if data is None:
                    print(f'Failed to parse line {parser.last_unparsed_line}')
                    continue
                line_parts = data.groupdict()
                ...do stuff with line_parts...
    """

    def __init__(self, log_file, pattern=None):
        """
        Create a LogParser.

        :param log_file: source of log lines to parse
        :type log_file: see arg `sources` of constructor of
                        :py:class:`IterativeReader`
        :param pattern: regexp to parse log lines; None (default) to return
                        lines as they are
        :type pattern: str or None (default)
        """
        super(LogParser, self).__init__(log_file)

        self.pattern = pattern
        self.last_unparsed_line = ''

    def prepare_result(self, *args):
        """
        Try parsing lines.

        Args: see superclass method :py:meth:`IterativeReader.prepare_result`

        :returns: nothing; yields 3-tuples `(description, line, idx)` where
                  `description` and `idx` are same as input args and `line`
                  is a :py:class:`re.Match` if the line matched
                  :py:data:`self.pattern`, None if it did not match (the
                  unparsed line is then available in
                  `self.last_unparsed_line`), or the raw str if no pattern
                  was given
        :rtype: [(str, :py:class:`re.Match` or None or str, int)]
        """
        # let the superclass split data into lines
        for description, raw_line, idx in \
                super(LogParser, self).prepare_result(*args):
            if self.pattern is None:
                # no pattern given: return lines as they are (see __init__)
                yield description, raw_line, idx
                continue
            matches = re.match(self.pattern, raw_line)
            if matches:
                yield description, matches, idx
            else:
                self.last_unparsed_line = raw_line
                yield description, None, idx

    @classmethod
    @contextmanager
    def create_for(cls, filename, *args, **kwargs):
        """
        Open a single file and yield a LogParser for it; ensures the file is
        closed afterwards.

        This allows opening a file and creating a LogParser for it in one
        line::

            with LogParser.create_for('/var/log/messages', SYS_LOG_PATTERN) as parser:
                for _, matches, _ in parser:
                    try:
                        print(matches.groupdict())
                    except AttributeError:  # matches is None
                        print(f'UNPARSED: {parser.last_unparsed_line}')

        :param str filename: something that :py:func:`open` accepts
        :param args: forwarded to constructor
        :param kwargs: forwarded to constructor
        """
        with open(filename) as file_handle:
            yield cls(file_handle, *args, **kwargs)


################################################################################
# PATTERNS FOR FREQUENT LOG FILES
################################################################################

# pattern for squid proxy logs; group names are best guesses
PROXY_LOG_PATTERN = \
    r'\s*(?P<timestamp>\d+\.\d+\.\d+\s+\d+:\d+:\d+|\d+\.\d+)\s+(?P<size1>\d+)\s+' \
    + r'(?P<ip>\d+\.\d+\.\d+\.\d+)\s+(?P<status_text>[A-Z_]+)/(?P<status_code>\d+)\s+' \
    + r'(?P<size2>\d+)\s+(?P<command>\S+)\s+(?P<url>\S+)\s+(?P<user>\S+)\s+' \
    + r'(?P<action>[A-Z_]+)/(?P<origin>\S+)\s+(?P<mimetype>\S+)\s+(?P<unknown>.*)\s*'

# pattern for linux system logs (usually "messages" or "syslog", also "maillog")
SYS_LOG_PATTERN = \
    r'\s*(?P<timestamp>\w{3} +\d{1,2} \d{2}:\d{2}:\d{2}) (?P<hostname>\S+) ' \
    + r'(?P<procname>[^\[\]:]+)(?:\[(?P<pid>\d+)\])?: (?P<message>.*)'