1 # The software in this package is distributed under the GNU General
2 # Public License version 2 (with a special exception described below).
4 # A copy of GNU General Public License (GPL) is included in this distribution,
5 # in the file COPYING.GPL.
7 # As a special exception, if other files instantiate templates or use macros
8 # or inline functions from this file, or you compile this file and link it
9 # with other works to produce a work based on this file, this file
10 # does not by itself cause the resulting work to be covered
11 # by the GNU General Public License.
13 # However the source code for this file must still be made available
14 # in accordance with section (3) of the GNU General Public License.
16 # This exception does not invalidate any other reasons why a work based
17 # on this file might be covered by the GNU General Public License.
19 # Copyright (c) 2016-2018 Intra2net AG <info@intra2net.com>
"""
Iterative reading of log files, similar to shell command `tail -f`.

Copyright: Intra2net AG

Basic Functionality (class :py:class:`IterativeReader`):
Runs stat in a loop to find out whether file size has changed. Then reads the
new data and forwards that

.. todo:: Want to also use lsof to find out whether file/pipe/socket was
          closed, so can automatically return from read loop.

:py:class:`LineReader` takes output of :py:class:`IterativeReader` and returns
it line-wise as is normal for log files

:py:class:`LogParser` takes those lines and tries to parse them into fields
like date, time, module name, urgency and message.

.. todo:: auto-detect log line layout
"""
import logging
import os
import re

from contextlib import contextmanager
from warnings import warn

from .iter_helpers import zip_longest
from .type_helpers import is_str_or_byte, is_file_obj
class LogReadWarning(UserWarning):
    """Warning category used for non-fatal problems while reading logs."""
def true_func(_):
    """
    Replacement for :py:func:`check_is_used`. Returns `True` always.

    :param _: ignored (stands in for an OS-level file descriptor)
    :returns: True
    """
    return True
def false_func(_):
    """
    Replacement for :py:func:`check_is_used`. Returns `False` always.

    :param _: ignored (stands in for an OS-level file descriptor)
    :returns: False
    """
    return False
def check_is_used(file_handle):
    """
    Check whether file is being written to.

    To be implemented, e.g. using lsof.

    If beneficial could also easily supply python file object as arg.

    :param int file_handle: OS-level file descriptor
    :raises NotImplementedError: always, until an implementation exists
    """
    raise NotImplementedError(file_handle)
#: counter for unknown sources in :py:func:`create_description`
# module-level mutable state: incremented each time create_description()
# has to fall back to an auto-generated "unknownN" name
_create_description_unknown_counter = 0
def create_description(file_obj, file_handle):
    """
    Create some description for given file-like object / file descriptor.

    Tries the object's own ``name`` attribute first, then falls back to the
    OS-level descriptor number ("fileN"), and finally to a module-global
    counter ("unknownN") so descriptions stay distinguishable.

    :param file_obj: file-like object
    :param int file_handle: os-level file descriptor; may be None
    :returns: Short description for file-like object
    :rtype: str
    """
    global _create_description_unknown_counter

    # prefer the file object's own name, if it has a non-empty one
    try:
        desc = file_obj.name
        if desc:
            return desc
    except AttributeError:
        pass

    if file_handle is not None:
        return 'file{0}'.format(file_handle)

    # last resort: number the unknown sources
    _create_description_unknown_counter += 1
    return 'unknown{0}'.format(_create_description_unknown_counter)
#: error message for IterativeReader constructor
# formatted and raised whenever a plain file *name* is passed instead of an
# open file object / descriptor, since this module never closes files itself
_STR_ERR = 'not accepting file name "{0}" since cannot guarantee closing ' \
           'files --> use with open(file_name)!'
class IterativeReader(object):
    """
    Read continuously from a given file.

    Use `os.stat(file_obj.fileno()).st_size` as measure whether file has
    changed or not; Always reads as much data as possible.

    Does not care about closing files, so does not accept file names.

    This is the base for class :py:class:`LineReader` that just has to
    implement a different :py:meth:`prepare_result` method.
    """

    # NOTE(review): the extraction of this file dropped a number of source
    # lines inside this class (branch headers, a few statements, one method
    # header).  Every such gap is flagged with a "MISSING(review)" comment
    # below; restore the dropped lines from upstream before executing.

    def __init__(self, sources, descs=None, keep_watching=False):
        """
        Create a reader; do some basic checks on args.

        :param sources: iterable over sources. Sources can be opened file
                        objects or read-opened os-level file descriptors.
                        Calling code has to ensure they are closed properly, so
                        best use this within a "with open(file_name) as
                        file_handle:"-context. If sources is a single file
                        obj/descriptor, both source and desc will be converted
                        to lists of length 1
        :param descs: can be anything of same length as sources. If sources is
                      a single source, then descs is also converted to a list
                      of length 1. If not given (i.e. None), will use
                      :py:func:`create_description` to guess descriptions
        :param bool keep_watching: keep watching file that is not changing in
                                   size. Need to manually tell whether file
                                   is being written to or not since auto-detect
                                   is not implemented yet.
        :raises: OSError when testing fstat on source
        """
        # MISSING(review): guard condition (presumably "if not sources:")
        # dropped before this raise
        raise ValueError('need at least some source!')
        elif is_str_or_byte(sources):
            raise ValueError(_STR_ERR.format(sources))
        elif is_file_obj(sources) or isinstance(sources, int):
            # single source given: normalize to 1-element lists
            source_input = [sources, ]
            desc_input = [descs, ]
        # MISSING(review): "else:" header (and probably "desc_input = descs")
        # dropped here
            source_input = sources  # assume some iterable

        # now divide sources into os-level file descriptors for os.fstat,
        # and file objects for read()
        # MISSING(review): "self.file_objs = []" initialization dropped here
        self.file_handles = []  # file descriptOR, not descriptION
        for source in source_input:
            if is_file_obj(source):
                self.file_objs.append(source)
                self.file_handles.append(source.fileno())
            elif isinstance(source, int):
                # raw descriptor: wrap for read(), keep number for os.fstat
                self.file_objs.append(os.fdopen(source))
                self.file_handles.append(source)
            elif is_str_or_byte(source):
                # plain file names are refused -- cannot guarantee closing
                raise ValueError(_STR_ERR.format(source))
            # MISSING(review): "else:" header dropped here
            raise ValueError('source {0} is neither file obj nor file '
            # MISSING(review): message continuation + closing paren dropped

            # try to fstat the new file descriptor just for testing
            os.fstat(self.file_handles[-1])

        # guess descriptions if not given
        # MISSING(review): "if descs is None:"-style branch header dropped
        self.descriptions = [create_description(obj, file_handle)
                             # MISSING(review): "for obj, file_handle" dropped
                             in zip(self.file_objs, self.file_handles)]
        # MISSING(review): "else:" and a "try:" header dropped here
        if len(desc_input) != len(self.file_objs):
            raise ValueError('need same number of sources and '
            # MISSING(review): message continuation and "except TypeError:"
            # dropped here
            pass  # desc_input is generator or so

        self.descriptions = []
        for obj, file_handle, description in \
                zip_longest(self.file_objs, self.file_handles, desc_input):
            # MISSING(review): guard (presumably "if obj is None:") dropped
            raise ValueError('more descriptions than sources!')
            elif description is None:
                self.descriptions.append(create_description(obj,
                # MISSING(review): call continuation + "else:" dropped here
                self.descriptions.append(description)

        # per-source bookkeeping, parallel to self.file_objs:
        self.last_sizes = [0 for _ in self.file_objs]   # size at last read
        self.ignore = [False for _ in self.file_objs]   # stop-watching flags

        # MISSING(review): "if keep_watching:" header dropped here
        self.is_used_func = true_func
        # MISSING(review): "else:" header dropped here
        self.is_used_func = false_func
        # use some day: self.is_used_func = check_is_used

        for obj, file_handle, description in \
                zip(self.file_objs, self.file_handles, self.descriptions):
            logging.debug('log_read initialized with file descriptor {0}, '
                          'file obj {1}, description "{2}"'
                          .format(file_handle, obj, description))

    # MISSING(review): method header (presumably "def n_sources(self):")
    # dropped here
        """Return number of sources given to constructor."""
        return len(self.file_objs)

    def n_active_sources(self):
        """Return number of sources we are actually watching."""
        return len(self.ignore) - sum(self.ignore)

    # MISSING(review): method header (presumably "def __iter__(self):")
    # dropped here
        """
        Continue reading from sources, yield results.

        yields result of :py:meth:`prepare_result`, which depends on what
        subclass you called this function from.
        """
        # MISSING(review): outer loop header (e.g. "while True:") dropped
        for idx, (obj, file_handle, description, last_size, do_ignore) in \
                enumerate(zip(self.file_objs, self.file_handles,
                              self.descriptions, self.last_sizes,
        # MISSING(review): "self.ignore)):" continuation and the
        # "if do_ignore: continue" guard dropped here

                new_size = os.fstat(file_handle).st_size

                # compare to old size
                if new_size == last_size:
                    # unchanged; maybe stop watching this source for good
                    if not self.is_used_func(file_handle):
                        self.ignore[idx] = True
                # MISSING(review): "else:" branch header dropped here
                if new_size < last_size:  # happened at start of some tests
                    warn('{0} / {1} has become smaller ({2} --> {3})! '
                         .format(obj, description, last_size, new_size)
                         + 'Maybe you are reading from a half-initialized '
                         # MISSING(review): final string fragment dropped here
                         category=LogReadWarning)

                # MISSING(review): "try:" header dropped before the read
                new_data = obj.read()
                except OSError as ose:  # includes IOErrors
                    warn('io error reading from {0} / {1}: {2})'
                         .format(obj, description, ose),
                         category=LogReadWarning)
                # MISSING(review): fallback handling for new_data dropped
                except UnicodeDecodeError as ude:
                    warn('unicode error reading from {0} / {1}: {2}'
                         .format(obj, description, ude),
                         category=LogReadWarning)

                for result in self.prepare_result(description, new_data, idx):
                # MISSING(review): loop body (presumably "yield result")
                # dropped here

                # prepare next iteration
                self.last_sizes[idx] = new_size

    def prepare_result(self, description, data, idx):
        """
        From raw new data create some yield-able results.

        Intended for overwriting in subclasses.

        This function is called from `__iter__` for each new data that becomes
        available. It has to provide results which are forwarded to caller.

        This base implementation just yields its input, so new data is yielded
        from `__iter__` as-is.

        :param str description: Description of source of lines, one of
                                :py:data:`self.descriptions`
        :param str data: Text data read from source
        :param idx: Index of data source
        :returns: nothing but yields [(description, data, idx)], same as input
        """
        yield description, data, idx
#: characters to `rstrip()` from end of complete lines
# both LF and CR, so windows-style "\r\n" endings are removed entirely
LINE_SPLITTERS = '\n\r'
class LineReader(IterativeReader):
    """
    An :py:class:`IterativeReader` that returns new data line-wise.

    This means buffering partial line data.
    """

    def __init__(self, *args, **kwargs):
        """Create an :py:class:`IterativeReader` and buffers for sources."""
        super(LineReader, self).__init__(*args, **kwargs)
        # one string buffer per source for incomplete trailing lines
        self.line_buffers = ['' for _ in range(self.n_sources())]

    def prepare_result(self, description, new_data, idx):
        """
        Take raw new data and split it into lines.

        If line is not complete, then buffer it.

        Args: see super class method :py:meth:`IterativeReader.prepare_result`

        :returns: list of 3-tuples `(description, line, idx)` where
                  `description` and `idx` are same as args, and `line` is
                  without trailing newline characters
        :rtype: [(str, str, int)]
        """
        # prepend whatever was buffered from the previous chunk
        all_data = self.line_buffers[idx] + new_data
        self.line_buffers[idx] = ''
        should_be_no_new_lines = False
        for line in all_data.splitlines(True):
            if line[-1] in LINE_SPLITTERS:
                yield description, line.rstrip(LINE_SPLITTERS), idx
            elif should_be_no_new_lines:
                # splitlines() can only leave the *last* element without a
                # terminator, so a second incomplete line is impossible
                raise ValueError('Programming error: something went wrong with '
                                 'line splitting/buffering.')
            else:
                # incomplete line: keep it until more data arrives
                self.line_buffers[idx] = line
                should_be_no_new_lines = True  # (this should be the last)
class LogParser(LineReader):
    """
    Takes lines from :py:class:`LineReader` and parses their contents.

    Requires a pattern for log lines, auto-detection is not implemented yet.

    Iteration returns :py:class:`re.match` result or -- if matching failed --
    None. The latest unparsed line is available as `self.last_unparsed_line`.
    Usage recommendation::

        with open(log_file_name, 'rt') as file_handle:
            parser = log_read.LogParser(file_handle, pattern=my_pattern)
            for _, data, _ in parser:
                if data is None:
                    print(f'Failed to parse line {parser.last_unparsed_line}')
                    continue
                line_parts = data.groupdict()
                ...do stuff with line_parts...
    """

    def __init__(self, log_file, pattern=None):
        """
        Create a LogParser.

        :param str log_file: name of log file to parse (required!)
        :param pattern: regexp to split log lines; None (default) to return
                        lines unparsed
        :type pattern: str or None (default)
        """
        super(LogParser, self).__init__(log_file)

        self.pattern = pattern
        # raw text of the latest line that failed to match self.pattern
        self.last_unparsed_line = ''

    def prepare_result(self, *args):
        """
        Try to parse lines generated by the superclass.

        Args: see super class method :py:meth:`IterativeReader.prepare_result`

        :returns: 3-tuples `(description, line, idx)` where `description` and
                  `idx` are same as input args and `line` is a
                  :py:class:`re.Match` if line matched :py:data:`self.pattern`
                  or None if it did not (the raw text is then available as
                  :py:data:`self.last_unparsed_line`).
        :rtype: [(str, :py:class:`re.Match` OR None, int)]
        """
        # let super class split data into lines
        for description, raw_line, idx in \
                super(LogParser, self).prepare_result(*args):
            matches = re.match(self.pattern, raw_line)
            if matches:
                yield description, matches, idx
            else:
                # remember the line so callers can inspect parse failures
                self.last_unparsed_line = raw_line
                yield description, None, idx

    @classmethod
    @contextmanager
    def create_for(cls, filename, *args, **kwargs):
        """
        Open single file, yield LogParser. Ensures file is closed afterwards.

        This allows opening file and creation LogParser for it to one line::

            with LogParser.create_for('/var/log/messages',
                                      SYS_LOG_PATTERN) as parser:
                for _, matches, _ in parser:
                    if matches:
                        print(matches.groupdict())
                    else:
                        print(f'UNPARSED: {parser.last_unparsed_line}')

        :param str filename: something that :py:meth:`open` accepts
        :param args: Forwarded to constructor
        :param kwargs: Forwarded to constructor
        """
        with open(filename) as file_handle:
            yield cls(file_handle, *args, **kwargs)
421 ################################################################################
422 # PATTERNS FOR FREQUENT LOG FILES
423 ################################################################################
# pattern of squid proxy logs. group names are best guesses
# timestamp alternation accepts either a dotted date + clock time or a
# plain "seconds.millis" epoch value; all other fields are \s-separated
PROXY_LOG_PATTERN = \
    r'\s*(?P<timestamp>\d+\.\d+\.\d+\s+\d+:\d+:\d+|\d+\.\d+)\s+(?P<size1>\d+)\s+' \
    + r'(?P<ip>\d+\.\d+\.\d+\.\d+)\s+(?P<status_text>[A-Z_]+)/(?P<status_code>\d+)\s+' \
    + r'(?P<size2>\d+)\s+(?P<command>\S+)\s+(?P<url>\S+)\s+(?P<user>\S+)\s+' \
    + r'(?P<action>[A-Z_]+)/(?P<origin>\S+)\s+(?P<mimetype>\S+)\s+(?P<unknown>.*)\s*'
# pattern for linux system logs (usually "messages" or "syslog", also
# "maillog"); the "[pid]" part is optional since not all daemons log it
SYS_LOG_PATTERN = \
    r'\s*(?P<timestamp>\w{3} +\d{2} \d{2}:\d{2}:\d{2}) (?P<hostname>\S+) ' \
    + r'(?P<procname>[^\[\]:]+)(?:\[(?P<pid>\d+)\])?: (?P<message>.*)'