#!/usr/bin/env python3
"""Extract data captured by Apache mod_dumpio.

Scan the input looking for lines from mod_dumpio representing the input
received from the remote system. Capture the data. When the remote client ID
changes dump the captured data.

Note that we do absolutely everything using binary I/O and byte-arrays rather
than text I/O and strings. This is because we don't actually care what
encoding is used in the mod_dumpio output or the Apache HTTP error log. We're
potentially dealing with non-ASCII and non-UTF-8 data (e.g., binary data such
as a Zip archive).

Usage: error_log_data_extract < error_log > decoded_data

Note that you do not have to pipe the entire error log through this program.
You can preprocess the log by, for example, extracting with grep just the
lines associated with a given IP address and sending that subset of lines
through this program.

This program assumes you're running the corrected mod_dumpio module I posted
to Apache PR57045 (https://bz.apache.org/bugzilla/show_bug.cgi?id=57045). It
can be used on output from the broken mod_dumpio module but it won't,
obviously, correctly handle 0x00 or 0xFF bytes.
"""

import re
import sys

# Separator line written immediately before and after each decoded data block.
data_sep = (b'=#=' * (80 // 3)) + b'\n'

# A log line carrying mod_dumpio input data; captures the client ID and the
# remainder of the line following "dumpio_in".
dumpio_in_re1 = re.compile(rb'^.*? \[client (?P<client_id>.+?)\]\s'
                           rb'mod_dumpio:\s+dumpio_in\s+(?P<dumpio_in>.*)$')
# Bookkeeping records (byte counts, read errors) that carry no payload.
dumpio_in_re2 = re.compile(rb'^\(data-HEAP\):\s'
                           rb'(?:\d+ bytes|error reading data)$')
# A record carrying (escaped) payload data.
dumpio_in_re3 = re.compile(rb'^\(data-HEAP\):\s(?P<data_in>.*)$')

# Every escape sequence we know how to reverse. Order matters: an escaped
# backslash must be tried first, and the two-part 0xFF-prefixed sequences must
# be tried before the generic two-digit hex escape.
escape_re = re.compile(rb'(\\\\|\\x[fF][fF]\\x0[12]|'
                       rb'\\x[0-9a-fA-F][0-9a-fA-F]|\\[abfnrtv"])')

# Maps b'\\x00' .. b'\\xff' (lower-case hex digits) to the byte they denote.
hex_escape_to_bin = {'\\x{:02x}'.format(i).encode('ascii'): bytes([i])
                     for i in range(256)}

# Maps every lower-cased escape sequence to the original byte.
escape_to_char = {
    b'\\"': b'"',
    b'\\\\': b'\\',
    b'\\a': b'\a',
    b'\\b': b'\b',
    b'\\f': b'\f',
    b'\\n': b'\n',
    b'\\r': b'\r',
    b'\\t': b'\t',
    b'\\v': b'\v',
    b'\\xff\\x02': b'\xff',
    b'\\xff\\x01': b'\x00',
}
escape_to_char.update(hex_escape_to_bin)


def SubEscapes(match):
    """Convert text escaped by the mod_dumpio module to their original chars.

    Args:
        match: A match object produced by `escape_re`.

    Returns:
        The bytes the matched escape sequence stands for. Should the sequence
        somehow be unknown, the matched text is returned unchanged.
    """
    escaped = match.group(0)
    # The lookup table is keyed by lower-case sequences, while the regex also
    # accepts upper-case hex digits.
    return escape_to_char.get(escaped.lower(), escaped)


def DumpCapturedData(captured_data, out_fh):
    """Write the unescaped data we captured from the mod_dumpio module.

    The data is framed by `data_sep` lines. A newline is appended only when
    the data written does not already end with one, so the closing separator
    always starts on its own line.

    Args:
        captured_data: Sequence of bytes objects, in the order captured.
        out_fh: Binary file-like object providing `write()`.
    """
    if not captured_data:
        return

    out_fh.write(data_sep)
    ends_with_newline = False
    for data_in in captured_data:
        if data_in:
            # Indexing bytes yields an int, which never equals b'\n'; use
            # endswith() to inspect the final byte.
            ends_with_newline = data_in.endswith(b'\n')
            out_fh.write(data_in)
    if not ends_with_newline:
        out_fh.write(b'\n')
    out_fh.write(data_sep)


def ExtractDumpioData(in_fh, out_fh):
    """Filter an error log read from `in_fh`, writing the result to `out_fh`.

    Args:
        in_fh: Iterable of log lines as bytes (e.g. a binary file object).
        out_fh: Binary file-like object providing `write()`.
    """
    prev_client_id = None
    captured_data = []

    for line in in_fh:
        match = dumpio_in_re1.match(line)
        if match:
            client_id = match.group('client_id')
            # A change of client marks the end of the previous capture.
            if prev_client_id is not None and client_id != prev_client_id:
                DumpCapturedData(captured_data, out_fh)
                captured_data = []
            prev_client_id = client_id

            payload = match.group('dumpio_in')
            # Byte-count and read-error records carry no data of their own.
            if not dumpio_in_re2.match(payload):
                data_match = dumpio_in_re3.match(payload)
                if data_match:
                    captured_data.append(
                        escape_re.sub(SubEscapes,
                                      data_match.group('data_in')))
        # NOTE(review): the indentation of this statement was not recoverable;
        # echoing every input line is an assumption - confirm it matches the
        # intended output format.
        out_fh.write(line)

    DumpCapturedData(captured_data, out_fh)


def main():
    """Filter an Apache error log containing mod_dumpio data.

    Reads the log from stdin and writes the result to stdout, both in binary
    mode.
    """
    # closefd=False: leaving the `with` block flushes and closes our wrappers
    # without closing the process's real stdin/stdout descriptors.
    with open(sys.stdin.fileno(), 'rb', closefd=False) as in_fh, \
            open(sys.stdout.fileno(), 'wb', closefd=False) as out_fh:
        ExtractDumpioData(in_fh, out_fh)


if __name__ == '__main__':
    main()