203 lines
6.9 KiB
Python
Executable File
203 lines
6.9 KiB
Python
Executable File
#!/usr/bin/env python2
|
|
|
|
# Copyright 2017 The Chromium OS Authors. All rights reserved.
|
|
# Use of this source code is governed by a BSD-style license that can be
|
|
# found in the LICENSE file.
|
|
|
|
"""Script to upload metrics from apache access logs to Monarch."""
|
|
|
|
from __future__ import print_function
|
|
|
|
import argparse
|
|
import re
|
|
import sys
|
|
import urlparse
|
|
|
|
import common
|
|
|
|
from chromite.lib import ts_mon_config
|
|
from chromite.lib import metrics
|
|
|
|
from autotest_lib.site_utils.stats import log_daemon_common
|
|
# Not used, but needed for importing rpc_interface.
|
|
from autotest_lib.frontend import setup_django_environment
|
|
from autotest_lib.frontend.afe import rpc_interface
|
|
from autotest_lib.frontend.afe import moblab_rpc_interface
|
|
|
|
|
|
"""
|
|
The log format is set to:
|
|
%v:%p %h %l %u %t \"%r\" %>s %O \"%{Referer}i\" \"%{User-Agent}i\" %T
|
|
|
|
These are documented as follows:
|
|
(from https://httpd.apache.org/docs/current/mod/mod_log_config.html)
|
|
|
|
%h: Remote host
|
|
%l: Remote logname (from identd, if supplied)
|
|
%O: Bytes sent, including headers. May be zero in rare cases such as when a
|
|
request is aborted before a response is sent. You need to enable mod_logio
|
|
to use this.
|
|
%p: The canonical Port of the server serving the request
|
|
%r: First line of request
|
|
%s: Status. For requests that got internally redirected, this is
|
|
the status of the *original* request --- %...>s for the last.
|
|
%t: Time, in common log format time format (standard english format)
|
|
%T: The time taken to serve the request, in seconds.
|
|
%u: Remote user (from auth; may be bogus if return status (%s) is 401)
|
|
%v: The canonical ServerName of the server serving the request.
|
|
"""
|
|
|
|
# Lemma: a regex to match sections delimited be double-quotes ("), which
|
|
# possible contained escaped quotes (\").
|
|
# This works by matching non-quotes or the string r'\"' repeatedly; then it ends
|
|
# when finding a quote (") preceeded by a character which is not a backslash.
|
|
MATCH_UNTIL_QUOTE = r'([^"]|\\")*[^\\]'
|
|
|
|
ACCESS_MATCHER = re.compile(
|
|
r'^'
|
|
r'\S+ \S+ \S+ \S+ ' # Ignore %v:%p %h %l %u
|
|
r'\[[^]]+\] ' # Ignore %t
|
|
r'"(' # Begin %r
|
|
r'(?P<request_method>\S+) ' # e.g. POST
|
|
r'(?P<endpoint>\S+)' # e.g. /afe/server/noauth/rpc/
|
|
+ MATCH_UNTIL_QUOTE + # Ignore protocol (e.g. HTTP/1.1)
|
|
r'|-' # The request data might just be "-"
|
|
r')" ' # End %r
|
|
r'(?P<response_code>\d+) ' # %>s (e.g. 200)
|
|
r'(?P<bytes_sent>\d+)' # %O
|
|
r' "' + MATCH_UNTIL_QUOTE + '"' # Ignore Referer
|
|
r' "' + MATCH_UNTIL_QUOTE + '"' # Ignore User-Agent
|
|
r' ?(?P<response_seconds>\d+?)' # The server time in seconds
|
|
r'.*' # Allow adding extra stuff afterward.
|
|
)
|
|
|
|
ACCESS_TIME_METRIC = '/chromeos/autotest/http/server/response_seconds'
|
|
ACCESS_BYTES_METRIC = '/chromeos/autotest/http/server/response_bytes'
|
|
|
|
RPC_ACCESS_TIME_METRIC = '/chromeos/autotest/http/server/rpc/response_seconds'
|
|
RPC_ACCESS_BYTES_METRIC = '/chromeos/autotest/http/server/rpc/response_bytes'
|
|
RPC_METHOD_ARGUMENT = 'method'
|
|
|
|
|
|
# TODO(phobbs) use a memory-efficient structure to detect non-unique paths.
|
|
# We can't just include the endpoint because it will cause a cardinality
|
|
# explosion.
|
|
ALLOWLISTED_ENDPOINTS = frozenset((
|
|
'/',
|
|
'/afe/',
|
|
'/new_tko/server/rpc/',
|
|
'/afe/server/rpc/',
|
|
'/___rPc_sWiTcH___',
|
|
'*',
|
|
'/afe/server/noauth/rpc/',
|
|
))
|
|
|
|
|
|
# A bad actor could DOS Monarch by requesting millions of different RPC methods,
|
|
# each of which would create a different stream. Only allow an allowlist of
|
|
# methods to be recorded in Monarch.
|
|
ALLOWLISTED_METHODS = (frozenset(dir(rpc_interface))
|
|
| frozenset(dir(moblab_rpc_interface)))
|
|
|
|
|
|
def EmitRequestMetrics(m):
|
|
"""Emits metrics for each line in the access log.
|
|
|
|
@param m: A regex match object
|
|
"""
|
|
fields = {
|
|
'request_method': m.groupdict().get('request_method', ''),
|
|
'endpoint': SanitizeEndpoint(m.group('endpoint')),
|
|
'response_code': int(m.group('response_code')),
|
|
}
|
|
|
|
send_rpc_metrics = (
|
|
'?' in m.group('endpoint') and '/rpc' in m.group('endpoint'))
|
|
if send_rpc_metrics:
|
|
EmitRPCMetrics(m)
|
|
|
|
# Request seconds and bytes sent are both extremely high cardinality, so
|
|
# they must be the VAL of a metric, not a metric field.
|
|
if m.group('response_seconds'):
|
|
response_seconds = int(m.group('response_seconds'))
|
|
metrics.SecondsDistribution(ACCESS_TIME_METRIC).add(response_seconds,
|
|
fields=fields)
|
|
|
|
bytes_sent = int(m.group('bytes_sent'))
|
|
metrics.CumulativeDistribution(ACCESS_BYTES_METRIC).add(
|
|
bytes_sent, fields=fields)
|
|
|
|
|
|
def EmitRPCMetrics(m):
|
|
"""Emit a special metric including the method when the request was an RPC."""
|
|
fields = {
|
|
'request_method': m.groupdict().get('request_method', ''),
|
|
'rpc_method': ParseRPCMethod(m.group('endpoint')),
|
|
'response_code': int(m.group('response_code')),
|
|
}
|
|
|
|
if m.group('response_seconds'):
|
|
response_seconds = int(m.group('response_seconds'))
|
|
metrics.SecondsDistribution(RPC_ACCESS_TIME_METRIC).add(
|
|
response_seconds, fields=fields)
|
|
|
|
bytes_sent = int(m.group('bytes_sent'))
|
|
metrics.CumulativeDistribution(RPC_ACCESS_BYTES_METRIC).add(bytes_sent,
|
|
fields=fields)
|
|
|
|
|
|
def ParseRPCMethod(url):
|
|
"""Parses the RPC method from an RPC query string.
|
|
|
|
Args:
|
|
url: The URL requested.
|
|
"""
|
|
query = urlparse.urlparse(url).query
|
|
return urlparse.parse_qs(query)[RPC_METHOD_ARGUMENT][-1]
|
|
|
|
|
|
def SanitizeEndpoint(endpoint):
|
|
"""Returns empty string if endpoint is not allowlisted.
|
|
|
|
@param endpoint: The endpoint to sanitize.
|
|
"""
|
|
if endpoint in ALLOWLISTED_ENDPOINTS:
|
|
return endpoint
|
|
else:
|
|
return ''
|
|
|
|
|
|
MATCHERS = [
|
|
(ACCESS_MATCHER, EmitRequestMetrics),
|
|
]
|
|
|
|
|
|
def ParseArgs():
|
|
"""Parses the command line arguments."""
|
|
p = argparse.ArgumentParser(
|
|
description='Parses apache logs and emits metrics to Monarch')
|
|
p.add_argument('--output-logfile')
|
|
p.add_argument('--debug-metrics-file',
|
|
help='Output metrics to the given file instead of sending '
|
|
'them to production.')
|
|
return p.parse_args()
|
|
|
|
|
|
def Main():
|
|
"""Sets up logging and runs matchers against stdin."""
|
|
args = ParseArgs()
|
|
log_daemon_common.SetupLogging(args)
|
|
|
|
# Set up metrics sending and go.
|
|
ts_mon_args = {}
|
|
if args.debug_metrics_file:
|
|
ts_mon_args['debug_file'] = args.debug_metrics_file
|
|
|
|
with ts_mon_config.SetupTsMonGlobalState('apache_access_log_metrics',
|
|
**ts_mon_args):
|
|
log_daemon_common.RunMatchers(sys.stdin, MATCHERS)
|
|
|
|
|
|
if __name__ == '__main__':
|
|
Main()
|