Skip to content

feat(discover2) Add more aggregate field support #14321

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Aug 13, 2019
96 changes: 21 additions & 75 deletions src/sentry/api/bases/organization_events.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,19 @@
from __future__ import absolute_import

from copy import deepcopy
from rest_framework.exceptions import PermissionDenied
import six
from rest_framework.exceptions import PermissionDenied
from enum import Enum

from sentry import features
from sentry.api.bases import OrganizationEndpoint, OrganizationEventsError
from sentry.api.event_search import get_snuba_query_args, InvalidSearchQuery
from sentry.api.event_search import (
get_snuba_query_args,
resolve_field_list,
InvalidSearchQuery
)
from sentry.models.project import Project
from sentry.utils import snuba

# We support 4 "special fields" on the v2 events API which perform some
# additional calculations over aggregated event data
SPECIAL_FIELDS = {
'issue_title': {
'aggregations': [['anyHeavy', 'title', 'issue_title']],
},
'last_seen': {
'aggregations': [['max', 'timestamp', 'last_seen']],
},
'event_count': {
'aggregations': [['uniq', 'id', 'event_count']],
},
'user_count': {
'aggregations': [['uniq', 'user', 'user_count']],
},
'latest_event': {
'fields': [
['argMax', ['id', 'timestamp'], 'latest_event'],
],
},
}


class Direction(Enum):
NEXT = 0
Expand Down Expand Up @@ -67,47 +48,29 @@ def get_snuba_query_args(self, request, organization, params):
except InvalidSearchQuery as exc:
raise OrganizationEventsError(exc.message)

fields = request.GET.getlist('field')[:]
aggregations = []
groupby = request.GET.getlist('groupby')
special_fields = set()

if fields:
# If project.name is requested, get the project.id from Snuba so we
# can use this to look up the name in Sentry
if 'project.name' in fields:
fields.remove('project.name')
if 'project.id' not in fields:
fields.append('project.id')

for field in fields[:]:
if field in SPECIAL_FIELDS:
special_fields.add(field)
special_field = deepcopy(SPECIAL_FIELDS[field])
fields.remove(field)
fields.extend(special_field.get('fields', []))
aggregations.extend(special_field.get('aggregations', []))
groupby.extend(special_field.get('groupby', []))

snuba_args['selected_columns'] = fields

self._filter_unspecified_special_fields_in_conditions(snuba_args, special_fields)
if aggregations:
snuba_args['aggregations'] = aggregations

if groupby:
snuba_args['groupby'] = groupby

sort = request.GET.getlist('sort')
if sort and snuba.valid_orderby(sort, SPECIAL_FIELDS):
if sort:
snuba_args['orderby'] = sort

# Deprecated. `sort` should be used as it is supported by
# more endpoints.
orderby = request.GET.getlist('orderby')
if orderby and snuba.valid_orderby(orderby, SPECIAL_FIELDS) and 'orderby' not in snuba_args:
if orderby and 'orderby' not in snuba_args:
snuba_args['orderby'] = orderby

if request.GET.get('rollup'):
try:
snuba_args['rollup'] = int(request.GET.get('rollup'))
except ValueError:
raise OrganizationEventsError('rollup must be an integer.')

fields = request.GET.getlist('field')[:]
if fields:
try:
snuba_args.update(resolve_field_list(fields, snuba_args))
except InvalidSearchQuery as exc:
raise OrganizationEventsError(exc.message)

# TODO(lb): remove once boolean search is fully functional
has_boolean_op_flag = features.has(
'organizations:boolean-search',
Expand Down Expand Up @@ -147,9 +110,6 @@ def get_snuba_query_args_legacy(self, request, organization):
except InvalidSearchQuery as exc:
raise OrganizationEventsError(exc.message)

# Filter out special aggregates.
self._filter_unspecified_special_fields_in_conditions(snuba_args, set())
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was removed as we won't be supporting filtering on aggregate columns in the short term, and with fully dynamic aggregate columns we would need expand the query parser to accomodate aggregate filtering.

Copy link
Member

@lynnagara lynnagara Aug 8, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we go ahead and remove the function completely? sorry, just saw you did this


# TODO(lb): remove once boolean search is fully functional
has_boolean_op_flag = features.has(
'organizations:boolean-search',
Expand Down Expand Up @@ -212,17 +172,3 @@ def _get_next_or_prev_id(self, direction, request, organization, snuba_args, eve
return None

return six.text_type(result['data'][0]['event_id'])

def _filter_unspecified_special_fields_in_conditions(self, snuba_args, special_fields):
conditions = []
for condition in snuba_args['conditions']:
field = condition[0]
if (
not isinstance(field, (list, tuple))
and field in SPECIAL_FIELDS
and field not in special_fields
):
# skip over special field.
continue
conditions.append(condition)
snuba_args['conditions'] = conditions
35 changes: 16 additions & 19 deletions src/sentry/api/endpoints/organization_events.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
from sentry import features
from sentry.models.project import Project

ALLOWED_GROUPINGS = frozenset(('issue.id', 'project.id', 'transaction'))
logger = logging.getLogger(__name__)


Expand All @@ -31,16 +30,8 @@ def get(self, request, organization):
try:
params = self.get_filter_params(request, organization)
snuba_args = self.get_snuba_query_args(request, organization, params)
fields = snuba_args.get('selected_columns')
groupby = snuba_args.get('groupby', [])

if not fields and not groupby:
return Response({'detail': 'No fields or groupings provided'}, status=400)

if any(field for field in groupby if field not in ALLOWED_GROUPINGS):
message = ('Invalid groupby value requested. Allowed values are ' +
', '.join(ALLOWED_GROUPINGS))
return Response({'detail': message}, status=400)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was removed because the endpoint no longer allows the end user to define groupby. Instead groupings are implicit based on the fields requested.

if not snuba_args.get('selected_columns') and not snuba_args.get('aggregations'):
return Response({'detail': 'No fields provided'}, status=400)

except OrganizationEventsError as exc:
return Response({'detail': exc.message}, status=400)
Expand Down Expand Up @@ -130,16 +121,22 @@ def get_legacy(self, request, organization):
)

def handle_results(self, request, organization, project_ids, results):
if not results:
return results

first_row = results[0]
if not ('project.id' in first_row or 'projectid' in first_row):
return results

fields = request.GET.getlist('field')
projects = {p['id']: p['slug'] for p in Project.objects.filter(
organization=organization,
id__in=project_ids).values('id', 'slug')}

fields = request.GET.getlist('field')

if 'project.name' in fields:
for result in results:
result['project.name'] = projects[result['project.id']]
if 'project.id' not in fields:
del result['project.id']
for result in results:
for key in ('projectid', 'project.id'):
if key in result:
result['project.name'] = projects[result[key]]
if key not in fields:
del result[key]

return results
166 changes: 160 additions & 6 deletions src/sentry/api/event_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import re
from collections import namedtuple, defaultdict
from copy import deepcopy
from datetime import datetime

import six
Expand Down Expand Up @@ -142,9 +143,8 @@ def translate(pat):
'first_seen': 'first_seen',
'last_seen': 'last_seen',
'times_seen': 'times_seen',
# OrganizationEvents aggregations
'event_count': 'event_count',
'user_count': 'user_count',
# TODO(mark) figure out how to safelist aggregate functions/field aliases
# so they can be used in conditions
}, **SENTRY_SNUBA_MAP)
no_conversion = set(['project_id', 'start', 'end'])

Expand Down Expand Up @@ -215,9 +215,8 @@ class SearchVisitor(NodeVisitor):
'device.battery_level', 'device.charging', 'device.online',
'device.simulator', 'error.handled', 'issue.id', 'stack.colno',
'stack.in_app', 'stack.lineno', 'stack.stack_level',
# OrganizationEvents aggregations
'event_count', 'user_count',
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removing these as they aren't a thing anymore.


# TODO(mark) figure out how to safelist aggregate functions/field aliases
# so they can be used in conditions
])
date_keys = set([
'start', 'end', 'first_seen', 'last_seen', 'time', 'timestamp',
Expand Down Expand Up @@ -662,3 +661,158 @@ def get_snuba_query_args(query=None, params=None):
kwargs['has_boolean_terms'] = True
kwargs['conditions'].append(convert_search_boolean_to_snuba_query(term))
return kwargs


FIELD_ALIASES = {
'issue_title': {
'aggregations': [['anyHeavy', 'title', 'issue_title']],
},
'last_seen': {
'aggregations': [['max', 'timestamp', 'last_seen']],
},
'latest_event': {
'aggregations': [
# TODO(mark) This is a hack to work around jsonschema limitations
# in snuba.
['argMax(event_id, timestamp)', '', 'latest_event'],
],
},
'project': {
'fields': ['project.id'],
},
'user': {
'fields': ['user.id', 'user.name', 'user.username', 'user.email', 'user.ip'],
}
# TODO(mark) Add rpm alias.
}

VALID_AGGREGATES = {
'count_unique': {
'snuba_name': 'uniq',
'fields': '*',
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can imagine also wanting to do all fields excluding some small list

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@lynnagara im wondering, is there an aggregate function where we would want to exclude certain columns?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I personally would prefer to use safe-lists rather than ban-lists for the acceptable aggregate columns.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The situation where I was thinking we might need a ban list is where we want to support the aggregate on custom user-defined tag columns (can't use a safe list) but might need to exclude some internal stuff

},
'count': {
'snuba_name': 'count',
'fields': '*'
},
'avg': {
'snuba_name': 'avg',
'fields': ['duration'],
},
'min': {
'snuba_name': 'min',
'fields': ['timestamp', 'duration'],
},
'max': {
'snuba_name': 'max',
'fields': ['timestamp', 'duration'],
},
'sum': {
'snuba_name': 'sum',
'fields': ['duration'],
},
# This doesn't work yet, but is an illustration of how it could work
'p75': {
'snuba_name': 'quantileTiming(0.75)',
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we wanted to stop passing function calls like this explicitly, and try something like
[quantileTiming, [0.75]]. Not sure if this is something that would be directly passed to snuba though?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm fine with an array form as well. The quantile functions are a bit funny as they need to end up in clickhouse like quantileTiming(0.75)(duration) I'm not sure we've done that kind of function call in snuba before.

Copy link
Member

@wedamija wedamija Aug 9, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We might need snuba to translate that for us, not sure. I feel like we had something similar to this before, can't remember exactly what it was though. Could potentially do [quantileTiming, [0.75, 'duration']] and have snuba sort it out. I think other patterns we've done were to extract information from the name, but that's probably not a great practice (something like [quantileTiming_75, ['duration']])

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah the implementation of the quantile functions is still imcomplete, but we can figure that out once storage for transactions is sorted.

'fields': ['duration'],
},
}

AGGREGATE_PATTERN = re.compile(r'^(?P<function>[^\(]+)\((?P<column>[a-z\._]*)\)$')


def validate_aggregate(field, match):
function_name = match.group('function')
if function_name not in VALID_AGGREGATES:
raise InvalidSearchQuery("Unknown aggregate function '%s'" % field)

function_data = VALID_AGGREGATES[function_name]
column = match.group('column')
if column not in function_data['fields'] and function_data['fields'] != '*':
raise InvalidSearchQuery(
"Invalid column '%s' in aggregate function '%s'" % (column, function_name))


def validate_orderby(orderby, fields):
orderby = orderby if isinstance(orderby, (list, tuple)) else [orderby]
for column in orderby:
column = column.lstrip('-')
if column not in fields:
raise InvalidSearchQuery('Cannot order by an field that is not selected.')


def resolve_field_list(fields, snuba_args):
"""
Expand a list of fields based on aliases and aggregate functions.

Returns a dist of aggregations, selected_columns, and
groupby that can be merged into the result of get_snuba_query_args()
to build a more complete snuba query based on event search conventions.
"""
# If project.name is requested, get the project.id from Snuba so we
# can use this to look up the name in Sentry
if 'project.name' in fields:
fields.remove('project.name')
if 'project.id' not in fields:
fields.append('project.id')

aggregations = []
groupby = []
columns = []
for field in fields:
if not isinstance(field, six.string_types):
raise InvalidSearchQuery('Field names must be strings')

if field in FIELD_ALIASES:
special_field = deepcopy(FIELD_ALIASES[field])
columns.extend(special_field.get('fields', []))
aggregations.extend(special_field.get('aggregations', []))
continue

# Basic fields don't require additional validation. They could be tag
# names which we have no way of validating at this point.
match = AGGREGATE_PATTERN.search(field)
if not match:
columns.append(field)
continue

validate_aggregate(field, match)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It could be nice if all the validations were done in one place up front, but i'm guessing this is not as easy?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function (resolve_field_list) is called pretty early in the organization_events endpoint processing. I would like to use a DRF serializer for this work but other request validation steps get_filter_params() are embedded in endpoint logic.

aggregations.append([
VALID_AGGREGATES[match.group('function')]['snuba_name'],
match.group('column'),
u'{}_{}'.format(match.group('function'), match.group('column')).rstrip('_')
])

rollup = snuba_args.get('rollup')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What will rollup be used for?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rollup gets used by timeseries endpoints, which I'm planning to update soon.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

Copy link
Member

@dashed dashed Aug 9, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes.

if not rollup:
# Ensure fields we require to build a functioning interface
# are present. We don't add fields when using a rollup as the additional fields
# would be aggregated away. When there are aggregations
# we use argMax to get the latest event/projectid so we can create links.
# The `projectid` output name is not a typo, using `project_id` triggers
# generates invalid queries.
if not aggregations and 'id' not in columns:
columns.append('id')
columns.append('project.id')
if aggregations and 'latest_event' not in fields:
aggregations.extend(deepcopy(FIELD_ALIASES['latest_event']['aggregations']))
if aggregations and 'project.id' not in columns:
aggregations.append(['argMax(project_id, timestamp)', '', 'projectid'])

if rollup and columns and not aggregations:
raise InvalidSearchQuery('You cannot use rollup without an aggregate field.')

orderby = snuba_args.get('orderby')
if orderby:
validate_orderby(orderby, fields)

# If aggregations are present all columns
# need to be added to the group by so that the query is valid.
if aggregations:
groupby.extend(columns)

return {
'selected_columns': columns,
'aggregations': aggregations,
'groupby': groupby,
}
Loading