# Source code for hatchet.query.string_dialect

# Copyright 2017-2023 Lawrence Livermore National Security, LLC and other
# Hatchet Project Developers. See the top-level LICENSE file for details.
#
# SPDX-License-Identifier: MIT

from numbers import Real
import re
import sys
import pandas as pd  # noqa: F401
from pandas.api.types import is_numeric_dtype, is_string_dtype  # noqa: F401
import numpy as np  # noqa: F401
from textx import metamodel_from_str
from textx.exceptions import TextXError
import warnings

from .errors import InvalidQueryPath, InvalidQueryFilter, RedundantQueryFilterWarning
from .query import Query


# PEG grammar for the String-based dialect of Hatchet's query language.
# A query is a MATCH clause ('->'-chained node patterns, each with an
# optional quantifier and capture name) plus an optional WHERE clause of
# string/numeric/None/leaf predicates combined with AND/OR/NOT.
CYPHER_GRAMMAR = u"""
FullQuery: path_expr=MatchExpr(cond_expr=WhereExpr)?;
MatchExpr: 'MATCH' path=PathQuery;
PathQuery: '(' nodes=NodeExpr ')'('->' '(' nodes=NodeExpr ')')*;
NodeExpr: ((wcard=INT | wcard=STRING) ',' name=ID) | (wcard=INT | wcard=STRING) |  name=ID;
WhereExpr: 'WHERE' ConditionExpr;
ConditionExpr: conditions+=CompoundCond;
CompoundCond: UnaryCond | BinaryCond;
BinaryCond: AndCond | OrCond;
AndCond: 'AND' subcond=UnaryCond;
OrCond: 'OR' subcond=UnaryCond;
UnaryCond: NotCond | SingleCond;
NotCond: 'NOT' subcond=SingleCond;
SingleCond: StringCond | NumberCond | NoneCond | NotNoneCond | LeafCond | NotLeafCond;
NoneCond: name=ID '.' prop=STRING 'IS NONE';
NotNoneCond: name=ID '.' prop=STRING 'IS NOT NONE';
LeafCond: name=ID 'IS LEAF';
NotLeafCond: name=ID 'IS NOT LEAF';
StringCond: StringEq | StringStartsWith | StringEndsWith | StringContains | StringMatch;
StringEq: name=ID '.' prop=STRING '=' val=STRING;
StringStartsWith: name=ID '.' prop=STRING 'STARTS WITH' val=STRING;
StringEndsWith: name=ID '.' prop=STRING 'ENDS WITH' val=STRING;
StringContains: name=ID '.' prop=STRING 'CONTAINS' val=STRING;
StringMatch: name=ID '.' prop=STRING '=~' val=STRING;
NumberCond: NumEq | NumLt | NumGt | NumLte | NumGte | NumNan | NumNotNan | NumInf | NumNotInf;
NumEq: name=ID '.' prop=STRING '=' val=NUMBER;
NumLt: name=ID '.' prop=STRING '<' val=NUMBER;
NumGt: name=ID '.' prop=STRING '>' val=NUMBER;
NumLte: name=ID '.' prop=STRING '<=' val=NUMBER;
NumGte: name=ID '.' prop=STRING '>=' val=NUMBER;
NumNan: name=ID '.' prop=STRING 'IS NAN';
NumNotNan: name=ID '.' prop=STRING 'IS NOT NAN';
NumInf: name=ID '.' prop=STRING 'IS INF';
NumNotInf: name=ID '.' prop=STRING 'IS NOT INF';
"""

# TextX metamodel for the String-based dialect; compiled once at import
# time and shared by every StringQuery instance.
cypher_query_mm = metamodel_from_str(CYPHER_GRAMMAR)


def cname(obj):
    """Return the name of the grammar rule that *obj* represents.

    textX instantiates one class per grammar rule, so the class name of a
    parsed object identifies which rule matched it.
    """
    return type(obj).__name__
def filter_check_types(type_check, df_row, filt_lambda):
    """Run a String-dialect predicate on a node's data, guarding on type.

    Evaluates ``type_check`` (a Python boolean expression over ``df_row``)
    first so that the predicate never sees data of an unexpected type.

    Arguments:
        type_check (str): a boolean Python expression used to validate node
            data typing; an empty string means no check is required
        df_row (pandas.Series or pandas.DataFrame): the row (or
            sub-DataFrame) representing the data for the current node
        filt_lambda (Callable): the predicate that decides whether the node
            satisfies the filter

    Returns:
        (bool): the predicate's verdict; False when the data is missing
            (a KeyError was raised)

    Raises:
        InvalidQueryFilter: when the type check evaluates to False.
    """
    try:
        # NOTE: eval() here only ever sees expressions generated internally
        # by StringQuery, not arbitrary user-supplied strings.
        types_ok = type_check == "" or eval(type_check)
        if not types_ok:
            raise InvalidQueryFilter("Type mismatch in filter")
        return filt_lambda(df_row)
    except KeyError:
        # Missing column/metric: the node cannot satisfy the predicate.
        return False
class StringQuery(Query):
    """Class for representing and parsing queries using the String-based dialect."""

    def __init__(self, cypher_query, multi_index_mode="off"):
        """Builds a new StringQuery object representing a query in the String-based dialect.

        Arguments:
            cypher_query (str): a query in the String-based dialect
            multi_index_mode (str): one of "off", "all", or "any"; controls
                how predicates aggregate over rows of a multi-indexed
                DataFrame
        """
        if sys.version_info[0] == 2:
            super(StringQuery, self).__init__()
        else:
            super().__init__()
        assert multi_index_mode in ["off", "all", "any"]
        self.multi_index_mode = multi_index_mode
        model = None
        try:
            model = cypher_query_mm.model_from_str(cypher_query)
        except TextXError as e:
            # TODO Change to a "raise-from" expression when Python 2.7 support is dropped
            raise InvalidQueryPath(
                "Invalid String Dialect Query Detected. Parser Error Message: {}".format(
                    e.message
                )
            )
        # wcards: list of [quantifier, capture-name] pairs, one per MATCH node
        self.wcards = []
        # wcard_pos: maps a capture name to its position in wcards
        self.wcard_pos = {}
        self._parse_path(model.path_expr)
        # filters[i]: list of [logical-op, predicate-str, type-check-str]
        # entries for MATCH node i, collected from the WHERE clause
        self.filters = [[] for _ in self.wcards]
        self._parse_conditions(model.cond_expr)
        # lambda_filters[i]: source text of the final predicate lambda for
        # node i (evaluated with eval() in _build_query), or None
        self.lambda_filters = [None for _ in self.wcards]
        self._build_lambdas()
        self._build_query()

    def _build_query(self):
        """Builds the entire query using 'match' and 'rel' using the
        pre-parsed quantifiers and predicates.
        """
        for i in range(0, len(self.wcards)):
            wcard = self.wcards[i][0]
            # TODO Remove this when Python 2.7 support is dropped.
            if sys.version_info[0] == 2 and not isinstance(wcard, Real):
                wcard = wcard.encode("ascii", "ignore")
            filt_str = self.lambda_filters[i]
            if filt_str is None:
                # No predicate: match on quantifier alone.
                if i == 0:
                    self.match(quantifier=wcard)
                else:
                    self.rel(quantifier=wcard)
            else:
                # eval() turns the generated lambda source into a callable;
                # the source was produced internally by _build_lambdas.
                if i == 0:
                    self.match(quantifier=wcard, predicate=eval(filt_str))
                else:
                    self.rel(quantifier=wcard, predicate=eval(filt_str))

    def _build_lambdas(self):
        """Constructs the final predicate lambdas from the pre-parsed
        predicate information.
        """
        for i in range(0, len(self.wcards)):
            n = self.wcards[i]
            # Only named MATCH nodes can be referenced in WHERE predicates.
            if n[1] != "":
                bool_expr = ""
                type_check = ""
                for j, cond in enumerate(self.filters[i]):
                    # cond[0] is the logical connective ("and"/"or"/"not"),
                    # None for the first predicate of the chain.
                    if cond[0] is not None:
                        bool_expr += " {}".format(cond[0])
                    bool_expr += " {}".format(cond[1])
                    if cond[2] is not None:
                        # Type checks are always ANDed together.
                        if j == 0:
                            type_check += " {}".format(cond[2])
                        else:
                            type_check += " and {}".format(cond[2])
                bool_expr = "lambda df_row: {}".format(bool_expr)
                # Wrap the predicate so the type check runs first.
                bool_expr = (
                    'lambda df_row: filter_check_types("{}", df_row, {})'.format(
                        type_check, bool_expr
                    )
                )
                self.lambda_filters[i] = bool_expr

    def _parse_path(self, path_obj):
        """Parses the MATCH statement of a String-based query."""
        nodes = path_obj.path.nodes
        idx = len(self.wcards)
        for n in nodes:
            new_node = [n.wcard, n.name]
            # A missing/empty quantifier means "exactly one node" (".").
            if n.wcard is None or n.wcard == "" or n.wcard == 0:
                new_node[0] = "."
            self.wcards.append(new_node)
            if n.name != "":
                self.wcard_pos[n.name] = idx
            idx += 1

    def _parse_conditions(self, cond_expr):
        """Top level function for parsing the WHERE statement of a
        String-based query.
        """
        conditions = cond_expr.conditions
        for cond in conditions:
            converted_condition = None
            if self._is_unary_cond(cond):
                converted_condition = self._parse_unary_cond(cond)
            elif self._is_binary_cond(cond):
                converted_condition = self._parse_binary_cond(cond)
            else:
                raise RuntimeError("Bad Condition")
            # converted_condition is [logical-op, node-name, predicate,
            # type-check]; store everything but the name, bucketed by the
            # MATCH-node position the name refers to.
            self.filters[self.wcard_pos[converted_condition[1]]].append(
                [converted_condition[0], converted_condition[2], converted_condition[3]]
            )
        # Drop the leading logical connective from each node's first
        # predicate (there is nothing before it to connect to).
        for i in range(0, len(self.filters)):
            if len(self.filters[i]) > 0:
                if self.filters[i][0][0] != "not":
                    self.filters[i][0][0] = None

    def _is_unary_cond(self, obj):
        """Detect whether a predicate is unary or not."""
        if (
            cname(obj) == "NotCond"
            or self._is_str_cond(obj)
            or self._is_num_cond(obj)
            or cname(obj) in ["NoneCond", "NotNoneCond", "LeafCond", "NotLeafCond"]
        ):
            return True
        return False

    def _is_binary_cond(self, obj):
        """Detect whether a predicate is binary or not."""
        if cname(obj) in ["AndCond", "OrCond"]:
            return True
        return False

    def _parse_binary_cond(self, obj):
        """Top level function for parsing binary predicates."""
        if cname(obj) == "AndCond":
            return self._parse_and_cond(obj)
        if cname(obj) == "OrCond":
            return self._parse_or_cond(obj)
        raise RuntimeError("Bad Binary Condition")

    def _parse_or_cond(self, obj):
        """Top level function for parsing predicates combined with logical OR."""
        converted_subcond = self._parse_unary_cond(obj.subcond)
        converted_subcond[0] = "or"
        return converted_subcond

    def _parse_and_cond(self, obj):
        """Top level function for parsing predicates combined with logical AND."""
        converted_subcond = self._parse_unary_cond(obj.subcond)
        converted_subcond[0] = "and"
        return converted_subcond

    def _parse_unary_cond(self, obj):
        """Top level function for parsing unary predicates."""
        if cname(obj) == "NotCond":
            return self._parse_not_cond(obj)
        return self._parse_single_cond(obj)

    def _parse_not_cond(self, obj):
        """Parse predicates containing the logical NOT operator."""
        converted_subcond = self._parse_single_cond(obj.subcond)
        # Negate the predicate expression itself (index 2), not the
        # logical connective.
        converted_subcond[2] = "not {}".format(converted_subcond[2])
        return converted_subcond

    def _run_method_based_on_multi_idx_mode(self, method_name, obj):
        """Dispatch to the plain or '_multi_idx' variant of a parse method,
        depending on self.multi_index_mode.
        """
        real_method_name = method_name
        if self.multi_index_mode != "off":
            real_method_name = method_name + "_multi_idx"
        method = eval("StringQuery.{}".format(real_method_name))
        return method(self, obj)

    def _parse_single_cond(self, obj):
        """Top level function for parsing individual numeric or string predicates."""
        if self._is_str_cond(obj):
            return self._parse_str(obj)
        if self._is_num_cond(obj):
            return self._parse_num(obj)
        if cname(obj) == "NoneCond":
            return self._run_method_based_on_multi_idx_mode("_parse_none", obj)
        if cname(obj) == "NotNoneCond":
            return self._run_method_based_on_multi_idx_mode("_parse_not_none", obj)
        if cname(obj) == "LeafCond":
            return self._run_method_based_on_multi_idx_mode("_parse_leaf", obj)
        if cname(obj) == "NotLeafCond":
            return self._run_method_based_on_multi_idx_mode("_parse_not_leaf", obj)
        raise RuntimeError("Bad Single Condition")

    def _parse_none(self, obj):
        """Parses 'property IS NONE'."""
        # "depth" and "node_id" are pseudo-properties read off the Node
        # object in the DataFrame index rather than from a column.
        if obj.prop == "depth":
            return [
                None,
                obj.name,
                "df_row.name._depth is None",
                None,
            ]
        if obj.prop == "node_id":
            return [
                None,
                obj.name,
                "df_row.name._hatchet_nid is None",
                None,
            ]
        return [
            None,
            obj.name,
            'df_row["{}"] is None'.format(obj.prop),
            None,
        ]

    def _add_aggregation_call_to_multi_idx_predicate(self, predicate):
        """Append the pandas aggregation (.any()/.all()) matching
        self.multi_index_mode to a predicate expression string.
        """
        if self.multi_index_mode == "any":
            return predicate + ".any()"
        return predicate + ".all()"

    def _parse_none_multi_idx(self, obj):
        """Multi-index variant of _parse_none."""
        if obj.prop == "depth":
            return [
                None,
                obj.name,
                "df_row.index.get_level_values('node')[0]._depth is None",
                None,
            ]
        if obj.prop == "node_id":
            return [
                None,
                obj.name,
                "df_row.index.get_level_values('node')[0]._hatchet_nid is None",
                None,
            ]
        if self.multi_index_mode == "any":
            return [
                None,
                obj.name,
                "df_row['{}'].apply(lambda elem: elem is None).any()".format(obj.prop),
                None,
            ]
        # if self.multi_index_mode == "all":
        return [
            None,
            obj.name,
            self._add_aggregation_call_to_multi_idx_predicate(
                "df_row['{}'].apply(lambda elem: elem is None)".format(obj.prop)
            ),
            None,
        ]

    def _parse_not_none(self, obj):
        """Parses 'property IS NOT NONE'."""
        if obj.prop == "depth":
            return [
                None,
                obj.name,
                "df_row.name._depth is not None",
                None,
            ]
        if obj.prop == "node_id":
            return [
                None,
                obj.name,
                "df_row.name._hatchet_nid is not None",
                None,
            ]
        return [
            None,
            obj.name,
            'df_row["{}"] is not None'.format(obj.prop),
            None,
        ]

    def _parse_not_none_multi_idx(self, obj):
        """Multi-index variant of _parse_not_none."""
        if obj.prop == "depth":
            return [
                None,
                obj.name,
                "df_row.index.get_level_values('node')[0]._depth is not None",
                None,
            ]
        if obj.prop == "node_id":
            return [
                None,
                obj.name,
                "df_row.index.get_level_values('node')[0]._hatchet_nid is not None",
                None,
            ]
        return [
            None,
            obj.name,
            self._add_aggregation_call_to_multi_idx_predicate(
                "df_row['{}'].apply(lambda elem: elem is not None)".format(obj.prop)
            ),
            None,
        ]

    def _parse_leaf(self, obj):
        """Parses 'node IS LEAF'."""
        return [
            None,
            obj.name,
            "len(df_row.name.children) == 0",
            None,
        ]

    def _parse_leaf_multi_idx(self, obj):
        """Multi-index variant of _parse_leaf."""
        return [
            None,
            obj.name,
            "len(df_row.index.get_level_values('node')[0].children) == 0",
            None,
        ]

    def _parse_not_leaf(self, obj):
        """Parses 'node IS NOT LEAF'."""
        return [
            None,
            obj.name,
            "len(df_row.name.children) > 0",
            None,
        ]

    def _parse_not_leaf_multi_idx(self, obj):
        """Multi-index variant of _parse_not_leaf."""
        return [
            None,
            obj.name,
            "len(df_row.index.get_level_values('node')[0].children) > 0",
            None,
        ]

    def _is_str_cond(self, obj):
        """Determines whether a predicate is for string data."""
        if cname(obj) in [
            "StringEq",
            "StringStartsWith",
            "StringEndsWith",
            "StringContains",
            "StringMatch",
        ]:
            return True
        return False

    def _is_num_cond(self, obj):
        """Determines whether a predicate is for numeric data."""
        if cname(obj) in [
            "NumEq",
            "NumLt",
            "NumGt",
            "NumLte",
            "NumGte",
            "NumNan",
            "NumNotNan",
            "NumInf",
            "NumNotInf",
        ]:
            return True
        return False

    def _parse_str(self, obj):
        """Function that redirects processing of string predicates
        to the correct function.
        """
        if cname(obj) == "StringEq":
            return self._run_method_based_on_multi_idx_mode("_parse_str_eq", obj)
        if cname(obj) == "StringStartsWith":
            return self._run_method_based_on_multi_idx_mode(
                "_parse_str_starts_with", obj
            )
        if cname(obj) == "StringEndsWith":
            return self._run_method_based_on_multi_idx_mode("_parse_str_ends_with", obj)
        if cname(obj) == "StringContains":
            return self._run_method_based_on_multi_idx_mode("_parse_str_contains", obj)
        if cname(obj) == "StringMatch":
            return self._run_method_based_on_multi_idx_mode("_parse_str_match", obj)
        raise RuntimeError("Bad String Op Class")

    def _parse_str_eq(self, obj):
        """Processes string equivalence predicates."""
        return [
            None,
            obj.name,
            'df_row["{}"] == "{}"'.format(obj.prop, obj.val),
            "isinstance(df_row['{}'], str)".format(obj.prop),
        ]

    def _parse_str_eq_multi_idx(self, obj):
        """Multi-index variant of _parse_str_eq."""
        return [
            None,
            obj.name,
            self._add_aggregation_call_to_multi_idx_predicate(
                'df_row["{}"].apply(lambda elem: elem == "{}")'.format(
                    obj.prop, obj.val
                )
            ),
            "is_string_dtype(df_row['{}'])".format(obj.prop),
        ]

    def _parse_str_starts_with(self, obj):
        """Processes string 'startswith' predicates."""
        return [
            None,
            obj.name,
            'df_row["{}"].startswith("{}")'.format(obj.prop, obj.val),
            "isinstance(df_row['{}'], str)".format(obj.prop),
        ]

    def _parse_str_starts_with_multi_idx(self, obj):
        """Multi-index variant of _parse_str_starts_with."""
        return [
            None,
            obj.name,
            self._add_aggregation_call_to_multi_idx_predicate(
                'df_row["{}"].apply(lambda elem: elem.startswith("{}"))'.format(
                    obj.prop, obj.val
                )
            ),
            "is_string_dtype(df_row['{}'])".format(obj.prop),
        ]

    def _parse_str_ends_with(self, obj):
        """Processes string 'endswith' predicates."""
        return [
            None,
            obj.name,
            'df_row["{}"].endswith("{}")'.format(obj.prop, obj.val),
            "isinstance(df_row['{}'], str)".format(obj.prop),
        ]

    def _parse_str_ends_with_multi_idx(self, obj):
        """Multi-index variant of _parse_str_ends_with."""
        return [
            None,
            obj.name,
            self._add_aggregation_call_to_multi_idx_predicate(
                'df_row["{}"].apply(lambda elem: elem.endswith("{}"))'.format(
                    obj.prop, obj.val
                )
            ),
            "is_string_dtype(df_row['{}'])".format(obj.prop),
        ]

    def _parse_str_contains(self, obj):
        """Processes string 'contains' predicates."""
        return [
            None,
            obj.name,
            '"{}" in df_row["{}"]'.format(obj.val, obj.prop),
            "isinstance(df_row['{}'], str)".format(obj.prop),
        ]

    def _parse_str_contains_multi_idx(self, obj):
        """Multi-index variant of _parse_str_contains."""
        return [
            None,
            obj.name,
            self._add_aggregation_call_to_multi_idx_predicate(
                'df_row["{}"].apply(lambda elem: "{}" in elem)'.format(
                    obj.prop, obj.val
                )
            ),
            "is_string_dtype(df_row['{}'])".format(obj.prop),
        ]

    def _parse_str_match(self, obj):
        """Processes string regex match predicates."""
        return [
            None,
            obj.name,
            're.match("{}", df_row["{}"]) is not None'.format(obj.val, obj.prop),
            "isinstance(df_row['{}'], str)".format(obj.prop),
        ]

    def _parse_str_match_multi_idx(self, obj):
        """Multi-index variant of _parse_str_match."""
        return [
            None,
            obj.name,
            self._add_aggregation_call_to_multi_idx_predicate(
                'df_row["{}"].apply(lambda elem: re.match("{}", elem) is not None)'.format(
                    obj.prop, obj.val
                )
            ),
            "is_string_dtype(df_row['{}'])".format(obj.prop),
        ]

    def _parse_num(self, obj):
        """Function that redirects processing of numeric predicates
        to the correct function.
        """
        if cname(obj) == "NumEq":
            return self._run_method_based_on_multi_idx_mode("_parse_num_eq", obj)
        if cname(obj) == "NumLt":
            return self._run_method_based_on_multi_idx_mode("_parse_num_lt", obj)
        if cname(obj) == "NumGt":
            return self._run_method_based_on_multi_idx_mode("_parse_num_gt", obj)
        if cname(obj) == "NumLte":
            return self._run_method_based_on_multi_idx_mode("_parse_num_lte", obj)
        if cname(obj) == "NumGte":
            return self._run_method_based_on_multi_idx_mode("_parse_num_gte", obj)
        if cname(obj) == "NumNan":
            return self._run_method_based_on_multi_idx_mode("_parse_num_nan", obj)
        if cname(obj) == "NumNotNan":
            return self._run_method_based_on_multi_idx_mode("_parse_num_not_nan", obj)
        if cname(obj) == "NumInf":
            return self._run_method_based_on_multi_idx_mode("_parse_num_inf", obj)
        if cname(obj) == "NumNotInf":
            return self._run_method_based_on_multi_idx_mode("_parse_num_not_inf", obj)
        raise RuntimeError("Bad Number Op Class")

    def _parse_num_eq(self, obj):
        """Processes numeric equivalence predicates."""
        if obj.prop == "depth":
            # 'depth = -1' is special-cased to mean "is a leaf".
            if obj.val == -1:
                return [
                    None,
                    obj.name,
                    "len(df_row.name.children) == 0",
                    None,
                ]
            elif obj.val < 0:
                warnings.warn(
                    """
                    The 'depth' property of a Node is strictly non-negative.
                    This condition will always be false.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "False",
                    "isinstance(df_row.name._depth, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.name._depth == {}".format(obj.val),
                "isinstance(df_row.name._depth, Real)",
            ]
        if obj.prop == "node_id":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'node_id' property of a Node is strictly non-negative.
                    This condition will always be false.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "False",
                    "isinstance(df_row.name._hatchet_nid, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.name._hatchet_nid == {}".format(obj.val),
                "isinstance(df_row.name._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            'df_row["{}"] == {}'.format(obj.prop, obj.val),
            "isinstance(df_row['{}'], Real)".format(obj.prop),
        ]

    def _parse_num_eq_multi_idx(self, obj):
        """Multi-index variant of _parse_num_eq."""
        if obj.prop == "depth":
            if obj.val == -1:
                return [
                    None,
                    obj.name,
                    "len(df_row.index.get_level_values('node')[0].children) == 0",
                    None,
                ]
            elif obj.val < 0:
                warnings.warn(
                    """
                    The 'depth' property of a Node is strictly non-negative.
                    This condition will always be false.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "False",
                    "isinstance(df_row.index.get_level_values('node')[0]._depth, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.index.get_level_values('node')[0]._depth == {}".format(obj.val),
                "isinstance(df_row.index.get_level_values('node')[0]._depth, Real)",
            ]
        if obj.prop == "node_id":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'node_id' property of a Node is strictly non-negative.
                    This condition will always be false.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "False",
                    "isinstance(df_row.index.get_level_values('node')[0]._hatchet_nid, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.index.get_level_values('node')[0]._hatchet_nid == {}".format(
                    obj.val
                ),
                "isinstance(df_row.index.get_level_values('node')[0]._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            self._add_aggregation_call_to_multi_idx_predicate(
                'df_row["{}"].apply(lambda elem: elem == {})'.format(obj.prop, obj.val)
            ),
            "is_numeric_dtype(df_row['{}'])".format(obj.prop),
        ]

    def _parse_num_lt(self, obj):
        """Processes numeric less-than predicates."""
        if obj.prop == "depth":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'depth' property of a Node is strictly non-negative.
                    This condition will always be false.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "False",
                    "isinstance(df_row.name._depth, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.name._depth < {}".format(obj.val),
                "isinstance(df_row.name._depth, Real)",
            ]
        if obj.prop == "node_id":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'node_id' property of a Node is strictly non-negative.
                    This condition will always be false.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "False",
                    "isinstance(df_row.name._hatchet_nid, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.name._hatchet_nid < {}".format(obj.val),
                "isinstance(df_row.name._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            'df_row["{}"] < {}'.format(obj.prop, obj.val),
            "isinstance(df_row['{}'], Real)".format(obj.prop),
        ]

    def _parse_num_lt_multi_idx(self, obj):
        """Multi-index variant of _parse_num_lt."""
        if obj.prop == "depth":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'depth' property of a Node is strictly non-negative.
                    This condition will always be false.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "False",
                    "isinstance(df_row.index.get_level_values('node')[0]._depth, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.index.get_level_values('node')[0]._depth < {}".format(obj.val),
                "isinstance(df_row.index.get_level_values('node')[0]._depth, Real)",
            ]
        if obj.prop == "node_id":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'node_id' property of a Node is strictly non-negative.
                    This condition will always be false.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "False",
                    "isinstance(df_row.index.get_level_values('node')[0]._hatchet_nid, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.index.get_level_values('node')[0]._hatchet_nid < {}".format(
                    obj.val
                ),
                "isinstance(df_row.index.get_level_values('node')[0]._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            self._add_aggregation_call_to_multi_idx_predicate(
                'df_row["{}"].apply(lambda elem: elem < {})'.format(obj.prop, obj.val)
            ),
            "is_numeric_dtype(df_row['{}'])".format(obj.prop),
        ]

    def _parse_num_gt(self, obj):
        """Processes numeric greater-than predicates."""
        if obj.prop == "depth":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'depth' property of a Node is strictly non-negative.
                    This condition will always be true.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "True",
                    "isinstance(df_row.name._depth, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.name._depth > {}".format(obj.val),
                "isinstance(df_row.name._depth, Real)",
            ]
        if obj.prop == "node_id":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'node_id' property of a Node is strictly non-negative.
                    This condition will always be true.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "True",
                    "isinstance(df_row.name._hatchet_nid, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.name._hatchet_nid > {}".format(obj.val),
                "isinstance(df_row.name._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            'df_row["{}"] > {}'.format(obj.prop, obj.val),
            "isinstance(df_row['{}'], Real)".format(obj.prop),
        ]

    def _parse_num_gt_multi_idx(self, obj):
        """Multi-index variant of _parse_num_gt."""
        if obj.prop == "depth":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'depth' property of a Node is strictly non-negative.
                    This condition will always be true.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "True",
                    "isinstance(df_row.index.get_level_values('node')[0]._depth, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.index.get_level_values('node')[0]._depth > {}".format(obj.val),
                "isinstance(df_row.index.get_level_values('node')[0]._depth, Real)",
            ]
        if obj.prop == "node_id":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'node_id' property of a Node is strictly non-negative.
                    This condition will always be true.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "True",
                    "isinstance(df_row.index.get_level_values('node')[0]._hatchet_nid, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.index.get_level_values('node')[0]._hatchet_nid > {}".format(
                    obj.val
                ),
                "isinstance(df_row.index.get_level_values('node')[0]._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            self._add_aggregation_call_to_multi_idx_predicate(
                'df_row["{}"].apply(lambda elem: elem > {})'.format(obj.prop, obj.val)
            ),
            "is_numeric_dtype(df_row['{}'])".format(obj.prop),
        ]

    def _parse_num_lte(self, obj):
        """Processes numeric less-than-or-equal-to predicates."""
        if obj.prop == "depth":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'depth' property of a Node is strictly non-negative.
                    This condition will always be false.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "False",
                    "isinstance(df_row.name._depth, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.name._depth <= {}".format(obj.val),
                "isinstance(df_row.name._depth, Real)",
            ]
        if obj.prop == "node_id":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'node_id' property of a Node is strictly non-negative.
                    This condition will always be false.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "False",
                    "isinstance(df_row.name._hatchet_nid, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.name._hatchet_nid <= {}".format(obj.val),
                "isinstance(df_row.name._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            'df_row["{}"] <= {}'.format(obj.prop, obj.val),
            "isinstance(df_row['{}'], Real)".format(obj.prop),
        ]

    def _parse_num_lte_multi_idx(self, obj):
        """Multi-index variant of _parse_num_lte."""
        if obj.prop == "depth":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'depth' property of a Node is strictly non-negative.
                    This condition will always be false.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "False",
                    "isinstance(df_row.index.get_level_values('node')[0]._depth, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.index.get_level_values('node')[0]._depth <= {}".format(obj.val),
                "isinstance(df_row.index.get_level_values('node')[0]._depth, Real)",
            ]
        if obj.prop == "node_id":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'node_id' property of a Node is strictly non-negative.
                    This condition will always be false.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "False",
                    "isinstance(df_row.index.get_level_values('node')[0]._hatchet_nid, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.index.get_level_values('node')[0]._hatchet_nid <= {}".format(
                    obj.val
                ),
                "isinstance(df_row.index.get_level_values('node')[0]._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            self._add_aggregation_call_to_multi_idx_predicate(
                'df_row["{}"].apply(lambda elem: elem <= {})'.format(obj.prop, obj.val)
            ),
            "is_numeric_dtype(df_row['{}'])".format(obj.prop),
        ]

    def _parse_num_gte(self, obj):
        """Processes numeric greater-than-or-equal-to predicates."""
        if obj.prop == "depth":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'depth' property of a Node is strictly non-negative.
                    This condition will always be true.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "True",
                    "isinstance(df_row.name._depth, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.name._depth >= {}".format(obj.val),
                "isinstance(df_row.name._depth, Real)",
            ]
        if obj.prop == "node_id":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'node_id' property of a Node is strictly non-negative.
                    This condition will always be true.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "True",
                    "isinstance(df_row.name._hatchet_nid, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.name._hatchet_nid >= {}".format(obj.val),
                "isinstance(df_row.name._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            'df_row["{}"] >= {}'.format(obj.prop, obj.val),
            "isinstance(df_row['{}'], Real)".format(obj.prop),
        ]

    def _parse_num_gte_multi_idx(self, obj):
        """Multi-index variant of _parse_num_gte."""
        if obj.prop == "depth":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'depth' property of a Node is strictly non-negative.
                    This condition will always be true.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "True",
                    "isinstance(df_row.index.get_level_values('node')[0]._depth, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.index.get_level_values('node')[0]._depth >= {}".format(obj.val),
                "isinstance(df_row.index.get_level_values('node')[0]._depth, Real)",
            ]
        if obj.prop == "node_id":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'node_id' property of a Node is strictly non-negative.
                    This condition will always be true.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "True",
                    "isinstance(df_row.index.get_level_values('node')[0]._hatchet_nid, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.index.get_level_values('node')[0]._hatchet_nid >= {}".format(
                    obj.val
                ),
                "isinstance(df_row.index.get_level_values('node')[0]._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            self._add_aggregation_call_to_multi_idx_predicate(
                'df_row["{}"].apply(lambda elem: elem >= {})'.format(obj.prop, obj.val)
            ),
            "is_numeric_dtype(df_row['{}'])".format(obj.prop),
        ]

    def _parse_num_nan(self, obj):
        """Processes predicates that check for NaN."""
        if obj.prop == "depth":
            return [
                None,
                obj.name,
                "pd.isna(df_row.name._depth)",
                "isinstance(df_row.name._depth, Real)",
            ]
        if obj.prop == "node_id":
            return [
                None,
                obj.name,
                "pd.isna(df_row.name._hatchet_nid)",
                "isinstance(df_row.name._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            'pd.isna(df_row["{}"])'.format(obj.prop),
            "isinstance(df_row['{}'], Real)".format(obj.prop),
        ]

    def _parse_num_nan_multi_idx(self, obj):
        """Multi-index variant of _parse_num_nan."""
        if obj.prop == "depth":
            return [
                None,
                obj.name,
                "pd.isna(df_row.index.get_level_values('node')[0]._depth)",
                "isinstance(df_row.index.get_level_values('node')[0]._depth, Real)",
            ]
        if obj.prop == "node_id":
            return [
                None,
                obj.name,
                "pd.isna(df_row.index.get_level_values('node')[0]._hatchet_nid)",
                "isinstance(df_row.index.get_level_values('node')[0]._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            self._add_aggregation_call_to_multi_idx_predicate(
                'pd.isna(df_row["{}"])'.format(obj.prop)
            ),
            "is_numeric_dtype(df_row['{}'])".format(obj.prop),
        ]

    def _parse_num_not_nan(self, obj):
        """Processes predicates that check for NaN."""
        if obj.prop == "depth":
            return [
                None,
                obj.name,
                "not pd.isna(df_row.name._depth)",
                "isinstance(df_row.name._depth, Real)",
            ]
        if obj.prop == "node_id":
            return [
                None,
                obj.name,
                "not pd.isna(df_row.name._hatchet_nid)",
                "isinstance(df_row.name._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            'not pd.isna(df_row["{}"])'.format(obj.prop),
            "isinstance(df_row['{}'], Real)".format(obj.prop),
        ]

    def _parse_num_not_nan_multi_idx(self, obj):
        """Multi-index variant of _parse_num_not_nan."""
        if obj.prop == "depth":
            return [
                None,
                obj.name,
                "not pd.isna(df_row.index.get_level_values('node')[0]._depth)",
                "isinstance(df_row.index.get_level_values('node')[0]._depth, Real)",
            ]
        if obj.prop == "node_id":
            return [
                None,
                obj.name,
                "not pd.isna(df_row.index.get_level_values('node')[0]._hatchet_nid)",
                "isinstance(df_row.index.get_level_values('node')[0]._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            self._add_aggregation_call_to_multi_idx_predicate(
                'not pd.isna(df_row["{}"])'.format(obj.prop)
            ),
            "is_numeric_dtype(df_row['{}'])".format(obj.prop),
        ]

    def _parse_num_inf(self, obj):
        """Processes predicates that check for Infinity."""
        if obj.prop == "depth":
            return [
                None,
                obj.name,
                "np.isinf(df_row.name._depth)",
                "isinstance(df_row.name._depth, Real)",
            ]
        if obj.prop == "node_id":
            return [
                None,
                obj.name,
                "np.isinf(df_row.name._hatchet_nid)",
                "isinstance(df_row.name._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            'np.isinf(df_row["{}"])'.format(obj.prop),
            "isinstance(df_row['{}'], Real)".format(obj.prop),
        ]

    def _parse_num_inf_multi_idx(self, obj):
        """Multi-index variant of _parse_num_inf."""
        if obj.prop == "depth":
            return [
                None,
                obj.name,
                "np.isinf(df_row.index.get_level_values('node')[0]._depth)",
                "isinstance(df_row.index.get_level_values('node')[0]._depth, Real)",
            ]
        if obj.prop == "node_id":
            return [
                None,
                obj.name,
                "np.isinf(df_row.index.get_level_values('node')[0]._hatchet_nid)",
                "isinstance(df_row.index.get_level_values('node')[0]._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            self._add_aggregation_call_to_multi_idx_predicate(
                'np.isinf(df_row["{}"])'.format(obj.prop)
            ),
            "is_numeric_dtype(df_row['{}'])".format(obj.prop),
        ]

    def _parse_num_not_inf(self, obj):
        """Processes predicates that check for not-Infinity."""
        if obj.prop == "depth":
            return [
                None,
                obj.name,
                "not np.isinf(df_row.name._depth)",
                "isinstance(df_row.name._depth, Real)",
            ]
        if obj.prop == "node_id":
            return [
                None,
                obj.name,
                "not np.isinf(df_row.name._hatchet_nid)",
                "isinstance(df_row.name._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            'not np.isinf(df_row["{}"])'.format(obj.prop),
            "isinstance(df_row['{}'], Real)".format(obj.prop),
        ]

    def _parse_num_not_inf_multi_idx(self, obj):
        """Multi-index variant of _parse_num_not_inf."""
        if obj.prop == "depth":
            return [
                None,
                obj.name,
                "not np.isinf(df_row.index.get_level_values('node')[0]._depth)",
                "isinstance(df_row.index.get_level_values('node')[0]._depth, Real)",
            ]
        if obj.prop == "node_id":
            return [
                None,
                obj.name,
                "not np.isinf(df_row.index.get_level_values('node')[0]._hatchet_nid)",
                "isinstance(df_row.index.get_level_values('node')[0]._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            self._add_aggregation_call_to_multi_idx_predicate(
                'not np.isinf(df_row["{}"])'.format(obj.prop)
            ),
            "is_numeric_dtype(df_row['{}'])".format(obj.prop),
        ]
def parse_string_dialect(query_str, multi_index_mode="off"):
    """Parse all types of String-based queries, including multi-queries
    that leverage the curly brace delimiters.

    Arguments:
        query_str (str): the String-based query to be parsed
        multi_index_mode (str): multi-index handling mode, forwarded to
            every StringQuery built from the parsed pieces

    Returns:
        (Query or CompoundQuery): A Hatchet query object representing the
            String-based query

    Raises:
        ValueError: on invalid curly brace groupings, mismatched counts of
            brace-delimited regions and compound operators, or an unknown
            compound operator
    """
    # TODO Check if there's a way to prevent curly braces in a string
    # from being captured

    # Find the number of curly brace-delimited regions in the query
    query_str = query_str.strip()
    curly_brace_elems = re.findall(r"\{(.*?)\}", query_str)
    num_curly_brace_elems = len(curly_brace_elems)
    # If there are no curly brace-delimited regions, just pass the query
    # off to the CypherQuery constructor
    if num_curly_brace_elems == 0:
        if sys.version_info[0] == 2:
            query_str = query_str.decode("utf-8")
        return StringQuery(query_str, multi_index_mode)
    # Create an iterator over the curly brace-delimited regions
    curly_brace_iter = re.finditer(r"\{(.*?)\}", query_str)
    # Will store curly brace-delimited regions in the WHERE clause
    condition_list = None
    # Will store curly brace-delimited regions that contain entire
    # mid-level queries (MATCH clause and WHERE clause)
    query_list = None
    # If entire queries are in brace-delimited regions, store the indexes
    # of the regions here so we don't consider brace-delimited regions
    # within the already-captured region.
    query_idxes = None
    # Store which compound queries to apply to the curly brace-delimited regions
    compound_ops = []
    for i, match in enumerate(curly_brace_iter):
        # Get the substring within curly braces
        substr = query_str[match.start() + 1 : match.end() - 1]
        substr = substr.strip()
        # If an entire query (MATCH + WHERE) is within curly braces,
        # add the query to "query_list", and add the indexes corresponding
        # to the query to "query_idxes"
        if substr.startswith("MATCH"):
            if query_list is None:
                query_list = []
            if query_idxes is None:
                query_idxes = []
            query_list.append(substr)
            query_idxes.append((match.start(), match.end()))
        # If the curly brace-delimited region contains only parts of a
        # WHERE clause, first, check if the region is within another
        # curly brace delimited region. If it is, do nothing (it will
        # be handled later). Otherwise, add the region to "condition_list"
        elif re.match(r"[a-zA-Z0-9_]+\..*", substr) is not None:
            is_encapsulated_region = False
            if query_idxes is not None:
                for s, e in query_idxes:
                    # BUGFIX: containment requires BOTH endpoints to fall
                    # inside the captured region. The original used "or",
                    # which flagged every region occurring after any
                    # captured full query as "nested" and silently
                    # dropped it.
                    if match.start() >= s and match.end() <= e:
                        is_encapsulated_region = True
                        break
            if is_encapsulated_region:
                continue
            if condition_list is None:
                condition_list = []
            condition_list.append(substr)
        # If the curly brace-delimited region is neither a whole query
        # or part of a WHERE clause, raise an error
        else:
            raise ValueError("Invalid grouping (with curly braces) within the query")
        # If there is a compound operator directly after the curly
        # brace-delimited region, capture the type of operator, and store
        # the type in "compound_ops"
        if i + 1 < num_curly_brace_elems:
            rest_substr = query_str[match.end() :]
            rest_substr = rest_substr.strip()
            if rest_substr.startswith("AND"):
                compound_ops.append("AND")
            elif rest_substr.startswith("OR"):
                compound_ops.append("OR")
            elif rest_substr.startswith("XOR"):
                compound_ops.append("XOR")
            else:
                raise ValueError("Invalid compound operator type found!")
    # Each call to this function should only consider one of the full query
    # or WHERE clause versions at a time. If both types were captured, raise
    # an error because some type of internal logic issue occured.
    if condition_list is not None and query_list is not None:
        raise ValueError(
            "Curly braces must be around either a full mid-level query or a set of conditions in a single mid-level query"
        )
    # This branch is for the WHERE clause version
    if condition_list is not None:
        # Make sure you correctly gathered curly brace-delimited regions and
        # compound operators
        if len(condition_list) != len(compound_ops) + 1:
            raise ValueError(
                "Incompatible number of curly brace elements and compound operators"
            )
        # Get the MATCH clause that will be shared across the subqueries
        match_comp_obj = re.search(r"MATCH\s+(?P<match_field>.*)\s+WHERE", query_str)
        match_comp = match_comp_obj.group("match_field")
        # Iterate over the compound operators
        full_query = None
        for i, op in enumerate(compound_ops):
            # If in the first iteration, set the initial query as a
            # CypherQuery where the MATCH clause is the shared match clause
            # and the WHERE clause is the first curly brace-delimited region
            if i == 0:
                query1 = "MATCH {} WHERE {}".format(match_comp, condition_list[i])
                if sys.version_info[0] == 2:
                    query1 = query1.decode("utf-8")
                full_query = StringQuery(query1, multi_index_mode)
            # Get the next query as a CypherQuery where the MATCH clause is
            # the shared match clause and the WHERE clause is the next curly
            # brace-delimited region
            next_query = "MATCH {} WHERE {}".format(match_comp, condition_list[i + 1])
            if sys.version_info[0] == 2:
                next_query = next_query.decode("utf-8")
            next_query = StringQuery(next_query, multi_index_mode)
            # Add the next query to the full query using the compound
            # operator currently being considered
            if op == "AND":
                full_query = full_query & next_query
            elif op == "OR":
                full_query = full_query | next_query
            else:
                full_query = full_query ^ next_query
        return full_query
    # This branch is for the full query version
    else:
        # Make sure you correctly gathered curly brace-delimited regions and
        # compound operators
        if len(query_list) != len(compound_ops) + 1:
            raise ValueError(
                "Incompatible number of curly brace elements and compound operators"
            )
        # Iterate over the compound operators
        full_query = None
        for i, op in enumerate(compound_ops):
            # If in the first iteration, set the initial query as the result
            # of recursively calling this function on the first curly
            # brace-delimited region.
            # BUGFIX: forward multi_index_mode through the recursion; the
            # original dropped it, silently resetting nested subqueries to
            # the default mode.
            if i == 0:
                full_query = parse_string_dialect(query_list[i], multi_index_mode)
            # Get the next query by recursively calling this function
            # on the next curly brace-delimited region
            next_query = parse_string_dialect(query_list[i + 1], multi_index_mode)
            # Add the next query to the full query using the compound
            # operator currently being considered
            if op == "AND":
                full_query = full_query & next_query
            elif op == "OR":
                full_query = full_query | next_query
            else:
                full_query = full_query ^ next_query
        return full_query