# Source code for hatchet.query.string_dialect

# Copyright 2017-2023 Lawrence Livermore National Security, LLC and other
# Hatchet Project Developers. See the top-level LICENSE file for details.
#
# SPDX-License-Identifier: MIT

from numbers import Real
import re
import sys
import pandas as pd  # noqa: F401
from pandas.api.types import is_numeric_dtype, is_string_dtype  # noqa: F401
import numpy as np  # noqa: F401
from textx import metamodel_from_str
from textx.exceptions import TextXError
import warnings

from .errors import InvalidQueryPath, InvalidQueryFilter, RedundantQueryFilterWarning
from .query import Query


# PEG grammar for the String-based dialect of Hatchet's query language.
# A query is a MATCH clause ('->'-chained node patterns, each with an
# optional quantifier and capture name) plus an optional WHERE clause of
# string/numeric/None/leaf predicates combined with AND/OR/NOT.
CYPHER_GRAMMAR = u"""
FullQuery: path_expr=MatchExpr(cond_expr=WhereExpr)?;
MatchExpr: 'MATCH' path=PathQuery;
PathQuery: '(' nodes=NodeExpr ')'('->' '(' nodes=NodeExpr ')')*;
NodeExpr: ((wcard=INT | wcard=STRING) ',' name=ID) | (wcard=INT | wcard=STRING) |  name=ID;
WhereExpr: 'WHERE' ConditionExpr;
ConditionExpr: conditions+=CompoundCond;
CompoundCond: UnaryCond | BinaryCond;
BinaryCond: AndCond | OrCond;
AndCond: 'AND' subcond=UnaryCond;
OrCond: 'OR' subcond=UnaryCond;
UnaryCond: NotCond | SingleCond;
NotCond: 'NOT' subcond=SingleCond;
SingleCond: StringCond | NumberCond | NoneCond | NotNoneCond | LeafCond | NotLeafCond;
NoneCond: name=ID '.' prop=STRING 'IS NONE';
NotNoneCond: name=ID '.' prop=STRING 'IS NOT NONE';
LeafCond: name=ID 'IS LEAF';
NotLeafCond: name=ID 'IS NOT LEAF';
StringCond: StringEq | StringStartsWith | StringEndsWith | StringContains | StringMatch;
StringEq: name=ID '.' prop=STRING '=' val=STRING;
StringStartsWith: name=ID '.' prop=STRING 'STARTS WITH' val=STRING;
StringEndsWith: name=ID '.' prop=STRING 'ENDS WITH' val=STRING;
StringContains: name=ID '.' prop=STRING 'CONTAINS' val=STRING;
StringMatch: name=ID '.' prop=STRING '=~' val=STRING;
NumberCond: NumEq | NumLt | NumGt | NumLte | NumGte | NumNan | NumNotNan | NumInf | NumNotInf;
NumEq: name=ID '.' prop=STRING '=' val=NUMBER;
NumLt: name=ID '.' prop=STRING '<' val=NUMBER;
NumGt: name=ID '.' prop=STRING '>' val=NUMBER;
NumLte: name=ID '.' prop=STRING '<=' val=NUMBER;
NumGte: name=ID '.' prop=STRING '>=' val=NUMBER;
NumNan: name=ID '.' prop=STRING 'IS NAN';
NumNotNan: name=ID '.' prop=STRING 'IS NOT NAN';
NumInf: name=ID '.' prop=STRING 'IS INF';
NumNotInf: name=ID '.' prop=STRING 'IS NOT INF';
"""

# TextX metamodel for the String-based dialect; compiled once at import
# time and shared by every StringQuery instance.
cypher_query_mm = metamodel_from_str(CYPHER_GRAMMAR)


def cname(obj):
    """Return the name of the grammar rule that *obj* represents.

    textX instantiates one class per grammar rule, so the class name of a
    parsed object identifies which rule matched it.
    """
    return type(obj).__name__
def filter_check_types(type_check, df_row, filt_lambda):
    """Run a String-dialect predicate on a node's data, guarding on type.

    Evaluates ``type_check`` (a Python boolean expression over ``df_row``)
    first so that the predicate never sees data of an unexpected type.

    Arguments:
        type_check (str): a boolean Python expression used to validate node
            data typing; an empty string means no check is required
        df_row (pandas.Series or pandas.DataFrame): the row (or
            sub-DataFrame) representing the data for the current node
        filt_lambda (Callable): the predicate that decides whether the node
            satisfies the filter

    Returns:
        (bool): the predicate's verdict; False when the data is missing
            (a KeyError was raised)

    Raises:
        InvalidQueryFilter: when the type check evaluates to False.
    """
    try:
        # NOTE: eval() here only ever sees expressions generated internally
        # by StringQuery, not arbitrary user-supplied strings.
        types_ok = type_check == "" or eval(type_check)
        if not types_ok:
            raise InvalidQueryFilter("Type mismatch in filter")
        return filt_lambda(df_row)
    except KeyError:
        # Missing column/metric: the node cannot satisfy the predicate.
        return False
class StringQuery(Query):
    """Class for representing and parsing queries using the String-based dialect."""

    def __init__(self, cypher_query, multi_index_mode="off"):
        """Builds a new StringQuery object representing a query in the String-based dialect.

        Arguments:
            cypher_query (str): a query in the String-based dialect
            multi_index_mode (str): one of "off", "all", or "any"; controls
                how predicates aggregate over rows of a multi-indexed
                DataFrame
        """
        if sys.version_info[0] == 2:
            super(StringQuery, self).__init__()
        else:
            super().__init__()
        assert multi_index_mode in ["off", "all", "any"]
        self.multi_index_mode = multi_index_mode
        model = None
        try:
            model = cypher_query_mm.model_from_str(cypher_query)
        except TextXError as e:
            # TODO Change to a "raise-from" expression when Python 2.7 support is dropped
            raise InvalidQueryPath(
                "Invalid String Dialect Query Detected. Parser Error Message: {}".format(
                    e.message
                )
            )
        # wcards: list of [quantifier, capture-name] pairs, one per MATCH node
        self.wcards = []
        # wcard_pos: maps a capture name to its position in wcards
        self.wcard_pos = {}
        self._parse_path(model.path_expr)
        # filters[i]: list of [logical-op, predicate-str, type-check-str]
        # entries for MATCH node i, collected from the WHERE clause
        self.filters = [[] for _ in self.wcards]
        self._parse_conditions(model.cond_expr)
        # lambda_filters[i]: source text of the final predicate lambda for
        # node i (evaluated with eval() in _build_query), or None
        self.lambda_filters = [None for _ in self.wcards]
        self._build_lambdas()
        self._build_query()

    def _build_query(self):
        """Builds the entire query using 'match' and 'rel' using the
        pre-parsed quantifiers and predicates.
        """
        for i in range(0, len(self.wcards)):
            wcard = self.wcards[i][0]
            # TODO Remove this when Python 2.7 support is dropped.
            if sys.version_info[0] == 2 and not isinstance(wcard, Real):
                wcard = wcard.encode("ascii", "ignore")
            filt_str = self.lambda_filters[i]
            if filt_str is None:
                # No predicate: match on quantifier alone.
                if i == 0:
                    self.match(quantifier=wcard)
                else:
                    self.rel(quantifier=wcard)
            else:
                # eval() turns the generated lambda source into a callable;
                # the source was produced internally by _build_lambdas.
                if i == 0:
                    self.match(quantifier=wcard, predicate=eval(filt_str))
                else:
                    self.rel(quantifier=wcard, predicate=eval(filt_str))

    def _build_lambdas(self):
        """Constructs the final predicate lambdas from the pre-parsed
        predicate information.
        """
        for i in range(0, len(self.wcards)):
            n = self.wcards[i]
            # Only named MATCH nodes can be referenced in WHERE predicates.
            if n[1] != "":
                bool_expr = ""
                type_check = ""
                for j, cond in enumerate(self.filters[i]):
                    # cond[0] is the logical connective ("and"/"or"/"not"),
                    # None for the first predicate of the chain.
                    if cond[0] is not None:
                        bool_expr += " {}".format(cond[0])
                    bool_expr += " {}".format(cond[1])
                    if cond[2] is not None:
                        # Type checks are always ANDed together.
                        if j == 0:
                            type_check += " {}".format(cond[2])
                        else:
                            type_check += " and {}".format(cond[2])
                bool_expr = "lambda df_row: {}".format(bool_expr)
                # Wrap the predicate so the type check runs first.
                bool_expr = (
                    'lambda df_row: filter_check_types("{}", df_row, {})'.format(
                        type_check, bool_expr
                    )
                )
                self.lambda_filters[i] = bool_expr

    def _parse_path(self, path_obj):
        """Parses the MATCH statement of a String-based query."""
        nodes = path_obj.path.nodes
        idx = len(self.wcards)
        for n in nodes:
            new_node = [n.wcard, n.name]
            # A missing/empty quantifier means "exactly one node" (".").
            if n.wcard is None or n.wcard == "" or n.wcard == 0:
                new_node[0] = "."
            self.wcards.append(new_node)
            if n.name != "":
                self.wcard_pos[n.name] = idx
            idx += 1

    def _parse_conditions(self, cond_expr):
        """Top level function for parsing the WHERE statement of a
        String-based query.
        """
        conditions = cond_expr.conditions
        for cond in conditions:
            converted_condition = None
            if self._is_unary_cond(cond):
                converted_condition = self._parse_unary_cond(cond)
            elif self._is_binary_cond(cond):
                converted_condition = self._parse_binary_cond(cond)
            else:
                raise RuntimeError("Bad Condition")
            # converted_condition is [logical-op, node-name, predicate,
            # type-check]; store everything but the name, bucketed by the
            # MATCH-node position the name refers to.
            self.filters[self.wcard_pos[converted_condition[1]]].append(
                [converted_condition[0], converted_condition[2], converted_condition[3]]
            )
        # Drop the leading logical connective from each node's first
        # predicate (there is nothing before it to connect to).
        for i in range(0, len(self.filters)):
            if len(self.filters[i]) > 0:
                if self.filters[i][0][0] != "not":
                    self.filters[i][0][0] = None

    def _is_unary_cond(self, obj):
        """Detect whether a predicate is unary or not."""
        if (
            cname(obj) == "NotCond"
            or self._is_str_cond(obj)
            or self._is_num_cond(obj)
            or cname(obj) in ["NoneCond", "NotNoneCond", "LeafCond", "NotLeafCond"]
        ):
            return True
        return False

    def _is_binary_cond(self, obj):
        """Detect whether a predicate is binary or not."""
        if cname(obj) in ["AndCond", "OrCond"]:
            return True
        return False

    def _parse_binary_cond(self, obj):
        """Top level function for parsing binary predicates."""
        if cname(obj) == "AndCond":
            return self._parse_and_cond(obj)
        if cname(obj) == "OrCond":
            return self._parse_or_cond(obj)
        raise RuntimeError("Bad Binary Condition")

    def _parse_or_cond(self, obj):
        """Top level function for parsing predicates combined with logical OR."""
        converted_subcond = self._parse_unary_cond(obj.subcond)
        converted_subcond[0] = "or"
        return converted_subcond

    def _parse_and_cond(self, obj):
        """Top level function for parsing predicates combined with logical AND."""
        converted_subcond = self._parse_unary_cond(obj.subcond)
        converted_subcond[0] = "and"
        return converted_subcond

    def _parse_unary_cond(self, obj):
        """Top level function for parsing unary predicates."""
        if cname(obj) == "NotCond":
            return self._parse_not_cond(obj)
        return self._parse_single_cond(obj)

    def _parse_not_cond(self, obj):
        """Parse predicates containing the logical NOT operator."""
        converted_subcond = self._parse_single_cond(obj.subcond)
        # Negate the predicate expression itself (index 2), not the
        # logical connective.
        converted_subcond[2] = "not {}".format(converted_subcond[2])
        return converted_subcond

    def _run_method_based_on_multi_idx_mode(self, method_name, obj):
        """Dispatch to the plain or '_multi_idx' variant of a parse method,
        depending on self.multi_index_mode.
        """
        real_method_name = method_name
        if self.multi_index_mode != "off":
            real_method_name = method_name + "_multi_idx"
        method = eval("StringQuery.{}".format(real_method_name))
        return method(self, obj)

    def _parse_single_cond(self, obj):
        """Top level function for parsing individual numeric or string predicates."""
        if self._is_str_cond(obj):
            return self._parse_str(obj)
        if self._is_num_cond(obj):
            return self._parse_num(obj)
        if cname(obj) == "NoneCond":
            return self._run_method_based_on_multi_idx_mode("_parse_none", obj)
        if cname(obj) == "NotNoneCond":
            return self._run_method_based_on_multi_idx_mode("_parse_not_none", obj)
        if cname(obj) == "LeafCond":
            return self._run_method_based_on_multi_idx_mode("_parse_leaf", obj)
        if cname(obj) == "NotLeafCond":
            return self._run_method_based_on_multi_idx_mode("_parse_not_leaf", obj)
        raise RuntimeError("Bad Single Condition")

    def _parse_none(self, obj):
        """Parses 'property IS NONE'."""
        # "depth" and "node_id" are pseudo-properties read off the Node
        # object in the DataFrame index rather than from a column.
        if obj.prop == "depth":
            return [
                None,
                obj.name,
                "df_row.name._depth is None",
                None,
            ]
        if obj.prop == "node_id":
            return [
                None,
                obj.name,
                "df_row.name._hatchet_nid is None",
                None,
            ]
        return [
            None,
            obj.name,
            'df_row["{}"] is None'.format(obj.prop),
            None,
        ]

    def _add_aggregation_call_to_multi_idx_predicate(self, predicate):
        """Append the pandas aggregation (.any()/.all()) matching
        self.multi_index_mode to a predicate expression string.
        """
        if self.multi_index_mode == "any":
            return predicate + ".any()"
        return predicate + ".all()"

    def _parse_none_multi_idx(self, obj):
        """Multi-index variant of _parse_none."""
        if obj.prop == "depth":
            return [
                None,
                obj.name,
                "df_row.index.get_level_values('node')[0]._depth is None",
                None,
            ]
        if obj.prop == "node_id":
            return [
                None,
                obj.name,
                "df_row.index.get_level_values('node')[0]._hatchet_nid is None",
                None,
            ]
        if self.multi_index_mode == "any":
            return [
                None,
                obj.name,
                "df_row['{}'].apply(lambda elem: elem is None).any()".format(obj.prop),
                None,
            ]
        # if self.multi_index_mode == "all":
        return [
            None,
            obj.name,
            self._add_aggregation_call_to_multi_idx_predicate(
                "df_row['{}'].apply(lambda elem: elem is None)".format(obj.prop)
            ),
            None,
        ]

    def _parse_not_none(self, obj):
        """Parses 'property IS NOT NONE'."""
        if obj.prop == "depth":
            return [
                None,
                obj.name,
                "df_row.name._depth is not None",
                None,
            ]
        if obj.prop == "node_id":
            return [
                None,
                obj.name,
                "df_row.name._hatchet_nid is not None",
                None,
            ]
        return [
            None,
            obj.name,
            'df_row["{}"] is not None'.format(obj.prop),
            None,
        ]

    def _parse_not_none_multi_idx(self, obj):
        """Multi-index variant of _parse_not_none."""
        if obj.prop == "depth":
            return [
                None,
                obj.name,
                "df_row.index.get_level_values('node')[0]._depth is not None",
                None,
            ]
        if obj.prop == "node_id":
            return [
                None,
                obj.name,
                "df_row.index.get_level_values('node')[0]._hatchet_nid is not None",
                None,
            ]
        return [
            None,
            obj.name,
            self._add_aggregation_call_to_multi_idx_predicate(
                "df_row['{}'].apply(lambda elem: elem is not None)".format(obj.prop)
            ),
            None,
        ]

    def _parse_leaf(self, obj):
        """Parses 'node IS LEAF'."""
        return [
            None,
            obj.name,
            "len(df_row.name.children) == 0",
            None,
        ]

    def _parse_leaf_multi_idx(self, obj):
        """Multi-index variant of _parse_leaf."""
        return [
            None,
            obj.name,
            "len(df_row.index.get_level_values('node')[0].children) == 0",
            None,
        ]

    def _parse_not_leaf(self, obj):
        """Parses 'node IS NOT LEAF'."""
        return [
            None,
            obj.name,
            "len(df_row.name.children) > 0",
            None,
        ]

    def _parse_not_leaf_multi_idx(self, obj):
        """Multi-index variant of _parse_not_leaf."""
        return [
            None,
            obj.name,
            "len(df_row.index.get_level_values('node')[0].children) > 0",
            None,
        ]

    def _is_str_cond(self, obj):
        """Determines whether a predicate is for string data."""
        if cname(obj) in [
            "StringEq",
            "StringStartsWith",
            "StringEndsWith",
            "StringContains",
            "StringMatch",
        ]:
            return True
        return False

    def _is_num_cond(self, obj):
        """Determines whether a predicate is for numeric data."""
        if cname(obj) in [
            "NumEq",
            "NumLt",
            "NumGt",
            "NumLte",
            "NumGte",
            "NumNan",
            "NumNotNan",
            "NumInf",
            "NumNotInf",
        ]:
            return True
        return False

    def _parse_str(self, obj):
        """Function that redirects processing of string predicates
        to the correct function.
        """
        if cname(obj) == "StringEq":
            return self._run_method_based_on_multi_idx_mode("_parse_str_eq", obj)
        if cname(obj) == "StringStartsWith":
            return self._run_method_based_on_multi_idx_mode(
                "_parse_str_starts_with", obj
            )
        if cname(obj) == "StringEndsWith":
            return self._run_method_based_on_multi_idx_mode("_parse_str_ends_with", obj)
        if cname(obj) == "StringContains":
            return self._run_method_based_on_multi_idx_mode("_parse_str_contains", obj)
        if cname(obj) == "StringMatch":
            return self._run_method_based_on_multi_idx_mode("_parse_str_match", obj)
        raise RuntimeError("Bad String Op Class")

    def _parse_str_eq(self, obj):
        """Processes string equivalence predicates."""
        return [
            None,
            obj.name,
            'df_row["{}"] == "{}"'.format(obj.prop, obj.val),
            "isinstance(df_row['{}'], str)".format(obj.prop),
        ]

    def _parse_str_eq_multi_idx(self, obj):
        """Multi-index variant of _parse_str_eq."""
        return [
            None,
            obj.name,
            self._add_aggregation_call_to_multi_idx_predicate(
                'df_row["{}"].apply(lambda elem: elem == "{}")'.format(
                    obj.prop, obj.val
                )
            ),
            "is_string_dtype(df_row['{}'])".format(obj.prop),
        ]

    def _parse_str_starts_with(self, obj):
        """Processes string 'startswith' predicates."""
        return [
            None,
            obj.name,
            'df_row["{}"].startswith("{}")'.format(obj.prop, obj.val),
            "isinstance(df_row['{}'], str)".format(obj.prop),
        ]

    def _parse_str_starts_with_multi_idx(self, obj):
        """Multi-index variant of _parse_str_starts_with."""
        return [
            None,
            obj.name,
            self._add_aggregation_call_to_multi_idx_predicate(
                'df_row["{}"].apply(lambda elem: elem.startswith("{}"))'.format(
                    obj.prop, obj.val
                )
            ),
            "is_string_dtype(df_row['{}'])".format(obj.prop),
        ]

    def _parse_str_ends_with(self, obj):
        """Processes string 'endswith' predicates."""
        return [
            None,
            obj.name,
            'df_row["{}"].endswith("{}")'.format(obj.prop, obj.val),
            "isinstance(df_row['{}'], str)".format(obj.prop),
        ]

    def _parse_str_ends_with_multi_idx(self, obj):
        """Multi-index variant of _parse_str_ends_with."""
        return [
            None,
            obj.name,
            self._add_aggregation_call_to_multi_idx_predicate(
                'df_row["{}"].apply(lambda elem: elem.endswith("{}"))'.format(
                    obj.prop, obj.val
                )
            ),
            "is_string_dtype(df_row['{}'])".format(obj.prop),
        ]

    def _parse_str_contains(self, obj):
        """Processes string 'contains' predicates."""
        return [
            None,
            obj.name,
            '"{}" in df_row["{}"]'.format(obj.val, obj.prop),
            "isinstance(df_row['{}'], str)".format(obj.prop),
        ]

    def _parse_str_contains_multi_idx(self, obj):
        """Multi-index variant of _parse_str_contains."""
        return [
            None,
            obj.name,
            self._add_aggregation_call_to_multi_idx_predicate(
                'df_row["{}"].apply(lambda elem: "{}" in elem)'.format(
                    obj.prop, obj.val
                )
            ),
            "is_string_dtype(df_row['{}'])".format(obj.prop),
        ]

    def _parse_str_match(self, obj):
        """Processes string regex match predicates."""
        return [
            None,
            obj.name,
            're.match("{}", df_row["{}"]) is not None'.format(obj.val, obj.prop),
            "isinstance(df_row['{}'], str)".format(obj.prop),
        ]

    def _parse_str_match_multi_idx(self, obj):
        """Multi-index variant of _parse_str_match."""
        return [
            None,
            obj.name,
            self._add_aggregation_call_to_multi_idx_predicate(
                'df_row["{}"].apply(lambda elem: re.match("{}", elem) is not None)'.format(
                    obj.prop, obj.val
                )
            ),
            "is_string_dtype(df_row['{}'])".format(obj.prop),
        ]

    def _parse_num(self, obj):
        """Function that redirects processing of numeric predicates
        to the correct function.
        """
        if cname(obj) == "NumEq":
            return self._run_method_based_on_multi_idx_mode("_parse_num_eq", obj)
        if cname(obj) == "NumLt":
            return self._run_method_based_on_multi_idx_mode("_parse_num_lt", obj)
        if cname(obj) == "NumGt":
            return self._run_method_based_on_multi_idx_mode("_parse_num_gt", obj)
        if cname(obj) == "NumLte":
            return self._run_method_based_on_multi_idx_mode("_parse_num_lte", obj)
        if cname(obj) == "NumGte":
            return self._run_method_based_on_multi_idx_mode("_parse_num_gte", obj)
        if cname(obj) == "NumNan":
            return self._run_method_based_on_multi_idx_mode("_parse_num_nan", obj)
        if cname(obj) == "NumNotNan":
            return self._run_method_based_on_multi_idx_mode("_parse_num_not_nan", obj)
        if cname(obj) == "NumInf":
            return self._run_method_based_on_multi_idx_mode("_parse_num_inf", obj)
        if cname(obj) == "NumNotInf":
            return self._run_method_based_on_multi_idx_mode("_parse_num_not_inf", obj)
        raise RuntimeError("Bad Number Op Class")

    def _parse_num_eq(self, obj):
        """Processes numeric equivalence predicates."""
        if obj.prop == "depth":
            # 'depth = -1' is special-cased to mean "is a leaf".
            if obj.val == -1:
                return [
                    None,
                    obj.name,
                    "len(df_row.name.children) == 0",
                    None,
                ]
            elif obj.val < 0:
                warnings.warn(
                    """
                    The 'depth' property of a Node is strictly non-negative.
                    This condition will always be false.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "False",
                    "isinstance(df_row.name._depth, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.name._depth == {}".format(obj.val),
                "isinstance(df_row.name._depth, Real)",
            ]
        if obj.prop == "node_id":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'node_id' property of a Node is strictly non-negative.
                    This condition will always be false.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "False",
                    "isinstance(df_row.name._hatchet_nid, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.name._hatchet_nid == {}".format(obj.val),
                "isinstance(df_row.name._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            'df_row["{}"] == {}'.format(obj.prop, obj.val),
            "isinstance(df_row['{}'], Real)".format(obj.prop),
        ]

    def _parse_num_eq_multi_idx(self, obj):
        """Multi-index variant of _parse_num_eq."""
        if obj.prop == "depth":
            if obj.val == -1:
                return [
                    None,
                    obj.name,
                    "len(df_row.index.get_level_values('node')[0].children) == 0",
                    None,
                ]
            elif obj.val < 0:
                warnings.warn(
                    """
                    The 'depth' property of a Node is strictly non-negative.
                    This condition will always be false.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "False",
                    "isinstance(df_row.index.get_level_values('node')[0]._depth, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.index.get_level_values('node')[0]._depth == {}".format(obj.val),
                "isinstance(df_row.index.get_level_values('node')[0]._depth, Real)",
            ]
        if obj.prop == "node_id":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'node_id' property of a Node is strictly non-negative.
                    This condition will always be false.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "False",
                    "isinstance(df_row.index.get_level_values('node')[0]._hatchet_nid, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.index.get_level_values('node')[0]._hatchet_nid == {}".format(
                    obj.val
                ),
                "isinstance(df_row.index.get_level_values('node')[0]._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            self._add_aggregation_call_to_multi_idx_predicate(
                'df_row["{}"].apply(lambda elem: elem == {})'.format(obj.prop, obj.val)
            ),
            "is_numeric_dtype(df_row['{}'])".format(obj.prop),
        ]

    def _parse_num_lt(self, obj):
        """Processes numeric less-than predicates."""
        if obj.prop == "depth":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'depth' property of a Node is strictly non-negative.
                    This condition will always be false.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "False",
                    "isinstance(df_row.name._depth, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.name._depth < {}".format(obj.val),
                "isinstance(df_row.name._depth, Real)",
            ]
        if obj.prop == "node_id":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'node_id' property of a Node is strictly non-negative.
                    This condition will always be false.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "False",
                    "isinstance(df_row.name._hatchet_nid, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.name._hatchet_nid < {}".format(obj.val),
                "isinstance(df_row.name._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            'df_row["{}"] < {}'.format(obj.prop, obj.val),
            "isinstance(df_row['{}'], Real)".format(obj.prop),
        ]

    def _parse_num_lt_multi_idx(self, obj):
        """Multi-index variant of _parse_num_lt."""
        if obj.prop == "depth":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'depth' property of a Node is strictly non-negative.
                    This condition will always be false.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "False",
                    "isinstance(df_row.index.get_level_values('node')[0]._depth, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.index.get_level_values('node')[0]._depth < {}".format(obj.val),
                "isinstance(df_row.index.get_level_values('node')[0]._depth, Real)",
            ]
        if obj.prop == "node_id":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'node_id' property of a Node is strictly non-negative.
                    This condition will always be false.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "False",
                    "isinstance(df_row.index.get_level_values('node')[0]._hatchet_nid, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.index.get_level_values('node')[0]._hatchet_nid < {}".format(
                    obj.val
                ),
                "isinstance(df_row.index.get_level_values('node')[0]._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            self._add_aggregation_call_to_multi_idx_predicate(
                'df_row["{}"].apply(lambda elem: elem < {})'.format(obj.prop, obj.val)
            ),
            "is_numeric_dtype(df_row['{}'])".format(obj.prop),
        ]

    def _parse_num_gt(self, obj):
        """Processes numeric greater-than predicates."""
        if obj.prop == "depth":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'depth' property of a Node is strictly non-negative.
                    This condition will always be true.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "True",
                    "isinstance(df_row.name._depth, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.name._depth > {}".format(obj.val),
                "isinstance(df_row.name._depth, Real)",
            ]
        if obj.prop == "node_id":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'node_id' property of a Node is strictly non-negative.
                    This condition will always be true.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "True",
                    "isinstance(df_row.name._hatchet_nid, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.name._hatchet_nid > {}".format(obj.val),
                "isinstance(df_row.name._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            'df_row["{}"] > {}'.format(obj.prop, obj.val),
            "isinstance(df_row['{}'], Real)".format(obj.prop),
        ]

    def _parse_num_gt_multi_idx(self, obj):
        """Multi-index variant of _parse_num_gt."""
        if obj.prop == "depth":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'depth' property of a Node is strictly non-negative.
                    This condition will always be true.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "True",
                    "isinstance(df_row.index.get_level_values('node')[0]._depth, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.index.get_level_values('node')[0]._depth > {}".format(obj.val),
                "isinstance(df_row.index.get_level_values('node')[0]._depth, Real)",
            ]
        if obj.prop == "node_id":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'node_id' property of a Node is strictly non-negative.
                    This condition will always be true.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "True",
                    "isinstance(df_row.index.get_level_values('node')[0]._hatchet_nid, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.index.get_level_values('node')[0]._hatchet_nid > {}".format(
                    obj.val
                ),
                "isinstance(df_row.index.get_level_values('node')[0]._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            self._add_aggregation_call_to_multi_idx_predicate(
                'df_row["{}"].apply(lambda elem: elem > {})'.format(obj.prop, obj.val)
            ),
            "is_numeric_dtype(df_row['{}'])".format(obj.prop),
        ]

    def _parse_num_lte(self, obj):
        """Processes numeric less-than-or-equal-to predicates."""
        if obj.prop == "depth":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'depth' property of a Node is strictly non-negative.
                    This condition will always be false.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "False",
                    "isinstance(df_row.name._depth, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.name._depth <= {}".format(obj.val),
                "isinstance(df_row.name._depth, Real)",
            ]
        if obj.prop == "node_id":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'node_id' property of a Node is strictly non-negative.
                    This condition will always be false.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "False",
                    "isinstance(df_row.name._hatchet_nid, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.name._hatchet_nid <= {}".format(obj.val),
                "isinstance(df_row.name._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            'df_row["{}"] <= {}'.format(obj.prop, obj.val),
            "isinstance(df_row['{}'], Real)".format(obj.prop),
        ]

    def _parse_num_lte_multi_idx(self, obj):
        """Multi-index variant of _parse_num_lte."""
        if obj.prop == "depth":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'depth' property of a Node is strictly non-negative.
                    This condition will always be false.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "False",
                    "isinstance(df_row.index.get_level_values('node')[0]._depth, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.index.get_level_values('node')[0]._depth <= {}".format(obj.val),
                "isinstance(df_row.index.get_level_values('node')[0]._depth, Real)",
            ]
        if obj.prop == "node_id":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'node_id' property of a Node is strictly non-negative.
                    This condition will always be false.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "False",
                    "isinstance(df_row.index.get_level_values('node')[0]._hatchet_nid, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.index.get_level_values('node')[0]._hatchet_nid <= {}".format(
                    obj.val
                ),
                "isinstance(df_row.index.get_level_values('node')[0]._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            self._add_aggregation_call_to_multi_idx_predicate(
                'df_row["{}"].apply(lambda elem: elem <= {})'.format(obj.prop, obj.val)
            ),
            "is_numeric_dtype(df_row['{}'])".format(obj.prop),
        ]

    def _parse_num_gte(self, obj):
        """Processes numeric greater-than-or-equal-to predicates."""
        if obj.prop == "depth":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'depth' property of a Node is strictly non-negative.
                    This condition will always be true.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "True",
                    "isinstance(df_row.name._depth, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.name._depth >= {}".format(obj.val),
                "isinstance(df_row.name._depth, Real)",
            ]
        if obj.prop == "node_id":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'node_id' property of a Node is strictly non-negative.
                    This condition will always be true.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "True",
                    "isinstance(df_row.name._hatchet_nid, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.name._hatchet_nid >= {}".format(obj.val),
                "isinstance(df_row.name._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            'df_row["{}"] >= {}'.format(obj.prop, obj.val),
            "isinstance(df_row['{}'], Real)".format(obj.prop),
        ]

    def _parse_num_gte_multi_idx(self, obj):
        """Multi-index variant of _parse_num_gte."""
        if obj.prop == "depth":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'depth' property of a Node is strictly non-negative.
                    This condition will always be true.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "True",
                    "isinstance(df_row.index.get_level_values('node')[0]._depth, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.index.get_level_values('node')[0]._depth >= {}".format(obj.val),
                "isinstance(df_row.index.get_level_values('node')[0]._depth, Real)",
            ]
        if obj.prop == "node_id":
            if obj.val < 0:
                warnings.warn(
                    """
                    The 'node_id' property of a Node is strictly non-negative.
                    This condition will always be true.
                    The statement that triggered this warning is:
                    {}
                    """.format(
                        obj
                    ),
                    RedundantQueryFilterWarning,
                )
                return [
                    None,
                    obj.name,
                    "True",
                    "isinstance(df_row.index.get_level_values('node')[0]._hatchet_nid, Real)",
                ]
            return [
                None,
                obj.name,
                "df_row.index.get_level_values('node')[0]._hatchet_nid >= {}".format(
                    obj.val
                ),
                "isinstance(df_row.index.get_level_values('node')[0]._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            self._add_aggregation_call_to_multi_idx_predicate(
                'df_row["{}"].apply(lambda elem: elem >= {})'.format(obj.prop, obj.val)
            ),
            "is_numeric_dtype(df_row['{}'])".format(obj.prop),
        ]

    def _parse_num_nan(self, obj):
        """Processes predicates that check for NaN."""
        if obj.prop == "depth":
            return [
                None,
                obj.name,
                "pd.isna(df_row.name._depth)",
                "isinstance(df_row.name._depth, Real)",
            ]
        if obj.prop == "node_id":
            return [
                None,
                obj.name,
                "pd.isna(df_row.name._hatchet_nid)",
                "isinstance(df_row.name._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            'pd.isna(df_row["{}"])'.format(obj.prop),
            "isinstance(df_row['{}'], Real)".format(obj.prop),
        ]

    def _parse_num_nan_multi_idx(self, obj):
        """Multi-index variant of _parse_num_nan."""
        if obj.prop == "depth":
            return [
                None,
                obj.name,
                "pd.isna(df_row.index.get_level_values('node')[0]._depth)",
                "isinstance(df_row.index.get_level_values('node')[0]._depth, Real)",
            ]
        if obj.prop == "node_id":
            return [
                None,
                obj.name,
                "pd.isna(df_row.index.get_level_values('node')[0]._hatchet_nid)",
                "isinstance(df_row.index.get_level_values('node')[0]._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            self._add_aggregation_call_to_multi_idx_predicate(
                'pd.isna(df_row["{}"])'.format(obj.prop)
            ),
            "is_numeric_dtype(df_row['{}'])".format(obj.prop),
        ]

    def _parse_num_not_nan(self, obj):
        """Processes predicates that check for NaN."""
        if obj.prop == "depth":
            return [
                None,
                obj.name,
                "not pd.isna(df_row.name._depth)",
                "isinstance(df_row.name._depth, Real)",
            ]
        if obj.prop == "node_id":
            return [
                None,
                obj.name,
                "not pd.isna(df_row.name._hatchet_nid)",
                "isinstance(df_row.name._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            'not pd.isna(df_row["{}"])'.format(obj.prop),
            "isinstance(df_row['{}'], Real)".format(obj.prop),
        ]

    def _parse_num_not_nan_multi_idx(self, obj):
        """Multi-index variant of _parse_num_not_nan."""
        if obj.prop == "depth":
            return [
                None,
                obj.name,
                "not pd.isna(df_row.index.get_level_values('node')[0]._depth)",
                "isinstance(df_row.index.get_level_values('node')[0]._depth, Real)",
            ]
        if obj.prop == "node_id":
            return [
                None,
                obj.name,
                "not pd.isna(df_row.index.get_level_values('node')[0]._hatchet_nid)",
                "isinstance(df_row.index.get_level_values('node')[0]._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            self._add_aggregation_call_to_multi_idx_predicate(
                'not pd.isna(df_row["{}"])'.format(obj.prop)
            ),
            "is_numeric_dtype(df_row['{}'])".format(obj.prop),
        ]

    def _parse_num_inf(self, obj):
        """Processes predicates that check for Infinity."""
        if obj.prop == "depth":
            return [
                None,
                obj.name,
                "np.isinf(df_row.name._depth)",
                "isinstance(df_row.name._depth, Real)",
            ]
        if obj.prop == "node_id":
            return [
                None,
                obj.name,
                "np.isinf(df_row.name._hatchet_nid)",
                "isinstance(df_row.name._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            'np.isinf(df_row["{}"])'.format(obj.prop),
            "isinstance(df_row['{}'], Real)".format(obj.prop),
        ]

    def _parse_num_inf_multi_idx(self, obj):
        """Multi-index variant of _parse_num_inf."""
        if obj.prop == "depth":
            return [
                None,
                obj.name,
                "np.isinf(df_row.index.get_level_values('node')[0]._depth)",
                "isinstance(df_row.index.get_level_values('node')[0]._depth, Real)",
            ]
        if obj.prop == "node_id":
            return [
                None,
                obj.name,
                "np.isinf(df_row.index.get_level_values('node')[0]._hatchet_nid)",
                "isinstance(df_row.index.get_level_values('node')[0]._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            self._add_aggregation_call_to_multi_idx_predicate(
                'np.isinf(df_row["{}"])'.format(obj.prop)
            ),
            "is_numeric_dtype(df_row['{}'])".format(obj.prop),
        ]

    def _parse_num_not_inf(self, obj):
        """Processes predicates that check for not-Infinity."""
        if obj.prop == "depth":
            return [
                None,
                obj.name,
                "not np.isinf(df_row.name._depth)",
                "isinstance(df_row.name._depth, Real)",
            ]
        if obj.prop == "node_id":
            return [
                None,
                obj.name,
                "not np.isinf(df_row.name._hatchet_nid)",
                "isinstance(df_row.name._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            'not np.isinf(df_row["{}"])'.format(obj.prop),
            "isinstance(df_row['{}'], Real)".format(obj.prop),
        ]

    def _parse_num_not_inf_multi_idx(self, obj):
        """Multi-index variant of _parse_num_not_inf."""
        if obj.prop == "depth":
            return [
                None,
                obj.name,
                "not np.isinf(df_row.index.get_level_values('node')[0]._depth)",
                "isinstance(df_row.index.get_level_values('node')[0]._depth, Real)",
            ]
        if obj.prop == "node_id":
            return [
                None,
                obj.name,
                "not np.isinf(df_row.index.get_level_values('node')[0]._hatchet_nid)",
                "isinstance(df_row.index.get_level_values('node')[0]._hatchet_nid, Real)",
            ]
        return [
            None,
            obj.name,
            self._add_aggregation_call_to_multi_idx_predicate(
                'not np.isinf(df_row["{}"])'.format(obj.prop)
            ),
            "is_numeric_dtype(df_row['{}'])".format(obj.prop),
        ]
def parse_string_dialect(query_str, multi_index_mode="off"):
    """Parse all types of String-based queries, including multi-queries
    that leverage the curly brace delimiters.

    Arguments:
        query_str (str): the String-based query to be parsed
        multi_index_mode (str): multi-index handling mode, forwarded to
            every StringQuery built from the parsed pieces

    Returns:
        (Query or CompoundQuery): A Hatchet query object representing the
            String-based query

    Raises:
        ValueError: on invalid curly brace groupings, mismatched counts of
            brace-delimited regions and compound operators, or an unknown
            compound operator
    """
    # TODO Check if there's a way to prevent curly braces in a string
    # from being captured

    # Find the number of curly brace-delimited regions in the query
    query_str = query_str.strip()
    curly_brace_elems = re.findall(r"\{(.*?)\}", query_str)
    num_curly_brace_elems = len(curly_brace_elems)
    # If there are no curly brace-delimited regions, just pass the query
    # off to the CypherQuery constructor
    if num_curly_brace_elems == 0:
        if sys.version_info[0] == 2:
            query_str = query_str.decode("utf-8")
        return StringQuery(query_str, multi_index_mode)
    # Create an iterator over the curly brace-delimited regions
    curly_brace_iter = re.finditer(r"\{(.*?)\}", query_str)
    # Will store curly brace-delimited regions in the WHERE clause
    condition_list = None
    # Will store curly brace-delimited regions that contain entire
    # mid-level queries (MATCH clause and WHERE clause)
    query_list = None
    # If entire queries are in brace-delimited regions, store the indexes
    # of the regions here so we don't consider brace-delimited regions
    # within the already-captured region.
    query_idxes = None
    # Store which compound queries to apply to the curly brace-delimited regions
    compound_ops = []
    for i, match in enumerate(curly_brace_iter):
        # Get the substring within curly braces
        substr = query_str[match.start() + 1 : match.end() - 1]
        substr = substr.strip()
        # If an entire query (MATCH + WHERE) is within curly braces,
        # add the query to "query_list", and add the indexes corresponding
        # to the query to "query_idxes"
        if substr.startswith("MATCH"):
            if query_list is None:
                query_list = []
            if query_idxes is None:
                query_idxes = []
            query_list.append(substr)
            query_idxes.append((match.start(), match.end()))
        # If the curly brace-delimited region contains only parts of a
        # WHERE clause, first, check if the region is within another
        # curly brace delimited region. If it is, do nothing (it will
        # be handled later). Otherwise, add the region to "condition_list"
        elif re.match(r"[a-zA-Z0-9_]+\..*", substr) is not None:
            is_encapsulated_region = False
            if query_idxes is not None:
                for s, e in query_idxes:
                    # BUGFIX: containment requires BOTH endpoints to fall
                    # inside the captured region. The original used "or",
                    # which flagged every region occurring after any
                    # captured full query as "nested" and silently
                    # dropped it.
                    if match.start() >= s and match.end() <= e:
                        is_encapsulated_region = True
                        break
            if is_encapsulated_region:
                continue
            if condition_list is None:
                condition_list = []
            condition_list.append(substr)
        # If the curly brace-delimited region is neither a whole query
        # or part of a WHERE clause, raise an error
        else:
            raise ValueError("Invalid grouping (with curly braces) within the query")
        # If there is a compound operator directly after the curly
        # brace-delimited region, capture the type of operator, and store
        # the type in "compound_ops"
        if i + 1 < num_curly_brace_elems:
            rest_substr = query_str[match.end() :]
            rest_substr = rest_substr.strip()
            if rest_substr.startswith("AND"):
                compound_ops.append("AND")
            elif rest_substr.startswith("OR"):
                compound_ops.append("OR")
            elif rest_substr.startswith("XOR"):
                compound_ops.append("XOR")
            else:
                raise ValueError("Invalid compound operator type found!")
    # Each call to this function should only consider one of the full query
    # or WHERE clause versions at a time. If both types were captured, raise
    # an error because some type of internal logic issue occured.
    if condition_list is not None and query_list is not None:
        raise ValueError(
            "Curly braces must be around either a full mid-level query or a set of conditions in a single mid-level query"
        )
    # This branch is for the WHERE clause version
    if condition_list is not None:
        # Make sure you correctly gathered curly brace-delimited regions and
        # compound operators
        if len(condition_list) != len(compound_ops) + 1:
            raise ValueError(
                "Incompatible number of curly brace elements and compound operators"
            )
        # Get the MATCH clause that will be shared across the subqueries
        match_comp_obj = re.search(r"MATCH\s+(?P<match_field>.*)\s+WHERE", query_str)
        match_comp = match_comp_obj.group("match_field")
        # Iterate over the compound operators
        full_query = None
        for i, op in enumerate(compound_ops):
            # If in the first iteration, set the initial query as a
            # CypherQuery where the MATCH clause is the shared match clause
            # and the WHERE clause is the first curly brace-delimited region
            if i == 0:
                query1 = "MATCH {} WHERE {}".format(match_comp, condition_list[i])
                if sys.version_info[0] == 2:
                    query1 = query1.decode("utf-8")
                full_query = StringQuery(query1, multi_index_mode)
            # Get the next query as a CypherQuery where the MATCH clause is
            # the shared match clause and the WHERE clause is the next curly
            # brace-delimited region
            next_query = "MATCH {} WHERE {}".format(match_comp, condition_list[i + 1])
            if sys.version_info[0] == 2:
                next_query = next_query.decode("utf-8")
            next_query = StringQuery(next_query, multi_index_mode)
            # Add the next query to the full query using the compound
            # operator currently being considered
            if op == "AND":
                full_query = full_query & next_query
            elif op == "OR":
                full_query = full_query | next_query
            else:
                full_query = full_query ^ next_query
        return full_query
    # This branch is for the full query version
    else:
        # Make sure you correctly gathered curly brace-delimited regions and
        # compound operators
        if len(query_list) != len(compound_ops) + 1:
            raise ValueError(
                "Incompatible number of curly brace elements and compound operators"
            )
        # Iterate over the compound operators
        full_query = None
        for i, op in enumerate(compound_ops):
            # If in the first iteration, set the initial query as the result
            # of recursively calling this function on the first curly
            # brace-delimited region.
            # BUGFIX: forward multi_index_mode through the recursion; the
            # original dropped it, silently resetting nested subqueries to
            # the default mode.
            if i == 0:
                full_query = parse_string_dialect(query_list[i], multi_index_mode)
            # Get the next query by recursively calling this function
            # on the next curly brace-delimited region
            next_query = parse_string_dialect(query_list[i + 1], multi_index_mode)
            # Add the next query to the full query using the compound
            # operator currently being considered
            if op == "AND":
                full_query = full_query & next_query
            elif op == "OR":
                full_query = full_query | next_query
            else:
                full_query = full_query ^ next_query
        return full_query