Groupby Operands#
Sometimes, you’ll want to look at the data in your MedRecord in groups, assessing the properties and attributes of each one of them individually. For that, the group_by()
method was devised.
1. Grouping Nodes#
You can group nodes based on a specific criterion. For example, you can group all patients by their gender
and then inspect the age
of the patients within each group. This is done by passing a NodeOperandGroupDiscriminator
to the group_by()
method.
In the snippet below, we group the nodes by the gender
attribute and then retrieve the age
for the patients in each of these groups.
def query_node_group_by_gender(
    node: NodeOperand,
) -> NodeMultipleValuesWithIndexGroupOperand:
    """Group the nodes by ``gender`` and select the ``age`` values of each group."""
    by_gender = NodeOperandGroupDiscriminator.Attribute("gender")
    return node.group_by(by_gender).attribute("age")


medrecord.query_nodes(query_node_group_by_gender)
[('F', {'pat_2': 22, 'pat_3': 96}), ('M', {'pat_1': 42, 'pat_4': 19, 'pat_5': 37})]
Methods used in the snippet
group_by(): Groups the nodes based on the given discriminator, returning a NodeGroupOperand.
attribute(): Returns a NodeMultipleValuesWithIndexGroupOperand to query on the values of the nodes per group for that attribute.
query_nodes(): Retrieves information on the nodes from the MedRecord given the query.
Furthermore, you can perform calculations on these newly formed groups. For instance, you can calculate the mean
age for each gender group.
def query_node_group_by_gender_mean(
    node: NodeOperand,
) -> NodeSingleValueWithoutIndexGroupOperand:
    """Compute the mean ``age`` of the nodes within each ``gender`` group."""
    by_gender = NodeOperandGroupDiscriminator.Attribute("gender")
    ages_per_gender = node.group_by(by_gender).attribute("age")
    return ages_per_gender.mean()


medrecord.query_nodes(query_node_group_by_gender_mean)
[('F', 59.0), ('M', 32.666666666666664)]
Methods used in the snippet
group_by(): Groups the nodes based on the given discriminator, returning a NodeGroupOperand.
attribute(): Returns a NodeMultipleValuesWithIndexGroupOperand to query on the values of the nodes per group for that attribute.
mean(): Calculates the mean of the values within each group.
query_nodes(): Retrieves information on the nodes from the MedRecord given the query.
2. Grouping Edges#
Similarly to nodes, you can also group edges. A common use case is grouping edges by their SourceNode or TargetNode (they can also be grouped by an Attribute). In the following example, we first restrict the query to the edges whose index is less than 20, then group those edges by their source node and retrieve their time attribute.
def query_edge_group_by_source_node(
    edge: EdgeOperand,
) -> EdgeMultipleValuesWithIndexGroupOperand:
    """Select the ``time`` values of the first edges, grouped by source node.

    Only edges whose index is less than 20 are considered.
    """
    # Apply the index filter to the query before the edges are grouped.
    edge_indices = edge.index()
    edge_indices.less_than(20)

    by_source = EdgeOperandGroupDiscriminator.SourceNode()
    return edge.group_by(by_source).attribute("time")


medrecord.query_edges(query_edge_group_by_source_node)
[('pat_2', {12: datetime.datetime(2015, 1, 13, 0, 0), 17: datetime.datetime(2019, 4, 2, 0, 0), 19: datetime.datetime(2020, 2, 11, 0, 0), 13: datetime.datetime(2015, 12, 10, 0, 0), 16: datetime.datetime(2019, 3, 23, 0, 0), 14: datetime.datetime(2018, 12, 10, 0, 0), 15: datetime.datetime(2019, 2, 5, 0, 0), 18: datetime.datetime(2019, 11, 19, 0, 0)}), ('pat_1', {3: datetime.datetime(2014, 10, 18, 0, 0), 9: datetime.datetime(2020, 5, 12, 0, 0), 1: datetime.datetime(2014, 4, 8, 0, 0), 8: datetime.datetime(2020, 5, 12, 0, 0), 11: datetime.datetime(2022, 5, 26, 0, 0), 5: datetime.datetime(2017, 4, 25, 0, 0), 7: datetime.datetime(2020, 5, 12, 0, 0), 0: datetime.datetime(2014, 2, 6, 0, 0), 10: datetime.datetime(2020, 5, 12, 0, 0), 6: datetime.datetime(2019, 5, 7, 0, 0), 2: datetime.datetime(2014, 4, 8, 0, 0), 4: datetime.datetime(2015, 4, 14, 0, 0)})]
Methods used in the snippet
index(): Returns an EdgeIndicesOperand representing the indices of the edges queried.
less_than(): Query edge indices that are less than the specified value.
group_by(): Groups the edges based on the given discriminator.
attribute(): Returns an EdgeMultipleValuesWithIndexGroupOperand to query on the values of the edges per group for that attribute.
query_edges(): Retrieves information on the edges from the MedRecord given the query.
You can also perform aggregations on edge groups, such as counting how many edges are associated with each source node.
def query_edge_group_by_count_edges(edge: EdgeOperand) -> EdgeIndexGroupOperand:
    """Count the edges associated with each source node."""
    by_source = EdgeOperandGroupDiscriminator.SourceNode()
    indices_per_source = edge.group_by(by_source).index()
    return indices_per_source.count()


medrecord.query_edges(query_edge_group_by_count_edges)
[('pat_5', 53), ('pat_4', 15), ('pat_2', 34), ('pat_3', 34), ('pat_1', 24)]
Methods used in the snippet
group_by(): Groups the edges based on the given discriminator.
index(): Returns an EdgeIndicesGroupOperand representing the indices of the edges queried within each group.
count(): Counts the number of edges within each group.
query_edges(): Retrieves information on the edges from the MedRecord given the query.
3. Full example Code#
The full code examples for this chapter can be found here:
from medmodels import MedRecord
from medmodels.medrecord.querying import (
EdgeIndexGroupOperand,
EdgeMultipleValuesWithIndexGroupOperand,
EdgeOperand,
EdgeOperandGroupDiscriminator,
NodeMultipleValuesWithIndexGroupOperand,
NodeOperand,
NodeOperandGroupDiscriminator,
NodeSingleValueWithoutIndexGroupOperand,
)
# Load the simple example dataset used by every query below. The alternate
# constructor is called on the class itself, so no throwaway empty MedRecord
# is instantiated first.
medrecord = MedRecord.from_simple_example_dataset()
def query_node_group_by_gender(
    node: NodeOperand,
) -> NodeMultipleValuesWithIndexGroupOperand:
    """Group the nodes by ``gender`` and select the ``age`` values of each group."""
    by_gender = NodeOperandGroupDiscriminator.Attribute("gender")
    return node.group_by(by_gender).attribute("age")


medrecord.query_nodes(query_node_group_by_gender)
def query_node_group_by_gender_mean(
    node: NodeOperand,
) -> NodeSingleValueWithoutIndexGroupOperand:
    """Compute the mean ``age`` of the nodes within each ``gender`` group."""
    by_gender = NodeOperandGroupDiscriminator.Attribute("gender")
    ages_per_gender = node.group_by(by_gender).attribute("age")
    return ages_per_gender.mean()


medrecord.query_nodes(query_node_group_by_gender_mean)
def query_edge_group_by_source_node(
    edge: EdgeOperand,
) -> EdgeMultipleValuesWithIndexGroupOperand:
    """Select the ``time`` values of the first edges, grouped by source node.

    Only edges whose index is less than 20 are considered.
    """
    # Apply the index filter to the query before the edges are grouped.
    edge_indices = edge.index()
    edge_indices.less_than(20)

    by_source = EdgeOperandGroupDiscriminator.SourceNode()
    return edge.group_by(by_source).attribute("time")


medrecord.query_edges(query_edge_group_by_source_node)
def query_edge_group_by_count_edges(edge: EdgeOperand) -> EdgeIndexGroupOperand:
    """Count the edges associated with each source node."""
    by_source = EdgeOperandGroupDiscriminator.SourceNode()
    indices_per_source = edge.group_by(by_source).index()
    return indices_per_source.count()


medrecord.query_edges(query_edge_group_by_count_edges)