diff --git a/.vscode/settings.json b/.vscode/settings.json index efb8c58b5..00ebd4b97 100755 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,6 +1,6 @@ { "editor.formatOnPaste": false, - "editor.formatOnSave": true, + "editor.formatOnSave": false, "editor.rulers": [ 94 ], diff --git a/CHANGELOG.md b/CHANGELOG.md index 7ce40fd77..05bfcabd7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ ## Release notes ### 0.14.3 -- TBD +- Added - `dj.Top` restriction ([#1024](https://github.com/datajoint/datajoint-python/issues/1024)) PR [#1084](https://github.com/datajoint/datajoint-python/pull/1084) - Fixed - Added encapsulating double quotes to comply with [DOT language](https://graphviz.org/doc/info/lang.html) - PR [#1177](https://github.com/datajoint/datajoint-python/pull/1177) - Added - Ability to set hidden attributes on a table - PR [#1091](https://github.com/datajoint/datajoint-python/pull/1091) diff --git a/datajoint/__init__.py b/datajoint/__init__.py index 096def463..a8a86b706 100644 --- a/datajoint/__init__.py +++ b/datajoint/__init__.py @@ -37,6 +37,7 @@ "Part", "Not", "AndList", + "Top", "U", "Diagram", "Di", @@ -61,7 +62,7 @@ from .schemas import VirtualModule, list_schemas from .table import Table, FreeTable from .user_tables import Manual, Lookup, Imported, Computed, Part -from .expression import Not, AndList, U +from .expression import Not, AndList, U, Top from .diagram import Diagram from .admin import set_password, kill from .blob import MatCell, MatStruct diff --git a/datajoint/condition.py b/datajoint/condition.py index 80786c84c..de6372c6a 100644 --- a/datajoint/condition.py +++ b/datajoint/condition.py @@ -10,6 +10,8 @@ import pandas import json from .errors import DataJointError +from typing import Union, List +from dataclasses import dataclass JSON_PATTERN = re.compile( r"^(?P\w+)(\.(?P[\w.*\[\]]+))?(:(?P[\w(,\s)]+))?$" @@ -61,6 +63,35 @@ def append(self, restriction): super().append(restriction) +@dataclass +class Top: + """ + 
A restriction to the top entities of a query. + In SQL, this corresponds to ORDER BY ... LIMIT ... OFFSET + """ + + limit: Union[int, None] = 1 + order_by: Union[str, List[str]] = "KEY" + offset: int = 0 + + def __post_init__(self): + self.order_by = self.order_by or ["KEY"] + self.offset = self.offset or 0 + + if self.limit is not None and not isinstance(self.limit, int): + raise TypeError("Top limit must be an integer") + if not isinstance(self.order_by, (str, collections.abc.Sequence)) or not all( + isinstance(r, str) for r in self.order_by + ): + raise TypeError("Top order_by attributes must all be strings") + if not isinstance(self.offset, int): + raise TypeError("The offset argument must be an integer") + if self.offset and self.limit is None: + self.limit = 999999999999 # arbitrary large number to allow query + if isinstance(self.order_by, str): + self.order_by = [self.order_by] + + class Not: """invert restriction""" diff --git a/datajoint/declare.py b/datajoint/declare.py index 9a1706a06..1c02564a8 100644 --- a/datajoint/declare.py +++ b/datajoint/declare.py @@ -455,9 +455,11 @@ def format_attribute(attr): return f"`{attr}`" return f"({attr})" - match = re.match( - r"(?Punique\s+)?index\s*\(\s*(?P.*)\)", line, re.I - ).groupdict() + match = re.match(r"(?Punique\s+)?index\s*\(\s*(?P.*)\)", line, re.I) + if match is None: + raise DataJointError(f'Table definition syntax error in line "{line}"') + match = match.groupdict() + attr_list = re.findall(r"(?:[^,(]|\([^)]*\))+", match["args"]) index_sql.append( "{unique}index ({attrs})".format( diff --git a/datajoint/expression.py b/datajoint/expression.py index d3df0fa71..42be0d59c 100644 --- a/datajoint/expression.py +++ b/datajoint/expression.py @@ -9,6 +9,7 @@ from .preview import preview, repr_html from .condition import ( AndList, + Top, Not, make_condition, assert_join_compatibility, @@ -52,6 +53,7 @@ class QueryExpression: _connection = None _heading = None _support = None + _top = None # If the query will 
be using distinct _distinct = False @@ -121,17 +123,33 @@ def where_clause(self): else " WHERE (%s)" % ")AND(".join(str(s) for s in self.restriction) ) + def sorting_clauses(self): + if not self._top: + return "" + clause = ", ".join( + _wrap_attributes( + _flatten_attribute_list(self.primary_key, self._top.order_by) + ) + ) + if clause: + clause = f" ORDER BY {clause}" + if self._top.limit is not None: + clause += f" LIMIT {self._top.limit}{f' OFFSET {self._top.offset}' if self._top.offset else ''}" + + return clause + def make_sql(self, fields=None): """ Make the SQL SELECT statement. :param fields: used to explicitly set the select attributes """ - return "SELECT {distinct}{fields} FROM {from_}{where}".format( + return "SELECT {distinct}{fields} FROM {from_}{where}{sorting}".format( distinct="DISTINCT " if self._distinct else "", fields=self.heading.as_sql(fields or self.heading.names), from_=self.from_clause(), where=self.where_clause(), + sorting=self.sorting_clauses(), ) # --------- query operators ----------- @@ -189,6 +207,14 @@ def restrict(self, restriction): string, or an AndList. """ attributes = set() + if isinstance(restriction, Top): + result = ( + self.make_subquery() + if self._top and not self._top.__eq__(restriction) + else copy.copy(self) + ) # make subquery to avoid overwriting existing Top + result._top = restriction + return result new_condition = make_condition(self, restriction, attributes) if new_condition is True: return self # restriction has no effect, return the same object @@ -202,8 +228,10 @@ def restrict(self, restriction): pass # all ok # If the new condition uses any new attributes, a subquery is required. # However, Aggregation's HAVING statement works fine with aliased attributes. 
- need_subquery = isinstance(self, Union) or ( - not isinstance(self, Aggregation) and self.heading.new_attributes + need_subquery = ( + isinstance(self, Union) + or (not isinstance(self, Aggregation) and self.heading.new_attributes) + or self._top ) if need_subquery: result = self.make_subquery() @@ -539,19 +567,20 @@ def tail(self, limit=25, **fetch_kwargs): def __len__(self): """:return: number of elements in the result set e.g. ``len(q1)``.""" - return self.connection.query( + result = self.make_subquery() if self._top else copy.copy(self) + return result.connection.query( "SELECT {select_} FROM {from_}{where}".format( select_=( "count(*)" - if any(self._left) + if any(result._left) else "count(DISTINCT {fields})".format( - fields=self.heading.as_sql( - self.primary_key, include_aliases=False + fields=result.heading.as_sql( + result.primary_key, include_aliases=False ) ) ), - from_=self.from_clause(), - where=self.where_clause(), + from_=result.from_clause(), + where=result.where_clause(), ) ).fetchone()[0] @@ -619,18 +648,12 @@ def __next__(self): # -- move on to next entry. return next(self) - def cursor(self, offset=0, limit=None, order_by=None, as_dict=False): + def cursor(self, as_dict=False): """ See expression.fetch() for input description. 
:return: query cursor """ - if offset and limit is None: - raise DataJointError("limit is required when offset is set") sql = self.make_sql() - if order_by is not None: - sql += " ORDER BY " + ", ".join(order_by) - if limit is not None: - sql += " LIMIT %d" % limit + (" OFFSET %d" % offset if offset else "") logger.debug(sql) return self.connection.query(sql, as_dict=as_dict) @@ -701,23 +724,26 @@ def make_sql(self, fields=None): fields = self.heading.as_sql(fields or self.heading.names) assert self._grouping_attributes or not self.restriction distinct = set(self.heading.names) == set(self.primary_key) - return "SELECT {distinct}{fields} FROM {from_}{where}{group_by}".format( - distinct="DISTINCT " if distinct else "", - fields=fields, - from_=self.from_clause(), - where=self.where_clause(), - group_by=( - "" - if not self.primary_key - else ( - " GROUP BY `%s`" % "`,`".join(self._grouping_attributes) - + ( - "" - if not self.restriction - else " HAVING (%s)" % ")AND(".join(self.restriction) + return ( + "SELECT {distinct}{fields} FROM {from_}{where}{group_by}{sorting}".format( + distinct="DISTINCT " if distinct else "", + fields=fields, + from_=self.from_clause(), + where=self.where_clause(), + group_by=( + "" + if not self.primary_key + else ( + " GROUP BY `%s`" % "`,`".join(self._grouping_attributes) + + ( + "" + if not self.restriction + else " HAVING (%s)" % ")AND(".join(self.restriction) + ) ) - ) - ), + ), + sorting=self.sorting_clauses(), + ) ) def __len__(self): @@ -776,7 +802,7 @@ def make_sql(self): ): # no secondary attributes: use UNION DISTINCT fields = arg1.primary_key - return "SELECT * FROM (({sql1}) UNION ({sql2})) as `_u{alias}`".format( + return "SELECT * FROM (({sql1}) UNION ({sql2})) as `_u{alias}{sorting}`".format( sql1=( arg1.make_sql() if isinstance(arg1, Union) @@ -788,6 +814,7 @@ def make_sql(self): else arg2.make_sql(fields) ), alias=next(self.__count), + sorting=self.sorting_clauses(), ) # with secondary attributes, use union of left 
join with antijoin fields = self.heading.names @@ -939,3 +966,25 @@ def aggr(self, group, **named_attributes): ) aggregate = aggr # alias for aggr + + +def _flatten_attribute_list(primary_key, attrs): + """ + :param primary_key: list of attributes in primary key + :param attrs: list of attribute names, which may include "KEY", "KEY DESC" or "KEY ASC" + :return: generator of attributes where "KEY" is replaced with its component attributes + """ + for a in attrs: + if re.match(r"^\s*KEY(\s+[aA][Ss][Cc])?\s*$", a): + if primary_key: + yield from primary_key + elif re.match(r"^\s*KEY\s+[Dd][Ee][Ss][Cc]\s*$", a): + if primary_key: + yield from (q + " DESC" for q in primary_key) + else: + yield a + + +def _wrap_attributes(attr): + for entry in attr: # wrap attribute names in backquotes + yield re.sub(r"\b((?!asc|desc)\w+)\b", r"`\1`", entry, flags=re.IGNORECASE) diff --git a/datajoint/fetch.py b/datajoint/fetch.py index cd9f3be33..1fe154243 100644 --- a/datajoint/fetch.py +++ b/datajoint/fetch.py @@ -1,20 +1,18 @@ from functools import partial from pathlib import Path -import logging import pandas import itertools -import re import json import numpy as np import uuid import numbers + +from datajoint.condition import Top from . 
import blob, hash from .errors import DataJointError from .settings import config from .utils import safe_write -logger = logging.getLogger(__name__.split(".")[0]) - class key: """ @@ -119,21 +117,6 @@ def _get(connection, attr, data, squeeze, download_path): ) -def _flatten_attribute_list(primary_key, attrs): - """ - :param primary_key: list of attributes in primary key - :param attrs: list of attribute names, which may include "KEY", "KEY DESC" or "KEY ASC" - :return: generator of attributes where "KEY" is replaces with its component attributes - """ - for a in attrs: - if re.match(r"^\s*KEY(\s+[aA][Ss][Cc])?\s*$", a): - yield from primary_key - elif re.match(r"^\s*KEY\s+[Dd][Ee][Ss][Cc]\s*$", a): - yield from (q + " DESC" for q in primary_key) - else: - yield a - - class Fetch: """ A fetch object that handles retrieving elements from the table expression. @@ -174,13 +157,13 @@ def __call__( :param download_path: for fetches that download data, e.g. attachments :return: the contents of the table in the form of a structured numpy.array or a dict list """ - if order_by is not None: - # if 'order_by' passed in a string, make into list - if isinstance(order_by, str): - order_by = [order_by] - # expand "KEY" or "KEY DESC" - order_by = list( - _flatten_attribute_list(self._expression.primary_key, order_by) + if offset or order_by or limit: + self._expression = self._expression.restrict( + Top( + limit, + order_by, + offset, + ) ) attrs_as_dict = as_dict and attrs @@ -212,13 +195,6 @@ def __call__( 'use "array" or "frame"'.format(format) ) - if limit is None and offset is not None: - logger.warning( - "Offset set, but no limit. Setting limit to a large number. " - "Consider setting a limit explicitly." 
- ) - limit = 8000000000 # just a very large number to effect no limit - get = partial( _get, self._expression.connection, @@ -257,9 +233,7 @@ def __call__( ] ret = return_values[0] if len(attrs) == 1 else return_values else: # fetch all attributes as a numpy.record_array or pandas.DataFrame - cur = self._expression.cursor( - as_dict=as_dict, limit=limit, offset=offset, order_by=order_by - ) + cur = self._expression.cursor(as_dict=as_dict) heading = self._expression.heading if as_dict: ret = [ diff --git a/docs/src/query/operators.md b/docs/src/query/operators.md index 98db22380..39f2488dd 100644 --- a/docs/src/query/operators.md +++ b/docs/src/query/operators.md @@ -1,39 +1,44 @@ # Operators -[Data queries](principles.md) have the form of expressions using operators to derive +[Data queries](principles.md) have the form of expressions using operators to derive the desired table. The expressions themselves do not contain any data. They represent the desired data symbolically. -Once a query is formed, the [fetch](fetch.md) methods are used to bring the data into +Once a query is formed, the [fetch](fetch.md) methods are used to bring the data into the local workspace. -Since the expressions are only symbolic representations, repeated `fetch` calls may +Since the expressions are only symbolic representations, repeated `fetch` calls may yield different results as the state of the database is modified. 
 DataJoint implements a complete algebra of operators on tables:
 
-| operator | notation | meaning |
-| -- | -- | -- |
-| [restriction](restrict.md) | `A & cond` | The subset of entities from table `A` that meet condition `cond` |
-| [restriction](restrict.md) | `A - cond` | The subset of entities from table `A` that do not meet condition `cond` |
-| [join](join.md) | `A * B` | Combines all matching information from `A` and `B` |
-| [proj](project.md) | `A.proj(...)` | Selects and renames attributes from `A` or computes new attributes |
-| [aggr](aggregation.md) | `A.aggr(B, ...)` | Same as projection but allows computations based on matching information in `B` |
-| [union](union.md) | `A + B` | All unique entities from both `A` and `B` |
+| operator                     | notation       | meaning                                                                 |
+|------------------------------|----------------|-------------------------------------------------------------------------|
+| [join](#join)                | A * B          | All matching information from A and B                                   |
+| [restriction](#restriction)  | A & cond       | The subset of entities from A that meet the condition                   |
+| [restriction](#restriction)  | A - cond       | The subset of entities from A that do not meet the condition            |
+| [proj](#proj)                | A.proj(...)    | Selects and renames attributes from A or computes new attributes        |
+| [aggr](#aggr)                | A.aggr(B, ...) | Same as projection with computations based on matching information in B |
+| [union](#union)              | A + B          | All unique entities from both A and B                                   |
+| [universal set](#universal-set)\*| dj.U() | The set of all possible entities with the given attributes |
+| [top](#top)\*| dj.Top() | The top rows of A |
+
+\*While not technically query operators, it is useful to discuss Universal Set and Top in the
+same context.
## Principles of relational algebra -DataJoint's algebra improves upon the classical relational algebra and upon other query -languages to simplify and enhance the construction and interpretation of precise and +DataJoint's algebra improves upon the classical relational algebra and upon other query +languages to simplify and enhance the construction and interpretation of precise and efficient data queries. -1. **Entity integrity**: Data are represented and manipulated in the form of tables +1. **Entity integrity**: Data are represented and manipulated in the form of tables representing [well-formed entity sets](../design/integrity.md). This applies to the inputs and outputs of query operators. - The output of a query operator is an entity set with a well-defined entity type, a + The output of a query operator is an entity set with a well-defined entity type, a primary key, unique attribute names, etc. 2. **Algebraic closure**: All operators operate on entity sets and yield entity sets. - Thus query expressions may be used as operands in other expressions or may be + Thus query expressions may be used as operands in other expressions or may be assigned to variables to be used in other expressions. 3. **Attributes are identified by names**: All attributes have explicit names. This includes results of queries. @@ -42,27 +47,27 @@ representing [well-formed entity sets](../design/integrity.md). ## Matching entities -Binary operators in DataJoint are based on the concept of **matching entities**; this +Binary operators in DataJoint are based on the concept of **matching entities**; this phrase will be used throughout the documentation. - Two entities **match** when they have no common attributes or when their common + Two entities **match** when they have no common attributes or when their common attributes contain the same values. Here **common attributes** are those that have the same names in both entities. 
-It is usually assumed that the common attributes are of compatible datatypes to allow +It is usually assumed that the common attributes are of compatible datatypes to allow equality comparisons. Another way to phrase the same definition is Two entities match when they have no common attributes whose values differ. -It may be conceptually convenient to imagine that all tables always have an additional +It may be conceptually convenient to imagine that all tables always have an additional invisible attribute, `omega` whose domain comprises only one value, 1. Then the definition of matching entities is simplified: Two entities match when their common attributes contain the same values. -Matching entities can be **merged** into a single entity without any conflicts of +Matching entities can be **merged** into a single entity without any conflicts of attribute names and values. ### Examples @@ -81,16 +86,310 @@ but these entities do *not* match: ## Join compatibility -All binary operators with other tables as their two operands require that the operands +All binary operators with other tables as their two operands require that the operands be **join-compatible**, which means that: -1. All common attributes in both operands (attributes with the same name) must be part +1. All common attributes in both operands (attributes with the same name) must be part of either the primary key or a foreign key. -2. All common attributes in the two relations must be of a compatible datatype for +2. All common attributes in the two relations must be of a compatible datatype for equality comparisons. -These restrictions are introduced both for performance reasons and for conceptual -reasons. -For performance, they encourage queries that rely on indexes. -For conceptual reasons, they encourage database design in which entities in different -tables are related to each other by the use of primary keys and foreign keys. 
+## Restriction + +The restriction operator `A & cond` selects the subset of entities from `A` that meet +the condition `cond`. The exclusion operator `A - cond` selects the complement of +restriction, i.e. the subset of entities from `A` that do not meet the condition +`cond`. This means that the restriction and exclusion operators are complementary. +The same query could be constructed using either `A & cond` or `A - Not(cond)`. + +
+![Restriction and exclusion.](../../../images/concepts-operators-restriction.png){: style="height:200px"} +
+ +The condition `cond` may be one of the following: + +=== "Python" + + - another table + - a mapping, e.g. `dict` + - an expression in a character string + - a collection of conditions as a `list`, `tuple`, or Pandas `DataFrame` + - a Boolean expression (`True` or `False`) + - an `AndList` + - a `Not` object + - a query expression + +??? Warning "Permissive Operators" + + To circumvent compatibility checks, DataJoint offers permissive operators for + Restriction (`^`) and Join (`@`). Use with Caution. + +## Proj + +The `proj` operator represents **projection** and is used to select attributes +(columns) from a table, to rename them, or to create new calculated attributes. + +1. A simple projection *selects a subset of attributes* of the original +table, which may not include the [primary key](../concepts/glossary#primary-key). + +2. A more complex projection *renames an attribute* in another table. This could be +useful when one table should be referenced multiple times in another. A user table, +could contain all personnel. A project table references one person for the lead and +another the coordinator, both referencing the common personnel pool. + +3. Projection can also perform calculations (as available in +[MySQL](https://dev.mysql.com/doc/refman/5.7/en/functions.html)) on a single attribute. + +## Aggr + +**Aggregation** is a special form of `proj` with the added feature of allowing + aggregation calculations on another table. It has the form `table.aggr + (other, ...)` where `other` is another table. Aggregation allows adding calculated + attributes to each entity in `table` based on aggregation functions over attributes + in the matching entities of `other`. + +Aggregation functions include `count`, `sum`, `min`, `max`, `avg`, `std`, `variance`, +and others. + +## Union + +The result of the union operator `A + B` contains all the entities from both operands. 
+
+[Entity normalization](../design/normalization) requires that `A` and `B` are of the same type,
+with the same [primary key](../concepts/glossary#primary-key), using homologous
+attributes. Without secondary attributes, the result is the simple set union. With
+secondary attributes, they must have the same names and datatypes. The two operands
+must also be **disjoint**, without any duplicate primary key values across both inputs.
+These requirements prevent ambiguity of attribute values and preserve entity identity.
+
+??? Note "Principles of union"
+
+    1. As in all operators, the order of the attributes in the operands is not
+    significant.
+
+    2. Operands `A` and `B` must have the same primary key attributes. Otherwise, an
+    error will be raised.
+
+    3. Operands `A` and `B` may not have any common non-key attributes. Otherwise, an
+    error will be raised.
+
+    4. The result `A + B` will have the same primary key as `A` and `B`.
+
+    5. The result `A + B` will have all the non-key attributes from both `A` and `B`.
+
+    6. For entities that are found in both `A` and `B` (based on the primary key), the
+    secondary attributes will be filled from the corresponding entities in `A` and
+    `B`.
+
+    7. For entities that are only found in either `A` or `B`, the other operand's
+    secondary attributes will be filled with null values.
+
+For union, order does not matter.
+
+<figure markdown>
+![Union Example 1](../../../images/concepts-operators-union1.png){: style="height:200px"} +
+
+![Union Example 2](../../../images/concepts-operators-union2.png){: style="height:200px"} +
+ +??? Note "Properties of union" + + 1. Commutative: `A + B` is equivalent to `B + A`. + 2. Associative: `(A + B) + C` is equivalent to `A + (B + C)`. + +## Universal Set + +All of the above operators are designed to preserve their input type. Some queries may +require creating a new entity type not already represented by existing tables. This +means that the new type must be defined as part of the query. + +Universal sets fulfill this role using `dj.U` notation. They denote the set of all +possible entities with given attributes of any possible datatype. Attributes of +universal sets are allowed to be matched to any namesake attributes, even those that do +not come from the same initial source. + +Universal sets should be used sparingly when no suitable base tables already exist. In +some cases, defining a new base table can make queries clearer and more semantically +constrained. + +The examples below will use the table definitions in [table tiers](../reproduce/table-tiers). + + + +## Top + +Similar to the universal set operator, the top operator uses `dj.Top` notation. It is used to +restrict a query by the given `limit`, `order_by`, and `offset` parameters: + +```python +Session & dj.Top(limit=10, order_by='session_date') +``` + +The result of this expression returns the first 10 rows of `Session` and sorts them +by their `session_date` in ascending order. + +### `order_by` + +| Example | Description | +|-------------------------------------------|---------------------------------------------------------------------------------| +| `order_by="session_date DESC"` | Sort by `session_date` in *descending* order | +| `order_by="KEY"` | Sort by the primary key | +| `order_by="KEY DESC"` | Sort by the primary key in *descending* order | +| `order_by=["subject_id", "session_date"]` | Sort by `subject_id`, then sort matching `subject_id`s by their `session_date` | + +The default values for `dj.Top` parameters are `limit=1`, `order_by="KEY"`, and `offset=0`. 
+ +## Restriction + +`&` and `-` operators permit restriction. + +### By a mapping + +For a [Session table](../reproduce/table-tiers#manual-tables), that has the attribute +`session_date`, we can restrict to sessions from January 1st, 2022: + +```python +Session & {'session_date': "2022-01-01"} +``` + +If there were any typos (e.g., using `sess_date` instead of `session_date`), our query +will return all of the entities of `Session`. + +### By a string + +Conditions may include arithmetic operations, functions, range tests, etc. Restriction +of table `A` by a string containing an attribute not found in table `A` produces an +error. + +```python +Session & 'user = "Alice"' # (1) +Session & 'session_date >= "2022-01-01"' # (2) +``` + +1. All the sessions performed by Alice +2. All of the sessions on or after January 1st, 2022 + +### By a collection + +When `cond` is a collection of conditions, the conditions are applied by logical +disjunction (logical OR). Restricting a table by a collection will return all entities +that meet *any* of the conditions in the collection. + +For example, if we restrict the `Session` table by a collection containing two +conditions, one for user and one for date, the query will return any sessions with a +matching user *or* date. + +A collection can be a list, a tuple, or a Pandas `DataFrame`. + +``` python +cond_list = ['user = "Alice"', 'session_date = "2022-01-01"'] # (1) +cond_tuple = ('user = "Alice"', 'session_date = "2022-01-01"') # (2) +import pandas as pd +cond_frame = pd.DataFrame(data={'user': ['Alice'], 'session_date': ['2022-01-01']}) # (3) + +Session() & ['user = "Alice"', 'session_date = "2022-01-01"'] +``` + +1. A list +2. A tuple +3. A data frame + +`dj.AndList` represents logical conjunction(logical AND). Restricting a table by an +`AndList` will return all entities that meet *all* of the conditions in the list. `A & +dj.AndList([c1, c2, c3])` is equivalent to `A & c1 & c2 & c3`. 
+ +```python +Student() & dj.AndList(['user = "Alice"', 'session_date = "2022-01-01"']) +``` + +The above will show all the sessions that Alice conducted on the given day. + +### By a `Not` object + +The special function `dj.Not` represents logical negation, such that `A & dj.Not +(cond)` is equivalent to `A - cond`. + +### By a query + +Restriction by a query object is a generalization of restriction by a table. The example +below creates a query object corresponding to all the users named Alice. The `Session` +table is then restricted by the query object, returning all the sessions performed by +Alice. + +``` python +query = User & 'user = "Alice"' +Session & query +``` + +## Proj + +Renaming an attribute in python can be done via keyword arguments: + +```python +table.proj(new_attr='old_attr') +``` + +This can be done in the context of a table definition: + +```python +@schema +class Session(dj.Manual): + definition = """ + # Experiment Session + -> Animal + session : smallint # session number for the animal + --- + session_datetime : datetime # YYYY-MM-DD HH:MM:SS + session_start_time : float # seconds relative to session_datetime + session_end_time : float # seconds relative to session_datetime + -> User.proj(experimenter='username') + -> User.proj(supervisor='username') + """ +``` + +Or to rename multiple values in a table with the following syntax: +`Table.proj(*existing_attributes,*renamed_attributes)` + +```python +Session.proj('session','session_date',start='session_start_time',end='session_end_time') +``` + +Projection can also be used to to compute new attributes from existing ones. + +```python +Session.proj(duration='session_end_time-session_start_time') & 'duration > 10' +``` + +## Aggr + +For more complicated calculations, we can use aggregation. + +``` python +Subject.aggr(Session,n="count(*)") # (1) +Subject.aggr(Session,average_start="avg(session_start_time)") # (2) +``` + +1. Number of sessions per subject. +2. 
Average `session_start_time` for each subject + + + +## Universal set + +Universal sets offer the complete list of combinations of attributes. + +``` python +# All home cities of students +dj.U('laser_wavelength', 'laser_power') & Scan # (1) +dj.U('laser_wavelength', 'laser_power').aggr(Scan, n="count(*)") # (2) +dj.U().aggr(Session, n="max(session)") # (3) +``` + +1. All combinations of wavelength and power. +2. Total number of scans for each combination. +3. Largest session number. + +`dj.U()`, as shown in the last example above, is often useful for integer IDs. +For an example of this process, see the source code for +[Element Array Electrophysiology's `insert_new_params`](https://datajoint.com/docs/elements/element-array-ephys/latest/api/element_array_ephys/ephys_acute/#element_array_ephys.ephys_acute.ClusteringParamSet.insert_new_params). diff --git a/tests/conftest.py b/tests/conftest.py index 65d68268b..9ece6bb49 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -330,6 +330,8 @@ def schema_simp(connection_test, prefix): schema = dj.Schema( prefix + "_relational", schema_simple.LOCALS_SIMPLE, connection=connection_test ) + schema(schema_simple.SelectPK) + schema(schema_simple.KeyPK) schema(schema_simple.IJ) schema(schema_simple.JI) schema(schema_simple.A) diff --git a/tests/schema_simple.py b/tests/schema_simple.py index 9e3113c9a..f3e591382 100644 --- a/tests/schema_simple.py +++ b/tests/schema_simple.py @@ -13,6 +13,22 @@ import inspect +class SelectPK(dj.Lookup): + definition = """ # tests sql keyword escaping + id: int + select : int + """ + contents = list(dict(id=i, select=i * j) for i in range(3) for j in range(4, 0, -1)) + + +class KeyPK(dj.Lookup): + definition = """ # tests sql keyword escaping + id : int + key : int + """ + contents = list(dict(id=i, key=i + j) for i in range(3) for j in range(4, 0, -1)) + + class IJ(dj.Lookup): definition = """ # tests restrictions i : int diff --git a/tests/test_declare.py b/tests/test_declare.py index 
50845eace..f848d31c2 100644 --- a/tests/test_declare.py +++ b/tests/test_declare.py @@ -339,6 +339,17 @@ class WithSuchALongPartNameThatItCrashesMySQL(dj.Part): schema_any(WhyWouldAnyoneCreateATableNameThisLong) +def test_regex_mismatch(schema_any): + + class IndexAttribute(dj.Manual): + definition = """ + index: int + """ + + with pytest.raises(dj.DataJointError): + schema_any(IndexAttribute) + + def test_table_name_with_underscores(schema_any): """ Test issue #1150 -- Reject table names containing underscores. Tables should be in strict diff --git a/tests/test_fetch.py b/tests/test_fetch.py index 4f45ae9e9..7a3cf5a11 100644 --- a/tests/test_fetch.py +++ b/tests/test_fetch.py @@ -202,28 +202,6 @@ def test_offset(lang, languages): assert np.all([cc == ll for cc, ll in zip(c, l)]), "Sorting order is different" -def test_limit_warning(lang): - """Tests whether warning is raised if offset is used without limit.""" - logger = logging.getLogger("datajoint") - log_capture = io.StringIO() - stream_handler = logging.StreamHandler(log_capture) - log_format = logging.Formatter( - "[%(asctime)s][%(funcName)s][%(levelname)s]: %(message)s" - ) - stream_handler.setFormatter(log_format) - stream_handler.set_name("test_limit_warning") - logger.addHandler(stream_handler) - lang.fetch(offset=1) - - log_contents = log_capture.getvalue() - log_capture.close() - - for handler in logger.handlers: # Clean up handler - if handler.name == "test_limit_warning": - logger.removeHandler(handler) - assert "[WARNING]: Offset set, but no limit." 
in log_contents - - def test_len(lang): """Tests __len__""" assert len(lang.fetch()) == len(lang), "__len__ is not behaving properly" diff --git a/tests/test_relational_operand.py b/tests/test_relational_operand.py index 65c6a5d74..8ff8286e1 100644 --- a/tests/test_relational_operand.py +++ b/tests/test_relational_operand.py @@ -5,6 +5,7 @@ import datetime import numpy as np import datajoint as dj +from datajoint.errors import DataJointError from .schema_simple import * from .schema import * @@ -570,3 +571,96 @@ def test_union_multiple(schema_simp_pop): y = set(zip(*q2.fetch("i", "j"))) assert x == y assert q1.fetch(as_dict=True) == q2.fetch(as_dict=True) + + +class TestDjTop: + + def test_restrictions_by_top(self, schema_simp_pop): + a = L() & dj.Top() + b = L() & dj.Top(order_by=["cond_in_l", "KEY"]) + x = L() & dj.Top(5, "id_l desc", 4) & "cond_in_l=1" + y = L() & "cond_in_l=1" & dj.Top(5, "id_l desc", 4) + z = ( + L() + & dj.Top(None, order_by="id_l desc") + & "cond_in_l=1" + & dj.Top(5, "id_l desc") + & ("id_l=20", "id_l=16", "id_l=17") + & dj.Top(2, "id_l asc", 1) + ) + assert len(a) == 1 + assert len(b) == 1 + assert len(x) == 1 + assert len(y) == 5 + assert len(z) == 2 + assert a.fetch(as_dict=True) == [ + {"id_l": 0, "cond_in_l": 1}, + ] + assert b.fetch(as_dict=True) == [ + {"id_l": 3, "cond_in_l": 0}, + ] + assert x.fetch(as_dict=True) == [{"id_l": 25, "cond_in_l": 1}] + assert y.fetch(as_dict=True) == [ + {"id_l": 16, "cond_in_l": 1}, + {"id_l": 15, "cond_in_l": 1}, + {"id_l": 11, "cond_in_l": 1}, + {"id_l": 10, "cond_in_l": 1}, + {"id_l": 5, "cond_in_l": 1}, + ] + assert z.fetch(as_dict=True) == [ + {"id_l": 17, "cond_in_l": 1}, + {"id_l": 20, "cond_in_l": 1}, + ] + + def test_top_restriction_with_keywords(self, schema_simp_pop): + select = SelectPK() & dj.Top(limit=9, order_by=["select desc"]) + key = KeyPK() & dj.Top(limit=9, order_by="key desc") + assert select.fetch(as_dict=True) == [ + {"id": 2, "select": 8}, + {"id": 2, "select": 6}, + {"id": 1, 
"select": 4}, + {"id": 2, "select": 4}, + {"id": 1, "select": 3}, + {"id": 1, "select": 2}, + {"id": 2, "select": 2}, + {"id": 1, "select": 1}, + {"id": 0, "select": 0}, + ] + assert key.fetch(as_dict=True) == [ + {"id": 2, "key": 6}, + {"id": 2, "key": 5}, + {"id": 1, "key": 5}, + {"id": 0, "key": 4}, + {"id": 1, "key": 4}, + {"id": 2, "key": 4}, + {"id": 0, "key": 3}, + {"id": 1, "key": 3}, + {"id": 2, "key": 3}, + ] + + def test_top_errors(self, schema_simp_pop): + with pytest.raises(DataJointError) as err1: + L() & ("cond_in_l=1", dj.Top()) + with pytest.raises(DataJointError) as err2: + L() & dj.AndList(["cond_in_l=1", dj.Top()]) + with pytest.raises(TypeError) as err3: + L() & dj.Top(limit="1") + with pytest.raises(TypeError) as err4: + L() & dj.Top(order_by=1) + with pytest.raises(TypeError) as err5: + L() & dj.Top(offset="1") + assert ( + "datajoint.errors.DataJointError: Invalid restriction type Top(limit=1, order_by=['KEY'], offset=0)" + == str(err1.exconly()) + ) + assert ( + "datajoint.errors.DataJointError: Invalid restriction type Top(limit=1, order_by=['KEY'], offset=0)" + == str(err2.exconly()) + ) + assert "TypeError: Top limit must be an integer" == str(err3.exconly()) + assert "TypeError: Top order_by attributes must all be strings" == str( + err4.exconly() + ) + assert "TypeError: The offset argument must be an integer" == str( + err5.exconly() + ) diff --git a/tests/test_schema.py b/tests/test_schema.py index 6407cacab..257de221c 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -210,6 +210,8 @@ def test_list_tables(schema_simp): "#website", "profile", "profile__website", + "#select_p_k", + "#key_p_k", ] ) actual = set(schema_simp.list_tables()) diff --git a/tests_old/schema_simple.py b/tests_old/schema_simple.py index 751761797..d2f3ea376 100644 --- a/tests_old/schema_simple.py +++ b/tests_old/schema_simple.py @@ -15,6 +15,24 @@ schema = dj.Schema(PREFIX + "_relational", locals(), connection=dj.conn(**CONN_INFO)) +@schema +class 
SelectPK(dj.Lookup): + definition = """ # tests sql keyword escaping + id: int + select : int + """ + contents = list(dict(id=i, select=i * j) for i in range(3) for j in range(4, 0, -1)) + + +@schema +class KeyPK(dj.Lookup): + definition = """ # tests sql keyword escaping + id : int + key : int + """ + contents = list(dict(id=i, key=i + j) for i in range(3) for j in range(4, 0, -1)) + + @schema class IJ(dj.Lookup): definition = """ # tests restrictions diff --git a/tests_old/test_fetch.py b/tests_old/test_fetch.py index 1c415cb27..7506551c8 100644 --- a/tests_old/test_fetch.py +++ b/tests_old/test_fetch.py @@ -213,26 +213,6 @@ def test_offset(self): np.all([cc == ll for cc, ll in zip(c, l)]), "Sorting order is different" ) - def test_limit_warning(self): - """Tests whether warning is raised if offset is used without limit.""" - log_capture = io.StringIO() - stream_handler = logging.StreamHandler(log_capture) - log_format = logging.Formatter( - "[%(asctime)s][%(funcName)s][%(levelname)s]: %(message)s" - ) - stream_handler.setFormatter(log_format) - stream_handler.set_name("test_limit_warning") - logger.addHandler(stream_handler) - self.lang.fetch(offset=1) - - log_contents = log_capture.getvalue() - log_capture.close() - - for handler in logger.handlers: # Clean up handler - if handler.name == "test_limit_warning": - logger.removeHandler(handler) - assert "[WARNING]: Offset set, but no limit." 
in log_contents - def test_len(self): """Tests __len__""" assert_equal( diff --git a/tests_old/test_relational_operand.py b/tests_old/test_relational_operand.py index 0611ab267..3ba6291da 100644 --- a/tests_old/test_relational_operand.py +++ b/tests_old/test_relational_operand.py @@ -11,9 +11,11 @@ raises, assert_set_equal, assert_list_equal, + assert_raises, ) import datajoint as dj +from datajoint.errors import DataJointError from .schema_simple import ( A, B, @@ -23,6 +25,8 @@ L, DataA, DataB, + SelectPK, + KeyPK, TTestUpdate, IJ, JI, @@ -487,6 +491,95 @@ def test_restrictions_by_lists(): ) assert_true(len(w - y) == 0, "incorrect restriction without common attributes") + @staticmethod + def test_restrictions_by_top(): + a = L() & dj.Top() + b = L() & dj.Top(order_by=["cond_in_l", "KEY"]) + x = L() & dj.Top(5, "id_l desc", 4) & "cond_in_l=1" + y = L() & "cond_in_l=1" & dj.Top(5, "id_l desc", 4) + z = ( + L() + & dj.Top(None, order_by="id_l desc") + & "cond_in_l=1" + & dj.Top(5, "id_l desc") + & ("id_l=20", "id_l=16", "id_l=17") + & dj.Top(2, "id_l asc", 1) + ) + assert len(a) == 1 + assert len(b) == 1 + assert len(x) == 1 + assert len(y) == 5 + assert len(z) == 2 + assert a.fetch(as_dict=True) == [ + {"id_l": 0, "cond_in_l": 1}, + ] + assert b.fetch(as_dict=True) == [ + {"id_l": 3, "cond_in_l": 0}, + ] + assert x.fetch(as_dict=True) == [{"id_l": 25, "cond_in_l": 1}] + assert y.fetch(as_dict=True) == [ + {"id_l": 16, "cond_in_l": 1}, + {"id_l": 15, "cond_in_l": 1}, + {"id_l": 11, "cond_in_l": 1}, + {"id_l": 10, "cond_in_l": 1}, + {"id_l": 5, "cond_in_l": 1}, + ] + assert z.fetch(as_dict=True) == [ + {"id_l": 17, "cond_in_l": 1}, + {"id_l": 20, "cond_in_l": 1}, + ] + + @staticmethod + def test_top_restriction_with_keywords(): + select = SelectPK() & dj.Top(limit=9, order_by=["select desc"]) + key = KeyPK() & dj.Top(limit=9, order_by="key desc") + assert select.fetch(as_dict=True) == [ + {"id": 2, "select": 8}, + {"id": 2, "select": 6}, + {"id": 1, "select": 4}, + 
{"id": 2, "select": 4}, + {"id": 1, "select": 3}, + {"id": 1, "select": 2}, + {"id": 2, "select": 2}, + {"id": 1, "select": 1}, + {"id": 0, "select": 0}, + ] + assert key.fetch(as_dict=True) == [ + {"id": 2, "key": 6}, + {"id": 2, "key": 5}, + {"id": 1, "key": 5}, + {"id": 0, "key": 4}, + {"id": 1, "key": 4}, + {"id": 2, "key": 4}, + {"id": 0, "key": 3}, + {"id": 1, "key": 3}, + {"id": 2, "key": 3}, + ] + + @staticmethod + def test_top_errors(): + with assert_raises(DataJointError) as err1: + L() & ("cond_in_l=1", dj.Top()) + with assert_raises(DataJointError) as err2: + L() & dj.AndList(["cond_in_l=1", dj.Top()]) + with assert_raises(TypeError) as err3: + L() & dj.Top(limit="1") + with assert_raises(TypeError) as err4: + L() & dj.Top(order_by=1) + with assert_raises(TypeError) as err5: + L() & dj.Top(offset="1") + assert ( + "Invalid restriction type Top(limit=1, order_by=['KEY'], offset=0)" + == str(err1.exception) + ) + assert ( + "Invalid restriction type Top(limit=1, order_by=['KEY'], offset=0)" + == str(err2.exception) + ) + assert "Top limit must be an integer" == str(err3.exception) + assert "Top order_by attributes must all be strings" == str(err4.exception) + assert "The offset argument must be an integer" == str(err5.exception) + @staticmethod def test_datetime(): """Test date retrieval""" diff --git a/tests_old/test_schema.py b/tests_old/test_schema.py index 8ec24fc49..f7a18198e 100644 --- a/tests_old/test_schema.py +++ b/tests_old/test_schema.py @@ -155,6 +155,8 @@ def test_list_tables(): "#website", "profile", "profile__website", + "#select_p_k", + "#key_p_k", ] ) == set(schema_simple.list_tables())