Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
180 changes: 180 additions & 0 deletions python/datafusion/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2055,6 +2055,15 @@ def corr(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr:
value_y: The dependent variable for correlation
value_x: The independent variable for correlation
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "b": [1.0, 2.0, 3.0]})
>>> result = df.aggregate(
... [], [dfn.functions.corr(dfn.col("a"), dfn.col("b")).alias("v")])
>>> result.collect_column("v")[0].as_py()
1.0
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.corr(value_y.expr, value_x.expr, filter=filter_raw))
Expand Down Expand Up @@ -2101,6 +2110,22 @@ def covar_pop(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr:
value_y: The dependent variable for covariance
value_x: The independent variable for covariance
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> import builtins
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})
>>> result = df.aggregate(
... [],
... [dfn.functions.covar_pop(
... dfn.col("a"), dfn.col("b")
... ).alias("v")]
... )
>>> builtins.round(
... result.collect_column("v")[0].as_py(), 4
... )
0.6667
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.covar_pop(value_y.expr, value_x.expr, filter=filter_raw))
Expand All @@ -2118,6 +2143,15 @@ def covar_samp(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr
value_y: The dependent variable for covariance
value_x: The independent variable for covariance
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})
>>> result = df.aggregate(
... [], [dfn.functions.covar_samp(dfn.col("a"), dfn.col("b")).alias("v")])
>>> result.collect_column("v")[0].as_py()
1.0
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.covar_samp(value_y.expr, value_x.expr, filter=filter_raw))
Expand All @@ -2127,6 +2161,15 @@ def covar(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr:
"""Computes the sample covariance.

This is an alias for :py:func:`covar_samp`.

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})
>>> result = df.aggregate(
... [], [dfn.functions.covar(dfn.col("a"), dfn.col("b")).alias("v")])
>>> result.collect_column("v")[0].as_py()
1.0
"""
return covar_samp(value_y, value_x, filter)

Expand Down Expand Up @@ -2215,6 +2258,14 @@ def stddev(expression: Expr, filter: Expr | None = None) -> Expr:
Args:
expression: The value to compute the sample standard deviation of
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [2.0, 4.0, 6.0]})
>>> result = df.aggregate([], [dfn.functions.stddev(dfn.col("a")).alias("v")])
>>> result.collect_column("v")[0].as_py()
2.0
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.stddev(expression.expr, filter=filter_raw))
Expand All @@ -2229,6 +2280,14 @@ def stddev_pop(expression: Expr, filter: Expr | None = None) -> Expr:
Args:
expression: The value to compute the population standard deviation of
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1.0, 3.0]})
>>> result = df.aggregate([], [dfn.functions.stddev_pop(dfn.col("a")).alias("v")])
>>> result.collect_column("v")[0].as_py()
1.0
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.stddev_pop(expression.expr, filter=filter_raw))
Expand All @@ -2238,6 +2297,14 @@ def stddev_samp(arg: Expr, filter: Expr | None = None) -> Expr:
"""Computes the sample standard deviation of the argument.

This is an alias for :py:func:`stddev`.

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [2.0, 4.0, 6.0]})
>>> result = df.aggregate([], [dfn.functions.stddev_samp(dfn.col("a")).alias("v")])
>>> result.collect_column("v")[0].as_py()
2.0
"""
return stddev(arg, filter=filter)

Expand All @@ -2246,6 +2313,14 @@ def var(expression: Expr, filter: Expr | None = None) -> Expr:
"""Computes the sample variance of the argument.

This is an alias for :py:func:`var_samp`.

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
>>> result = df.aggregate([], [dfn.functions.var(dfn.col("a")).alias("v")])
>>> result.collect_column("v")[0].as_py()
1.0
"""
return var_samp(expression, filter)

Expand All @@ -2259,6 +2334,14 @@ def var_pop(expression: Expr, filter: Expr | None = None) -> Expr:
Args:
expression: The variable to compute the variance for
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [0.0, 2.0]})
>>> result = df.aggregate([], [dfn.functions.var_pop(dfn.col("a")).alias("v")])
>>> result.collect_column("v")[0].as_py()
1.0
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.var_pop(expression.expr, filter=filter_raw))
Expand All @@ -2273,6 +2356,14 @@ def var_samp(expression: Expr, filter: Expr | None = None) -> Expr:
Args:
expression: The variable to compute the variance for
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
>>> result = df.aggregate([], [dfn.functions.var_samp(dfn.col("a")).alias("v")])
>>> result.collect_column("v")[0].as_py()
1.0
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.var_sample(expression.expr, filter=filter_raw))
Expand All @@ -2282,6 +2373,14 @@ def var_sample(expression: Expr, filter: Expr | None = None) -> Expr:
"""Computes the sample variance of the argument.

This is an alias for :py:func:`var_samp`.

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
>>> result = df.aggregate([], [dfn.functions.var_sample(dfn.col("a")).alias("v")])
>>> result.collect_column("v")[0].as_py()
1.0
"""
return var_samp(expression, filter)

Expand All @@ -2303,6 +2402,15 @@ def regr_avgx(
y: The linear regression dependent variable
x: The linear regression independent variable
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [4.0, 5.0, 6.0]})
>>> result = df.aggregate(
... [], [dfn.functions.regr_avgx(dfn.col("y"), dfn.col("x")).alias("v")])
>>> result.collect_column("v")[0].as_py()
5.0
"""
filter_raw = filter.expr if filter is not None else None

Expand All @@ -2326,6 +2434,15 @@ def regr_avgy(
y: The linear regression dependent variable
x: The linear regression independent variable
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [4.0, 5.0, 6.0]})
>>> result = df.aggregate(
... [], [dfn.functions.regr_avgy(dfn.col("y"), dfn.col("x")).alias("v")])
>>> result.collect_column("v")[0].as_py()
2.0
"""
filter_raw = filter.expr if filter is not None else None

Expand All @@ -2349,6 +2466,15 @@ def regr_count(
y: The linear regression dependent variable
x: The linear regression independent variable
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [4.0, 5.0, 6.0]})
>>> result = df.aggregate(
... [], [dfn.functions.regr_count(dfn.col("y"), dfn.col("x")).alias("v")])
>>> result.collect_column("v")[0].as_py()
3
"""
filter_raw = filter.expr if filter is not None else None

Expand All @@ -2372,6 +2498,15 @@ def regr_intercept(
y: The linear regression dependent variable
x: The linear regression independent variable
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"y": [2.0, 4.0, 6.0], "x": [1.0, 2.0, 3.0]})
>>> result = df.aggregate(
... [], [dfn.functions.regr_intercept(dfn.col("y"), dfn.col("x")).alias("v")])
>>> result.collect_column("v")[0].as_py()
0.0
"""
filter_raw = filter.expr if filter is not None else None

Expand All @@ -2395,6 +2530,15 @@ def regr_r2(
y: The linear regression dependent variable
x: The linear regression independent variable
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"y": [2.0, 4.0, 6.0], "x": [1.0, 2.0, 3.0]})
>>> result = df.aggregate(
... [], [dfn.functions.regr_r2(dfn.col("y"), dfn.col("x")).alias("v")])
>>> result.collect_column("v")[0].as_py()
1.0
"""
filter_raw = filter.expr if filter is not None else None

Expand All @@ -2418,6 +2562,15 @@ def regr_slope(
y: The linear regression dependent variable
x: The linear regression independent variable
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"y": [2.0, 4.0, 6.0], "x": [1.0, 2.0, 3.0]})
>>> result = df.aggregate(
... [], [dfn.functions.regr_slope(dfn.col("y"), dfn.col("x")).alias("v")])
>>> result.collect_column("v")[0].as_py()
2.0
"""
filter_raw = filter.expr if filter is not None else None

Expand All @@ -2441,6 +2594,15 @@ def regr_sxx(
y: The linear regression dependent variable
x: The linear regression independent variable
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [1.0, 2.0, 3.0]})
>>> result = df.aggregate(
... [], [dfn.functions.regr_sxx(dfn.col("y"), dfn.col("x")).alias("v")])
>>> result.collect_column("v")[0].as_py()
2.0
"""
filter_raw = filter.expr if filter is not None else None

Expand All @@ -2464,6 +2626,15 @@ def regr_sxy(
y: The linear regression dependent variable
x: The linear regression independent variable
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [1.0, 2.0, 3.0]})
>>> result = df.aggregate(
... [], [dfn.functions.regr_sxy(dfn.col("y"), dfn.col("x")).alias("v")])
>>> result.collect_column("v")[0].as_py()
2.0
"""
filter_raw = filter.expr if filter is not None else None

Expand All @@ -2487,6 +2658,15 @@ def regr_syy(
y: The linear regression dependent variable
x: The linear regression independent variable
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [1.0, 2.0, 3.0]})
>>> result = df.aggregate(
... [], [dfn.functions.regr_syy(dfn.col("y"), dfn.col("x")).alias("v")])
>>> result.collect_column("v")[0].as_py()
2.0
"""
filter_raw = filter.expr if filter is not None else None

Expand Down
Loading