diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index fd116254b..ccfcd33b6 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -2055,6 +2055,15 @@ def corr(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr: value_y: The dependent variable for correlation value_x: The independent variable for correlation filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "b": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.corr(dfn.col("a"), dfn.col("b")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 1.0 """ filter_raw = filter.expr if filter is not None else None return Expr(f.corr(value_y.expr, value_x.expr, filter=filter_raw)) @@ -2101,6 +2110,22 @@ def covar_pop(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr: value_y: The dependent variable for covariance value_x: The independent variable for covariance filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> import builtins + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]}) + >>> result = df.aggregate( + ... [], + ... [dfn.functions.covar_pop( + ... dfn.col("a"), dfn.col("b") + ... ).alias("v")] + ... ) + >>> builtins.round( + ... result.collect_column("v")[0].as_py(), 4 + ... 
) + 0.6667 """ filter_raw = filter.expr if filter is not None else None return Expr(f.covar_pop(value_y.expr, value_x.expr, filter=filter_raw)) @@ -2118,6 +2143,15 @@ def covar_samp(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr value_y: The dependent variable for covariance value_x: The independent variable for covariance filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.covar_samp(dfn.col("a"), dfn.col("b")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 1.0 """ filter_raw = filter.expr if filter is not None else None return Expr(f.covar_samp(value_y.expr, value_x.expr, filter=filter_raw)) @@ -2127,6 +2161,15 @@ def covar(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr: """Computes the sample covariance. This is an alias for :py:func:`covar_samp`. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]}) + >>> result = df.aggregate( + ... 
[], [dfn.functions.covar(dfn.col("a"), dfn.col("b")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 1.0 """ return covar_samp(value_y, value_x, filter) @@ -2215,6 +2258,14 @@ def stddev(expression: Expr, filter: Expr | None = None) -> Expr: Args: expression: The value to compute the standard deviation of filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [2.0, 4.0, 6.0]}) + >>> result = df.aggregate([], [dfn.functions.stddev(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 2.0 """ filter_raw = filter.expr if filter is not None else None return Expr(f.stddev(expression.expr, filter=filter_raw)) @@ -2229,6 +2280,14 @@ def stddev_pop(expression: Expr, filter: Expr | None = None) -> Expr: Args: expression: The value to compute the population standard deviation of filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 3.0]}) + >>> result = df.aggregate([], [dfn.functions.stddev_pop(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 1.0 """ filter_raw = filter.expr if filter is not None else None return Expr(f.stddev_pop(expression.expr, filter=filter_raw)) @@ -2238,6 +2297,14 @@ def stddev_samp(arg: Expr, filter: Expr | None = None) -> Expr: """Computes the sample standard deviation of the argument. This is an alias for :py:func:`stddev`. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [2.0, 4.0, 6.0]}) + >>> result = df.aggregate([], [dfn.functions.stddev_samp(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 2.0 """ return stddev(arg, filter=filter) @@ -2246,6 +2313,14 @@ def var(expression: Expr, filter: Expr | None = None) -> Expr: """Computes the sample variance of the argument. This is an alias for :py:func:`var_samp`. 
+ + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate([], [dfn.functions.var(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 1.0 """ return var_samp(expression, filter) @@ -2259,6 +2334,14 @@ def var_pop(expression: Expr, filter: Expr | None = None) -> Expr: Args: expression: The variable to compute the variance for filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0.0, 2.0]}) + >>> result = df.aggregate([], [dfn.functions.var_pop(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 1.0 """ filter_raw = filter.expr if filter is not None else None return Expr(f.var_pop(expression.expr, filter=filter_raw)) @@ -2273,6 +2356,14 @@ def var_samp(expression: Expr, filter: Expr | None = None) -> Expr: Args: expression: The variable to compute the variance for filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate([], [dfn.functions.var_samp(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 1.0 """ filter_raw = filter.expr if filter is not None else None return Expr(f.var_sample(expression.expr, filter=filter_raw)) @@ -2282,6 +2373,14 @@ def var_sample(expression: Expr, filter: Expr | None = None) -> Expr: """Computes the sample variance of the argument. This is an alias for :py:func:`var_samp`. 
+ + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate([], [dfn.functions.var_sample(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 1.0 """ return var_samp(expression, filter) @@ -2303,6 +2402,15 @@ def regr_avgx( y: The linear regression dependent variable x: The linear regression independent variable filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [4.0, 5.0, 6.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.regr_avgx(dfn.col("y"), dfn.col("x")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 5.0 """ filter_raw = filter.expr if filter is not None else None @@ -2326,6 +2434,15 @@ def regr_avgy( y: The linear regression dependent variable x: The linear regression independent variable filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [4.0, 5.0, 6.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.regr_avgy(dfn.col("y"), dfn.col("x")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 2.0 """ filter_raw = filter.expr if filter is not None else None @@ -2349,6 +2466,15 @@ def regr_count( y: The linear regression dependent variable x: The linear regression independent variable filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [4.0, 5.0, 6.0]}) + >>> result = df.aggregate( + ... 
[], [dfn.functions.regr_count(dfn.col("y"), dfn.col("x")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 3 """ filter_raw = filter.expr if filter is not None else None @@ -2372,6 +2498,15 @@ def regr_intercept( y: The linear regression dependent variable x: The linear regression independent variable filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"y": [2.0, 4.0, 6.0], "x": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.regr_intercept(dfn.col("y"), dfn.col("x")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 0.0 """ filter_raw = filter.expr if filter is not None else None @@ -2395,6 +2530,15 @@ def regr_r2( y: The linear regression dependent variable x: The linear regression independent variable filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"y": [2.0, 4.0, 6.0], "x": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.regr_r2(dfn.col("y"), dfn.col("x")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 1.0 """ filter_raw = filter.expr if filter is not None else None @@ -2418,6 +2562,15 @@ def regr_slope( y: The linear regression dependent variable x: The linear regression independent variable filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"y": [2.0, 4.0, 6.0], "x": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate( + ... 
[], [dfn.functions.regr_slope(dfn.col("y"), dfn.col("x")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 2.0 """ filter_raw = filter.expr if filter is not None else None @@ -2441,6 +2594,15 @@ def regr_sxx( y: The linear regression dependent variable x: The linear regression independent variable filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.regr_sxx(dfn.col("y"), dfn.col("x")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 2.0 """ filter_raw = filter.expr if filter is not None else None @@ -2464,6 +2626,15 @@ def regr_sxy( y: The linear regression dependent variable x: The linear regression independent variable filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.regr_sxy(dfn.col("y"), dfn.col("x")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 2.0 """ filter_raw = filter.expr if filter is not None else None @@ -2487,6 +2658,15 @@ def regr_syy( y: The linear regression dependent variable x: The linear regression independent variable filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.regr_syy(dfn.col("y"), dfn.col("x")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 2.0 """ filter_raw = filter.expr if filter is not None else None