diff --git a/bigframes/core/col.py b/bigframes/core/col.py index d00d61365a..cad30f8f33 100644 --- a/bigframes/core/col.py +++ b/bigframes/core/col.py @@ -14,7 +14,7 @@ from __future__ import annotations import dataclasses -from typing import Any, Hashable +from typing import Any, Hashable, Literal, TYPE_CHECKING import bigframes_vendored.pandas.core.col as pd_col @@ -23,6 +23,10 @@ import bigframes.operations as bf_ops import bigframes.operations.aggregations as agg_ops +if TYPE_CHECKING: + import bigframes.operations.datetimes as datetimes + import bigframes.operations.strings as strings + # Not to be confused with the Expression class in `bigframes.core.expressions` # Name collision unintended @@ -32,7 +36,7 @@ class Expression: _value: bf_expression.Expression - def _apply_unary(self, op: bf_ops.UnaryOp) -> Expression: + def _apply_unary_op(self, op: bf_ops.UnaryOp) -> Expression: return Expression(op.as_expr(self._value)) def _apply_unary_agg(self, op: agg_ops.UnaryAggregateOp) -> Expression: @@ -44,7 +48,14 @@ def _apply_unary_agg(self, op: agg_ops.UnaryAggregateOp) -> Expression: agg_expressions.WindowExpression(agg_expr, window_spec.unbound()) ) - def _apply_binary(self, other: Any, op: bf_ops.BinaryOp, reverse: bool = False): + # alignment is purely for series compatibility, and is ignored here + def _apply_binary_op( + self, + other: Any, + op: bf_ops.BinaryOp, + alignment: Literal["outer", "left"] = "outer", + reverse: bool = False, + ): if isinstance(other, Expression): other_value = other._value else: @@ -55,79 +66,79 @@ def _apply_binary(self, other: Any, op: bf_ops.BinaryOp, reverse: bool = False): return Expression(op.as_expr(self._value, other_value)) def __add__(self, other: Any) -> Expression: - return self._apply_binary(other, bf_ops.add_op) + return self._apply_binary_op(other, bf_ops.add_op) def __radd__(self, other: Any) -> Expression: - return self._apply_binary(other, bf_ops.add_op, reverse=True) + return self._apply_binary_op(other, bf_ops.add_op, reverse=True) def __sub__(self, other: Any) -> Expression: - return self._apply_binary(other, bf_ops.sub_op) + return self._apply_binary_op(other, bf_ops.sub_op) def __rsub__(self, other: Any) -> Expression: - return self._apply_binary(other, bf_ops.sub_op, reverse=True) + return self._apply_binary_op(other, bf_ops.sub_op, reverse=True) def __mul__(self, other: Any) -> Expression: - return self._apply_binary(other, bf_ops.mul_op) + return self._apply_binary_op(other, bf_ops.mul_op) def __rmul__(self, other: Any) -> Expression: - return self._apply_binary(other, bf_ops.mul_op, reverse=True) + return self._apply_binary_op(other, bf_ops.mul_op, reverse=True) def __truediv__(self, other: Any) -> Expression: - return self._apply_binary(other, bf_ops.div_op) + return self._apply_binary_op(other, bf_ops.div_op) def __rtruediv__(self, other: Any) -> Expression: - return self._apply_binary(other, bf_ops.div_op, reverse=True) + return self._apply_binary_op(other, bf_ops.div_op, reverse=True) def __floordiv__(self, other: Any) -> Expression: - return self._apply_binary(other, bf_ops.floordiv_op) + return self._apply_binary_op(other, bf_ops.floordiv_op) def __rfloordiv__(self, other: Any) -> Expression: - return self._apply_binary(other, bf_ops.floordiv_op, reverse=True) + return self._apply_binary_op(other, bf_ops.floordiv_op, reverse=True) def __ge__(self, other: Any) -> Expression: - return self._apply_binary(other, bf_ops.ge_op) + return self._apply_binary_op(other, bf_ops.ge_op) def __gt__(self, other: Any) -> Expression: - return self._apply_binary(other, bf_ops.gt_op) + return self._apply_binary_op(other, bf_ops.gt_op) def __le__(self, other: Any) -> Expression: - return self._apply_binary(other, bf_ops.le_op) + return self._apply_binary_op(other, bf_ops.le_op) def __lt__(self, other: Any) -> Expression: - return self._apply_binary(other, bf_ops.lt_op) + return self._apply_binary_op(other, bf_ops.lt_op) def __eq__(self, other: object) -> Expression: # type: ignore - return self._apply_binary(other, bf_ops.eq_op) + return self._apply_binary_op(other, bf_ops.eq_op) def __ne__(self, other: object) -> Expression: # type: ignore - return self._apply_binary(other, bf_ops.ne_op) + return self._apply_binary_op(other, bf_ops.ne_op) def __mod__(self, other: Any) -> Expression: - return self._apply_binary(other, bf_ops.mod_op) + return self._apply_binary_op(other, bf_ops.mod_op) def __rmod__(self, other: Any) -> Expression: - return self._apply_binary(other, bf_ops.mod_op, reverse=True) + return self._apply_binary_op(other, bf_ops.mod_op, reverse=True) def __and__(self, other: Any) -> Expression: - return self._apply_binary(other, bf_ops.and_op) + return self._apply_binary_op(other, bf_ops.and_op) def __rand__(self, other: Any) -> Expression: - return self._apply_binary(other, bf_ops.and_op, reverse=True) + return self._apply_binary_op(other, bf_ops.and_op, reverse=True) def __or__(self, other: Any) -> Expression: - return self._apply_binary(other, bf_ops.or_op) + return self._apply_binary_op(other, bf_ops.or_op) def __ror__(self, other: Any) -> Expression: - return self._apply_binary(other, bf_ops.or_op, reverse=True) + return self._apply_binary_op(other, bf_ops.or_op, reverse=True) def __xor__(self, other: Any) -> Expression: - return self._apply_binary(other, bf_ops.xor_op) + return self._apply_binary_op(other, bf_ops.xor_op) def __rxor__(self, other: Any) -> Expression: - return self._apply_binary(other, bf_ops.xor_op, reverse=True) + return self._apply_binary_op(other, bf_ops.xor_op, reverse=True) def __invert__(self) -> Expression: - return self._apply_unary(bf_ops.invert_op) + return self._apply_unary_op(bf_ops.invert_op) def sum(self) -> Expression: return self._apply_unary_agg(agg_ops.sum_op) @@ -147,6 +158,18 @@ def min(self) -> Expression: def max(self) -> Expression: return self._apply_unary_agg(agg_ops.max_op) + @property + def dt(self) -> datetimes.DatetimeSimpleMethods: + import bigframes.operations.datetimes as datetimes + + return datetimes.DatetimeSimpleMethods(self) + + @property + def str(self) -> strings.StringMethods: + import bigframes.operations.strings as strings + + return strings.StringMethods(self) + def col(col_name: Hashable) -> Expression: return Expression(bf_expression.free_var(col_name)) diff --git a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py index 3ae98a267e..dd27587433 100644 --- a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py +++ b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py @@ -982,7 +982,9 @@ def isin_op_impl(x: ibis_types.Value, op: ops.IsInOp): @scalar_op_compiler.register_unary_op(ops.ToDatetimeOp, pass_op=True) def to_datetime_op_impl(x: ibis_types.Value, op: ops.ToDatetimeOp): - if x.type() in (ibis_dtypes.str, ibis_dtypes.Timestamp("UTC")): # type: ignore + if x.type() == ibis_dtypes.Timestamp(None): # type: ignore + return x # already a timestamp, no-op + elif x.type() in (ibis_dtypes.str, ibis_dtypes.Timestamp("UTC")): # type: ignore return x.try_cast(ibis_dtypes.Timestamp(None)) # type: ignore else: # Numerical inputs. diff --git a/bigframes/operations/datetime_ops.py b/bigframes/operations/datetime_ops.py index 19541a383c..37a6035ef7 100644 --- a/bigframes/operations/datetime_ops.py +++ b/bigframes/operations/datetime_ops.py @@ -74,6 +74,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT dtypes.STRING_DTYPE, dtypes.DATE_DTYPE, dtypes.TIMESTAMP_DTYPE, + dtypes.DATETIME_DTYPE, ): raise TypeError("expected string or numeric input") return pd.ArrowDtype(pa.timestamp("us", tz=None)) @@ -87,6 +88,8 @@ class ToTimestampOp(base_ops.UnaryOp): def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: # Must be numeric or string + if input_types[0] == dtypes.TIMESTAMP_DTYPE: + raise TypeError("Already tz-aware.") if input_types[0] not in ( dtypes.FLOAT_DTYPE, dtypes.INT_DTYPE, diff --git a/bigframes/operations/datetimes.py b/bigframes/operations/datetimes.py index f66c37bb64..2850919ee3 100644 --- a/bigframes/operations/datetimes.py +++ b/bigframes/operations/datetimes.py @@ -15,13 +15,16 @@ from __future__ import annotations import datetime as dt -from typing import Literal, Optional +from typing import Generic, Literal, Optional, TypeVar import bigframes_vendored.pandas.core.arrays.datetimelike as vendored_pandas_datetimelike import bigframes_vendored.pandas.core.indexes.accessor as vendordt import pandas from bigframes import dataframe, dtypes, series +from bigframes._tools import docs +import bigframes.core.col +import bigframes.core.indexes.base as indices from bigframes.core.logging import log_adapter import bigframes.operations as ops @@ -31,152 +34,156 @@ _SUPPORTED_FREQS = ("Y", "Q", "M", "W", "D", "h", "min", "s", "ms", "us") -@log_adapter.class_logger -class DatetimeMethods( - vendordt.DatetimeProperties, - vendored_pandas_datetimelike.DatelikeOps, -): - __doc__ = vendordt.DatetimeProperties.__doc__ +T = TypeVar("T", series.Series, indices.Index, bigframes.core.col.Expression) - def __init__(self, data: series.Series): - self._data = data + +# Simpler base class for datetime properties, excludes isocalendar, unit, tz +class DatetimeSimpleMethods(Generic[T]): + def __init__(self, data: T): + self._data: T = data # Date accessors @property - def day(self) -> series.Series: + def day(self) -> T: return self._data._apply_unary_op(ops.day_op) @property - def dayofweek(self) -> series.Series: + def dayofweek(self) -> T: return self._data._apply_unary_op(ops.dayofweek_op) @property - def day_of_week(self) -> series.Series: + def day_of_week(self) -> T: return self.dayofweek @property - def weekday(self) -> series.Series: + def weekday(self) -> T: return self.dayofweek @property - def dayofyear(self) -> series.Series: + def dayofyear(self) -> T: return self._data._apply_unary_op(ops.dayofyear_op) @property - def day_of_year(self) -> series.Series: + def day_of_year(self) -> T: return self.dayofyear @property - def date(self) -> series.Series: + def date(self) -> T: return self._data._apply_unary_op(ops.date_op) @property - def quarter(self) -> series.Series: + def quarter(self) -> T: return self._data._apply_unary_op(ops.quarter_op) @property - def year(self) -> series.Series: + def year(self) -> T: return self._data._apply_unary_op(ops.year_op) @property - def month(self) -> series.Series: + def month(self) -> T: return self._data._apply_unary_op(ops.month_op) - def isocalendar(self) -> dataframe.DataFrame: - iso_ops = [ops.iso_year_op, ops.iso_week_op, ops.iso_day_op] - labels = pandas.Index(["year", "week", "day"]) - block = self._data._block.project_exprs( - [op.as_expr(self._data._value_column) for op in iso_ops], labels, drop=True - ) - return dataframe.DataFrame(block) - # Time accessors @property - def hour(self) -> series.Series: + def hour(self) -> T: return self._data._apply_unary_op(ops.hour_op) @property - def minute(self) -> series.Series: + def minute(self) -> T: return self._data._apply_unary_op(ops.minute_op) @property - def second(self) -> series.Series: + def second(self) -> T: return self._data._apply_unary_op(ops.second_op) @property - def time(self) -> series.Series: + def time(self) -> T: return self._data._apply_unary_op(ops.time_op) # Timedelta accessors @property - def days(self) -> series.Series: + def days(self) -> T: self._check_dtype(dtypes.TIMEDELTA_DTYPE) return self._data._apply_binary_op(_ONE_DAY, ops.floordiv_op) @property - def seconds(self) -> series.Series: + def seconds(self) -> T: self._check_dtype(dtypes.TIMEDELTA_DTYPE) return self._data._apply_binary_op(_ONE_DAY, ops.mod_op) // _ONE_SECOND # type: ignore @property - def microseconds(self) -> series.Series: + def microseconds(self) -> T: self._check_dtype(dtypes.TIMEDELTA_DTYPE) return self._data._apply_binary_op(_ONE_SECOND, ops.mod_op) // _ONE_MICRO # type: ignore - def total_seconds(self) -> series.Series: + def total_seconds(self) -> T: self._check_dtype(dtypes.TIMEDELTA_DTYPE) return self._data._apply_binary_op(_ONE_SECOND, ops.div_op) def _check_dtype(self, target_dtype: dtypes.Dtype): - if self._data._dtype == target_dtype: - return - raise TypeError(f"Expect dtype: {target_dtype}, but got {self._data._dtype}") - - @property - def tz(self) -> Optional[dt.timezone]: - # Assumption: pyarrow dtype - tz_string = self._data._dtype.pyarrow_dtype.tz - if tz_string == "UTC": - return dt.timezone.utc - elif tz_string is None: - return None - else: - raise ValueError(f"Unexpected timezone {tz_string}") - - def tz_localize(self, tz: Literal["UTC"] | None) -> series.Series: + if isinstance(self._data, (indices.Index, series.Series)): + if self._data.dtype != target_dtype: + raise TypeError( + f"Expect dtype: {target_dtype}, but got {self._data.dtype}" + ) + return + + def tz_localize(self, tz: Literal["UTC"] | None) -> T: if tz == "UTC": - if self._data.dtype == dtypes.TIMESTAMP_DTYPE: - raise ValueError("Already tz-aware.") - return self._data._apply_unary_op(ops.ToTimestampOp()) if tz is None: - if self._data.dtype == dtypes.DATETIME_DTYPE: - return self._data # no-op - return self._data._apply_unary_op(ops.ToDatetimeOp()) raise ValueError(f"Unsupported timezone {tz}") - @property - def unit(self) -> str: - # Assumption: pyarrow dtype - return self._data._dtype.pyarrow_dtype.unit - - def day_name(self) -> series.Series: + def day_name(self) -> T: return self.strftime("%A") - def strftime(self, date_format: str) -> series.Series: + def strftime(self, date_format: str) -> T: return self._data._apply_unary_op(ops.StrftimeOp(date_format=date_format)) - def normalize(self) -> series.Series: + def normalize(self) -> T: return self._data._apply_unary_op(ops.normalize_op) - def floor(self, freq: str) -> series.Series: + def floor(self, freq: str) -> T: if freq not in _SUPPORTED_FREQS: raise ValueError(f"freq must be one of {_SUPPORTED_FREQS}") return self._data._apply_unary_op(ops.FloorDtOp(freq=freq)) # type: ignore + + +# this is the version used by series.dt, and the one that shows up in reference docs +@log_adapter.class_logger +@docs.inherit_docs(vendordt.DatetimeProperties) +@docs.inherit_docs(vendored_pandas_datetimelike.DatelikeOps) +class DatetimeMethods(DatetimeSimpleMethods[bigframes.series.Series]): + def __init__(self, data: series.Series): + super().__init__(data) + + @property + def tz(self) -> Optional[dt.timezone]: + # Assumption: pyarrow dtype + tz_string = self._data._dtype.pyarrow_dtype.tz + if tz_string == "UTC": + return dt.timezone.utc + elif tz_string is None: + return None + else: + raise ValueError(f"Unexpected timezone {tz_string}") + + @property + def unit(self) -> str: + # Assumption: pyarrow dtype + return self._data._dtype.pyarrow_dtype.unit + + def isocalendar(self) -> dataframe.DataFrame: + iso_ops = [ops.iso_year_op, ops.iso_week_op, ops.iso_day_op] + labels = pandas.Index(["year", "week", "day"]) + block = self._data._block.project_exprs( + [op.as_expr(self._data._value_column) for op in iso_ops], labels, drop=True + ) + return dataframe.DataFrame(block) diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py index 8b5b57b259..1712d42355 100644 --- a/bigframes/operations/strings.py +++ b/bigframes/operations/strings.py @@ -21,6 +21,7 @@ import bigframes_vendored.pandas.core.strings.accessor as vendorstr from bigframes._tools import docs +import bigframes.core.col import bigframes.core.indexes.base as indices from bigframes.core.logging import log_adapter import bigframes.dataframe as df @@ -36,7 +37,7 @@ re.DOTALL: "s", } -T = TypeVar("T", series.Series, indices.Index) +T = TypeVar("T", series.Series, indices.Index, bigframes.core.col.Expression) @log_adapter.class_logger @@ -324,7 +325,14 @@ def to_blob(self, connection: Optional[str] = None) -> T: bigframes.series.Series: Blob Series. """ - session = self._data._block.session + import bigframes.core.blocks + + if hasattr(self._data, "_block") and isinstance( + self._data._block, bigframes.core.blocks.Block + ): + session = self._data._block.session + else: + raise ValueError("to_blob is only supported via Series.str") connection = session._create_bq_connection(connection=connection) return self._data._apply_binary_op(connection, ops.obj_make_ref_op) diff --git a/bigframes/series.py b/bigframes/series.py index 2d0b13b470..23799a0a43 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -73,7 +73,6 @@ import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops import bigframes.operations.blob as blob -import bigframes.operations.datetimes as dt import bigframes.operations.lists as lists import bigframes.operations.plotting as plotting import bigframes.operations.python_op_maps as python_ops @@ -82,6 +81,7 @@ if typing.TYPE_CHECKING: import bigframes.geopandas.geoseries + import bigframes.operations.datetimes as datetimes import bigframes.operations.strings as strings @@ -208,8 +208,10 @@ def __init__( self._block.session._register_object(self) @property - def dt(self) -> dt.DatetimeMethods: - return dt.DatetimeMethods(self) + def dt(self) -> datetimes.DatetimeMethods: + import bigframes.operations.datetimes as datetimes + + return datetimes.DatetimeMethods(self) @property def dtype(self): diff --git a/tests/system/small/operations/test_datetimes.py b/tests/system/small/operations/test_datetimes.py index 9f4b5e5705..2bc972a048 100644 --- a/tests/system/small/operations/test_datetimes.py +++ b/tests/system/small/operations/test_datetimes.py @@ -342,19 +342,20 @@ def test_dt_tz_localize(scalars_dfs, col_name, tz): assert_series_equal(bf_result.to_pandas(), pd_result, check_index_type=False) -@pytest.mark.parametrize( - ("col_name", "tz"), - [ - ("timestamp_col", "UTC"), - ("datetime_col", "US/Eastern"), - ], -) -def test_dt_tz_localize_invalid_inputs(scalars_dfs, col_name, tz): +def test_dt_tz_localize_already_localized(scalars_dfs): + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, _ = scalars_dfs + + with pytest.raises(TypeError): + scalars_df["timestamp_col"].dt.tz_localize("UTC") + + +def test_dt_tz_localize_invalid_timezone(scalars_dfs): pytest.importorskip("pandas", minversion="2.0.0") scalars_df, _ = scalars_dfs with pytest.raises(ValueError): - scalars_df[col_name].dt.tz_localize(tz) + scalars_df["datetime_col"].dt.tz_localize("US/Eastern") @pytest.mark.parametrize( diff --git a/tests/unit/test_col.py b/tests/unit/test_col.py index c7a7eaa326..c3fcb10c9d 100644 --- a/tests/unit/test_col.py +++ b/tests/unit/test_col.py @@ -227,3 +227,22 @@ def test_getitem_with_pd_col(scalars_dfs): pd_result = scalars_pandas_df[pd.col("float64_col") > 4] # type: ignore assert_frame_equal(bf_result, pd_result) + + +def test_col_str_accessor(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df.assign(result=bpd.col("string_col").str.lower()).to_pandas() + pd_result = scalars_pandas_df.assign(result=pd.col("string_col").str.lower()) # type: ignore + + assert_frame_equal(bf_result, pd_result) + + +def test_col_dt_accessor(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df.assign(result=bpd.col("date_col").dt.year).to_pandas() + pd_result = scalars_pandas_df.assign(result=pd.col("date_col").dt.year) # type: ignore + + # int64[pyarrow] vs Int64 + assert_frame_equal(bf_result, pd_result, check_dtype=False)