From 8184895c497f1a2ca1dbdd4fcfa11f06b5b9f610 Mon Sep 17 00:00:00 2001 From: Eugene Nikolayev Date: Sat, 24 May 2025 02:20:52 +0300 Subject: [PATCH 1/3] Profiler: expose numRecords from the run result. --- docs/profiles.md | 1 + pydeequ/profiles.py | 15 +++++++++++++-- tests/test_profiles.py | 4 ++++ tutorials/profiles.ipynb | 15 +++++++++++++++ 4 files changed, 33 insertions(+), 2 deletions(-) diff --git a/docs/profiles.md b/docs/profiles.md index 2007cab..ee6b993 100644 --- a/docs/profiles.md +++ b/docs/profiles.md @@ -20,5 +20,6 @@ Here are the current supported functionalities of Profiles. | | useSparkSession | | | ColumnProfilesBuilder | ColumnProfilesBuilder(spark_session) | Done | | | property: profiles | Done | +| | property: numRecords | Done | | StandardColumnProfile | StandardColumnProfile(spark_session, column, java_column_profile) | Done | | NumericColumnProfile | NumericColumnProfile(spark_session, column, java_column_profile) | Done | diff --git a/pydeequ/profiles.py b/pydeequ/profiles.py index fbbfd84..5ca3d01 100644 --- a/pydeequ/profiles.py +++ b/pydeequ/profiles.py @@ -2,12 +2,12 @@ """ Profiles file for all the Profiles classes in Deequ""" import json from collections import namedtuple +from typing import Optional from pyspark.sql import DataFrame, SparkSession from pydeequ.analyzers import KLLParameters from pydeequ.metrics import BucketDistribution from pydeequ.pandas_utils import ensure_pyspark_df -from enum import Enum from pydeequ.scala_utils import ( get_or_else_none, java_list_to_python_list, @@ -239,6 +239,7 @@ def __init__(self, spark_session: SparkSession): self._sc = spark_session.sparkContext self._jvm = spark_session._jvm self._profiles = [] + self._numRecords = None self.columnProfileClasses = { "StandardColumnProfile": StandardColumnProfile, "StringColumnProfile": StandardColumnProfile, @@ -251,11 +252,12 @@ def _columnProfilesFromColumnRunBuilderRun(self, run): Produces a Java profile based on the designated column :param run: 
columnProfilerRunner result - :return: a setter for columnProfilerRunner result + :return self: this builder with the run result applied, to allow method chaining """ self._run_result = run profile_map = self._jvm.scala.collection.JavaConversions.mapAsJavaMap(run.profiles()) # TODO from ScalaUtils self._profiles = {column: self._columnProfileBuilder(column, profile_map[column]) for column in profile_map} + self._numRecords = run.numRecords() return self @property @@ -267,6 +269,15 @@ def profiles(self): """ return self._profiles + @property + def numRecords(self) -> Optional[int]: + """ + A getter for the number of records + + :return Optional[int]: number of records + """ + return self._numRecords + def _columnProfileBuilder(self, column, java_column_profile): """Factory function for ColumnProfile Returns a Java profile based on the designated column diff --git a/tests/test_profiles.py b/tests/test_profiles.py index e196ffd..fd46a2c 100644 --- a/tests/test_profiles.py +++ b/tests/test_profiles.py @@ -72,6 +72,10 @@ def test_spark_session_type_exception(self): except TypeError: pass + def test_profile_numRecords(self): + result = ColumnProfilerRunner(self.spark).onData(self.df).run() + self.assertEqual(result.numRecords, 3) + if __name__ == "__main__": unittest.main() diff --git a/tutorials/profiles.ipynb b/tutorials/profiles.ipynb index b968217..60d15a3 100644 --- a/tutorials/profiles.ipynb +++ b/tutorials/profiles.ipynb @@ -952,6 +952,21 @@ " print(profile)" ] }, + { + "metadata": {}, + "cell_type": "code", + "source": "print(result.numRecords)", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3010972\n" + ] + } + ], + "execution_count": 6 + }, { "cell_type": "markdown", "metadata": {}, From decfb8473d2da6a4a05eadafca4002831c846e21 Mon Sep 17 00:00:00 2001 From: Eugene Nikolayev Date: Mon, 4 May 2026 21:23:16 +0300 Subject: [PATCH 2/3] Profiler: avoid using Optional and use 0 default for numRecords --- pydeequ/profiles.py | 5 ++--- 1 file changed, 2 
insertions(+), 3 deletions(-) diff --git a/pydeequ/profiles.py b/pydeequ/profiles.py index 5ca3d01..a15f5cf 100644 --- a/pydeequ/profiles.py +++ b/pydeequ/profiles.py @@ -2,7 +2,6 @@ """ Profiles file for all the Profiles classes in Deequ""" import json from collections import namedtuple -from typing import Optional from pyspark.sql import DataFrame, SparkSession from pydeequ.analyzers import KLLParameters @@ -239,7 +238,7 @@ def __init__(self, spark_session: SparkSession): self._sc = spark_session.sparkContext self._jvm = spark_session._jvm self._profiles = [] - self._numRecords = None + self._numRecords = 0 self.columnProfileClasses = { "StandardColumnProfile": StandardColumnProfile, "StringColumnProfile": StandardColumnProfile, @@ -270,7 +269,7 @@ def profiles(self): return self._profiles @property - def numRecords(self) -> Optional[int]: + def numRecords(self) -> int: """ A getter for the number of records From 46d682890e233abb79a518a20d40bac07a0ed5a2 Mon Sep 17 00:00:00 2001 From: Eugene Nikolayev Date: Mon, 4 May 2026 21:30:34 +0300 Subject: [PATCH 3/3] Profiler: fix numRecords docstring return type. --- pydeequ/profiles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pydeequ/profiles.py b/pydeequ/profiles.py index a15f5cf..72ccc21 100644 --- a/pydeequ/profiles.py +++ b/pydeequ/profiles.py @@ -273,7 +273,7 @@ def numRecords(self) -> int: """ A getter for the number of records - :return Optional[int]: number of records + :return int: number of records """ return self._numRecords