From 5c474d834c8816e2d652fe9113c889337e7f6b27 Mon Sep 17 00:00:00 2001 From: Eugene Nikolayev Date: Fri, 6 Jun 2025 22:18:07 +0300 Subject: [PATCH 1/5] Unfail repository tests. --- tests/test_repository.py | 48 ++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/tests/test_repository.py b/tests/test_repository.py index 3c37471..3b060cc 100644 --- a/tests/test_repository.py +++ b/tests/test_repository.py @@ -1,13 +1,13 @@ # -*- coding: utf-8 -*- import unittest -import pytest +from py4j.protocol import Py4JError from pyspark.sql import Row -from pydeequ.analyzers import * -from pydeequ.checks import * -from pydeequ.repository import * -from pydeequ.verification import * +from pydeequ.analyzers import AnalyzerContext, AnalysisRunner, ApproxCountDistinct +from pydeequ.checks import Check, CheckLevel +from pydeequ.repository import FileSystemMetricsRepository, InMemoryMetricsRepository, ResultKey +from pydeequ.verification import VerificationResult, VerificationSuite from tests.conftest import setup_pyspark @@ -273,31 +273,35 @@ def test_verifications_IMmetrep_noTags_noFile(self): print(df.collect()) print(result_metrep.collect()) - @pytest.mark.xfail(reason="@unittest.expectedFailure") def test_fail_no_useRepository(self): - """This test should fail because it doesn't call useRepository() before saveOrAppendResult()""" + """This run fails because it doesn't call useRepository() before saveOrAppendResult().""" metrics_file = FileSystemMetricsRepository.helper_metrics_file(self.spark, "metrics.json") print(f"metrics filepath: {metrics_file}") key_tags = {"tag": "FS metrep analyzers -- FAIL"} resultKey = ResultKey(self.spark, ResultKey.current_milli_time(), key_tags) # MISSING useRepository() - result = ( - self.AnalysisRunner.onData(self.df) - .addAnalyzer(ApproxCountDistinct("b")) - .saveOrAppendResult(resultKey) - .run() + with self.assertRaises(Py4JError) as err: + _ = ( + self.AnalysisRunner.onData(self.df) + 
.addAnalyzer(ApproxCountDistinct("b")) + .saveOrAppendResult(resultKey) + .run() + ) + + self.assertIn( + "Method saveOrAppendResult([class com.amazon.deequ.repository.ResultKey]) does not exist", + str(err.exception) ) - @pytest.mark.xfail(reason="@unittest.expectedFailure") def test_fail_no_load(self): - """This test should fail because we do not load() for the repository reading""" + """This run fails because we do not load() for the repository reading.""" metrics_file = FileSystemMetricsRepository.helper_metrics_file(self.spark, "metrics.json") print(f"metrics filepath: {metrics_file}") repository = FileSystemMetricsRepository(self.spark, metrics_file) key_tags = {"tag": "FS metrep analyzers"} resultKey = ResultKey(self.spark, ResultKey.current_milli_time(), key_tags) - result = ( + _ = ( self.AnalysisRunner.onData(self.df) .addAnalyzer(ApproxCountDistinct("b")) .useRepository(repository) @@ -306,8 +310,14 @@ def test_fail_no_load(self): ) # MISSING: repository.load() - result_metrep_json = ( - repository.before(ResultKey.current_milli_time()) - .forAnalyzers([ApproxCountDistinct("b")]) - .getSuccessMetricsAsJson() + with self.assertRaises(AttributeError) as err: + _ = ( + repository.before(ResultKey.current_milli_time()) + .forAnalyzers([ApproxCountDistinct("b")]) + .getSuccessMetricsAsJson() + ) + + self.assertEqual( + "'FileSystemMetricsRepository' object has no attribute 'RepositoryLoader'", + str(err.exception) ) From cbfb8d360dc9fec5f2593f15334409afbd01988a Mon Sep 17 00:00:00 2001 From: Eugene Nikolayev Date: Fri, 6 Jun 2025 22:19:07 +0300 Subject: [PATCH 2/5] Apply black to repository tests. 
--- tests/test_repository.py | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/tests/test_repository.py b/tests/test_repository.py index 3b060cc..f148d93 100644 --- a/tests/test_repository.py +++ b/tests/test_repository.py @@ -18,7 +18,9 @@ def setUpClass(cls): cls.AnalysisRunner = AnalysisRunner(cls.spark) cls.VerificationSuite = VerificationSuite(cls.spark) cls.sc = cls.spark.sparkContext - cls.df = cls.sc.parallelize([Row(a="foo", b=1, c=5), Row(a="bar", b=2, c=6), Row(a="baz", b=3, c=None)]).toDF() + cls.df = cls.sc.parallelize( + [Row(a="foo", b=1, c=5), Row(a="bar", b=2, c=6), Row(a="baz", b=3, c=None)] + ).toDF() @classmethod def tearDownClass(cls): @@ -121,12 +123,16 @@ def test_verifications_FSmetrep(self): ) # TEST: Check JSON for tags - result_metrep_json = repository.load().before(ResultKey.current_milli_time()).getSuccessMetricsAsJson() + result_metrep_json = ( + repository.load().before(ResultKey.current_milli_time()).getSuccessMetricsAsJson() + ) print(result_metrep_json[0]["tag"], key_tags["tag"]) self.assertEqual(result_metrep_json[0]["tag"], key_tags["tag"]) - result_metrep = repository.load().before(ResultKey.current_milli_time()).getSuccessMetricsAsDataFrame() + result_metrep = ( + repository.load().before(ResultKey.current_milli_time()).getSuccessMetricsAsDataFrame() + ) df = VerificationResult.checkResultsAsDataFrame(self.spark, result) print(df.collect()) @@ -146,7 +152,9 @@ def test_verifications_FSmetrep_noTags_noFile(self): ) # TEST: Check DF parity - result_metrep = repository.load().before(ResultKey.current_milli_time()).getSuccessMetricsAsDataFrame() + result_metrep = ( + repository.load().before(ResultKey.current_milli_time()).getSuccessMetricsAsDataFrame() + ) df = VerificationResult.checkResultsAsDataFrame(self.spark, result) print(df.collect()) @@ -243,12 +251,16 @@ def test_verifications_IMmetrep(self): ) # TEST: Check JSON for tags - result_metrep_json = 
repository.load().before(ResultKey.current_milli_time()).getSuccessMetricsAsJson() + result_metrep_json = ( + repository.load().before(ResultKey.current_milli_time()).getSuccessMetricsAsJson() + ) print(result_metrep_json[0]["tag"], key_tags["tag"]) self.assertEqual(result_metrep_json[0]["tag"], key_tags["tag"]) - result_metrep = repository.load().before(ResultKey.current_milli_time()).getSuccessMetricsAsDataFrame() + result_metrep = ( + repository.load().before(ResultKey.current_milli_time()).getSuccessMetricsAsDataFrame() + ) df = VerificationResult.checkResultsAsDataFrame(self.spark, result) print(df.collect()) @@ -267,7 +279,9 @@ def test_verifications_IMmetrep_noTags_noFile(self): ) # TEST: Check DF parity - result_metrep = repository.load().before(ResultKey.current_milli_time()).getSuccessMetricsAsDataFrame() + result_metrep = ( + repository.load().before(ResultKey.current_milli_time()).getSuccessMetricsAsDataFrame() + ) df = VerificationResult.checkResultsAsDataFrame(self.spark, result) print(df.collect()) @@ -291,7 +305,7 @@ def test_fail_no_useRepository(self): self.assertIn( "Method saveOrAppendResult([class com.amazon.deequ.repository.ResultKey]) does not exist", - str(err.exception) + str(err.exception), ) def test_fail_no_load(self): @@ -318,6 +332,5 @@ def test_fail_no_load(self): ) self.assertEqual( - "'FileSystemMetricsRepository' object has no attribute 'RepositoryLoader'", - str(err.exception) + "'FileSystemMetricsRepository' object has no attribute 'RepositoryLoader'", str(err.exception) ) From 68279c0bff3a1cbd4873c8ca9cea62f989bf61e9 Mon Sep 17 00:00:00 2001 From: Eugene Nikolayev Date: Mon, 4 May 2026 23:41:45 +0300 Subject: [PATCH 3/5] Repository tests: use less specific Scala error messages assertions. 
---
 tests/test_repository.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/tests/test_repository.py b/tests/test_repository.py
index f148d93..08ee87c 100644
--- a/tests/test_repository.py
+++ b/tests/test_repository.py
@@ -303,10 +303,8 @@ def test_fail_no_useRepository(self):
                 .run()
             )
 
-        self.assertIn(
-            "Method saveOrAppendResult([class com.amazon.deequ.repository.ResultKey]) does not exist",
-            str(err.exception),
-        )
+        self.assertIn("ResultKey", str(err.exception))
+
 
     def test_fail_no_load(self):
         """This run fails because we do not load() for the repository reading."""
@@ -331,6 +329,4 @@ def test_fail_no_load(self):
                 .getSuccessMetricsAsJson()
             )
 
-        self.assertEqual(
-            "'FileSystemMetricsRepository' object has no attribute 'RepositoryLoader'", str(err.exception)
-        )
+        self.assertIn("RepositoryLoader", str(err.exception))

From c542b6b5775b4efdc335414e508059a02b3e5345 Mon Sep 17 00:00:00 2001
From: Eugene Nikolayev
Date: Tue, 5 May 2026 00:15:11 +0300
Subject: [PATCH 4/5] Repository tests: remove potentially fragile error messages assertions.

---
 tests/test_repository.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tests/test_repository.py b/tests/test_repository.py
index 08ee87c..c16ac2e 100644
--- a/tests/test_repository.py
+++ b/tests/test_repository.py
@@ -303,8 +303,6 @@ def test_fail_no_useRepository(self):
                 .run()
             )
 
-        self.assertIn("ResultKey", str(err.exception))
-
 
     def test_fail_no_load(self):
         """This run fails because we do not load() for the repository reading."""
@@ -329,4 +327,3 @@ def test_fail_no_load(self):
                 .getSuccessMetricsAsJson()
             )
 
-        self.assertIn("RepositoryLoader", str(err.exception))

From f86d2279ef8da3aafa8991db01396379385f752f Mon Sep 17 00:00:00 2001
From: Eugene Nikolayev
Date: Tue, 5 May 2026 00:17:33 +0300
Subject: [PATCH 5/5] Repository tests: remove unused err vars.

--- tests/test_repository.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_repository.py b/tests/test_repository.py index c16ac2e..0cff34d 100644 --- a/tests/test_repository.py +++ b/tests/test_repository.py @@ -295,7 +295,7 @@ def test_fail_no_useRepository(self): resultKey = ResultKey(self.spark, ResultKey.current_milli_time(), key_tags) # MISSING useRepository() - with self.assertRaises(Py4JError) as err: + with self.assertRaises(Py4JError): _ = ( self.AnalysisRunner.onData(self.df) .addAnalyzer(ApproxCountDistinct("b")) @@ -320,7 +320,7 @@ def test_fail_no_load(self): ) # MISSING: repository.load() - with self.assertRaises(AttributeError) as err: + with self.assertRaises(AttributeError): _ = ( repository.before(ResultKey.current_milli_time()) .forAnalyzers([ApproxCountDistinct("b")])