Skip to content

Commit f49705f

Browse files
committed
add interop test against parquet-testing
1 parent 046239f commit f49705f

6 files changed

Lines changed: 45 additions & 21 deletions

File tree

parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -387,6 +387,7 @@ public int hashCode() {
387387
return 31 * type.hashCode()
388388
+ 31 * Arrays.hashCode(getMaxBytes())
389389
+ 17 * Arrays.hashCode(getMinBytes())
390+
+ 13 * Long.valueOf(this.getNanCount()).hashCode()
390391
+ Long.valueOf(this.getNumNulls()).hashCode();
391392
}
392393

parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java

Lines changed: 6 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@
3232
import it.unimi.dsi.fastutil.longs.LongLists;
3333
import java.nio.ByteBuffer;
3434
import java.util.ArrayList;
35-
import java.util.Arrays;
3635
import java.util.Formatter;
3736
import java.util.List;
3837
import java.util.PrimitiveIterator;
@@ -255,10 +254,6 @@ boolean isNullPage(int pageIndex) {
255254
return nullPages[pageIndex];
256255
}
257256

258-
private int getArrayIndex(int pageIndex) {
259-
return Arrays.binarySearch(pageIndexes, pageIndex);
260-
}
261-
262257
// Returns true if this ColumnIndex may have NaN-polluted min/max values that cannot be trusted.
263258
// If true, we need to conservatively include pages with NaN in min/max for some predicates.
264259
boolean mayHaveNaNPollutedMinMax() {
@@ -285,12 +280,6 @@ boolean hasNaNMinMax(int arrayIndex) {
285280
return isMinNaN(arrayIndex) || isMaxNaN(arrayIndex);
286281
}
287282

288-
// Returns true if this page is a confirmed all-NaN page:
289-
// min==max==NaN and nanCounts confirms at least one NaN value.
290-
private boolean isAllNaNs(int arrayIndex, int pageIndex) {
291-
return nanCounts != null && nanCounts[pageIndex] > 0 && isMinNaN(arrayIndex) && isMaxNaN(arrayIndex);
292-
}
293-
294283
/*
295284
* Returns the min value for arrayIndex as a ByteBuffer. (Min values are not stored for null-pages so arrayIndex
296285
* might not equal to pageIndex.)
@@ -428,9 +417,12 @@ public <T extends Comparable<T>> PrimitiveIterator.OfInt visit(NotEq<T> notEq) {
428417
}
429418

430419
if (isNaNLiteral(value)) {
431-
return nanCounts == null
432-
? IndexIterator.all(getPageCount())
433-
: IndexIterator.filter(getPageCount(), pageIndex -> nanCounts[pageIndex] == 0);
420+
// v != NaN is satisfied by every null value, every non-NaN value, and (under IEEE754 total
421+
// order) every NaN whose bit pattern differs from the literal. The page-level min/max and
422+
// nan_count are not sufficient to prove that *all* values of a page equal the NaN literal
423+
// (e.g. a page may mix NaNs with nulls, or hold NaNs with different payloads), so we
424+
// conservatively keep every page.
425+
return IndexIterator.all(getPageCount());
434426
}
435427

436428
if (nullCounts == null) {

parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilderNaN.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -276,7 +276,7 @@ public void testNaNFloatMixed() {
276276
assertEquals(List.of(0), toList(ci.visit(FilterApi.in(FLOAT_COL, new HashSet<>(List.of(1.5f))))));
277277

278278
assertEquals(List.of(1), toList(ci.visit(FilterApi.eq(FLOAT_COL, Float.NaN))));
279-
assertEquals(List.of(0, 2), toList(ci.visit(FilterApi.notEq(FLOAT_COL, Float.NaN))));
279+
assertEquals(List.of(0, 1, 2), toList(ci.visit(FilterApi.notEq(FLOAT_COL, Float.NaN))));
280280
assertEquals(List.of(0, 1, 2), toList(ci.visit(FilterApi.lt(FLOAT_COL, Float.NaN))));
281281
assertEquals(List.of(0, 1, 2), toList(ci.visit(FilterApi.ltEq(FLOAT_COL, Float.NaN))));
282282
assertEquals(List.of(0, 1, 2), toList(ci.visit(FilterApi.gt(FLOAT_COL, Float.NaN))));
@@ -331,7 +331,7 @@ public void testNaNDoubleMixed() {
331331
assertEquals(List.of(0), toList(ci.visit(FilterApi.in(DOUBLE_COL, new HashSet<>(List.of(1.5))))));
332332

333333
assertEquals(List.of(1), toList(ci.visit(FilterApi.eq(DOUBLE_COL, Double.NaN))));
334-
assertEquals(List.of(0, 2), toList(ci.visit(FilterApi.notEq(DOUBLE_COL, Double.NaN))));
334+
assertEquals(List.of(0, 1, 2), toList(ci.visit(FilterApi.notEq(DOUBLE_COL, Double.NaN))));
335335
assertEquals(List.of(0, 1, 2), toList(ci.visit(FilterApi.lt(DOUBLE_COL, Double.NaN))));
336336
assertEquals(List.of(0, 1, 2), toList(ci.visit(FilterApi.ltEq(DOUBLE_COL, Double.NaN))));
337337
assertEquals(List.of(0, 1, 2), toList(ci.visit(FilterApi.gt(DOUBLE_COL, Double.NaN))));
@@ -386,7 +386,7 @@ public void testNaNFloat16Mixed() {
386386
assertEquals(List.of(0), toList(ci.visit(FilterApi.in(FLOAT16_COL, new HashSet<>(List.of(FLOAT16_ONE))))));
387387

388388
assertEquals(List.of(1), toList(ci.visit(FilterApi.eq(FLOAT16_COL, FLOAT16_NAN))));
389-
assertEquals(List.of(0, 2), toList(ci.visit(FilterApi.notEq(FLOAT16_COL, FLOAT16_NAN))));
389+
assertEquals(List.of(0, 1, 2), toList(ci.visit(FilterApi.notEq(FLOAT16_COL, FLOAT16_NAN))));
390390
assertEquals(List.of(0, 1, 2), toList(ci.visit(FilterApi.lt(FLOAT16_COL, FLOAT16_NAN))));
391391
assertEquals(List.of(0, 1, 2), toList(ci.visit(FilterApi.ltEq(FLOAT16_COL, FLOAT16_NAN))));
392392
assertEquals(List.of(0, 1, 2), toList(ci.visit(FilterApi.gt(FLOAT16_COL, FLOAT16_NAN))));

parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -371,7 +371,10 @@ public <T extends Comparable<T>> Boolean visit(NotEq<T> notEq) {
371371
}
372372

373373
if (isNaNLiteral(meta, value)) {
374-
return (stats.isNanCountSet() && stats.getNanCount() == 0) ? BLOCK_CANNOT_MATCH : BLOCK_MIGHT_MATCH;
374+
// We are looking for records where v != NaN. Every non-NaN value (and every null) satisfies
375+
// this predicate, and the chunk-level statistics cannot prove that all values are NaN in a
376+
// way that lets us drop the block safely, so we conservatively keep it.
377+
return BLOCK_MIGHT_MATCH;
375378
}
376379

377380
if (isAllNaNs(meta)) {

parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -708,7 +708,7 @@ public void testNaNDoubleZeroNaNCount() {
708708
assertFalse(canDrop(in(doubleColumn, new HashSet<>(List.of(50.0))), metas));
709709

710710
assertTrue(canDrop(eq(doubleColumn, Double.NaN), metas));
711-
assertTrue(canDrop(notEq(doubleColumn, Double.NaN), metas));
711+
assertFalse(canDrop(notEq(doubleColumn, Double.NaN), metas));
712712
assertFalse(canDrop(lt(doubleColumn, Double.NaN), metas));
713713
assertFalse(canDrop(ltEq(doubleColumn, Double.NaN), metas));
714714
assertFalse(canDrop(gt(doubleColumn, Double.NaN), metas));
@@ -825,7 +825,7 @@ public void testNaNFloatZeroNaNCount() {
825825
assertFalse(canDrop(in(floatCol, new HashSet<>(List.of(50.0f))), metas));
826826

827827
assertTrue(canDrop(eq(floatCol, Float.NaN), metas));
828-
assertTrue(canDrop(notEq(floatCol, Float.NaN), metas));
828+
assertFalse(canDrop(notEq(floatCol, Float.NaN), metas));
829829
assertFalse(canDrop(lt(floatCol, Float.NaN), metas));
830830
assertFalse(canDrop(ltEq(floatCol, Float.NaN), metas));
831831
assertFalse(canDrop(gt(floatCol, Float.NaN), metas));
@@ -947,7 +947,7 @@ public void testNaNFloat16ZeroNaNCount() {
947947
assertFalse(canDrop(in(float16Column, new HashSet<>(List.of(FLOAT16_FIFTY))), metas));
948948

949949
assertTrue(canDrop(eq(float16Column, FLOAT16_NAN), metas));
950-
assertTrue(canDrop(notEq(float16Column, FLOAT16_NAN), metas));
950+
assertFalse(canDrop(notEq(float16Column, FLOAT16_NAN), metas));
951951
assertFalse(canDrop(lt(float16Column, FLOAT16_NAN), metas));
952952
assertFalse(canDrop(ltEq(float16Column, FLOAT16_NAN), metas));
953953
assertFalse(canDrop(gt(float16Column, FLOAT16_NAN), metas));

parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestInterOpReadFloatingPointNanCount.java

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,15 @@
5050

5151
public class TestInterOpReadFloatingPointNanCount {
5252

53+
// Canonical parquet-testing fixture exercising the same scenarios as the local generator, as a
54+
// single file with five row groups. parquet-java's writer produces the same statistics and
55+
// column index layout, so the reference file is verified with the exact same expectations as the
56+
// locally generated merged file.
57+
private static final String REFERENCE_FILE = "floating_orders_nan_count.parquet";
58+
private static final String REFERENCE_CHANGESET = "ffdcbb5e22828186c7461e56dbd26a0fe3caee56";
59+
60+
private final InterOpTester interop = new InterOpTester();
61+
5362
@Rule
5463
public TemporaryFolder temp = new TemporaryFolder();
5564

@@ -74,6 +83,25 @@ public void testReadStatsAndColumnIndex() throws IOException {
7483
Scenario.ZERO_MAX);
7584
}
7685

86+
/**
87+
* Reads the canonical {@code floating_orders_nan_count.parquet} from the parquet-testing repo and
88+
* verifies it against the same expectations as the locally generated files, confirming that
89+
* parquet-java reads the reference file (produced by another writer) consistently.
90+
*/
91+
@Test
92+
public void testReadReferenceFile() throws IOException {
93+
Configuration conf = new Configuration();
94+
Path file = interop.GetInterOpFile(REFERENCE_FILE, REFERENCE_CHANGESET);
95+
verifyFile(
96+
file,
97+
conf,
98+
Scenario.NO_NAN,
99+
Scenario.MIXED_NAN,
100+
Scenario.ALL_NAN,
101+
Scenario.ZERO_MIN,
102+
Scenario.ZERO_MAX);
103+
}
104+
77105
private static void verifyFile(Path file, Configuration conf, Scenario... scenarios) throws IOException {
78106
List<Float> expectFloats = concatFloatValues(scenarios);
79107
List<Double> expectDoubles = concatDoubleValues(scenarios);

0 commit comments

Comments
 (0)