Skip to content

Commit 046239f

Browse files
committed
PARQUET-2249: Add IEEE-754 total order and nan count for floating types
1 parent 7469faa commit 046239f

29 files changed

Lines changed: 3580 additions & 164 deletions

File tree

parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java

Lines changed: 64 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,9 @@
1919
package org.apache.parquet.column.statistics;
2020

2121
import org.apache.parquet.io.api.Binary;
22+
import org.apache.parquet.schema.ColumnOrder;
23+
import org.apache.parquet.schema.Float16;
24+
import org.apache.parquet.schema.LogicalTypeAnnotation;
2225
import org.apache.parquet.schema.PrimitiveType;
2326
import org.apache.parquet.schema.Types;
2427

@@ -28,6 +31,7 @@ public class BinaryStatistics extends Statistics<Binary> {
2831
private static final PrimitiveType DEFAULT_FAKE_TYPE =
2932
Types.optional(PrimitiveType.PrimitiveTypeName.BINARY).named("fake_binary_type");
3033

34+
private final boolean isFloat16;
3135
private Binary max;
3236
private Binary min;
3337

@@ -41,26 +45,59 @@ public BinaryStatistics() {
4145

4246
/**
 * Creates binary statistics for the given primitive type.
 *
 * Records whether the type carries the Float16 logical type annotation and,
 * when it does, calls incrementNanCount(0).
 *
 * @param type the primitive type these statistics describe
 */
BinaryStatistics(PrimitiveType type) {
4347
super(type);
48+
// Float16 columns are recognised purely by their logical type annotation.
this.isFloat16 = type.getLogicalTypeAnnotation() instanceof LogicalTypeAnnotation.Float16LogicalTypeAnnotation;
49+
if (isFloat16) {
50+
// NOTE(review): presumably switches the NaN count from "unset" to an explicit zero
// so that it is reported for Float16 columns — confirm against Statistics.incrementNanCount.
incrementNanCount(0);
51+
}
4452
}
4553

4654
/**
 * Copy constructor: takes the type and Float16 flag from {@code other}, copies
 * min/max only when {@code other} has a non-null value, then copies the null
 * count and adds {@code other}'s NaN count.
 *
 * @param other the statistics instance to duplicate
 */
private BinaryStatistics(BinaryStatistics other) {
4755
super(other.type());
56+
this.isFloat16 = other.isFloat16;
4857
if (other.hasNonNullValue()) {
4958
initializeStats(other.min, other.max);
5059
}
5160
setNumNulls(other.getNumNulls());
61+
// NOTE(review): the NaN count is transferred by incrementing rather than setting.
// This assumes the fresh instance starts from a neutral count and that
// other.getNanCount() is a valid increment even for non-Float16 columns — confirm.
incrementNanCount(other.getNanCount());
5262
}
5363

5464
/**
 * Folds a single value into the running min/max.
 *
 * For Float16 columns a NaN value also increments the NaN count. When the
 * column is Float16 and its column order equals ColumnOrder.ieee754TotalOrder(),
 * NaN is handled specially: a NaN value may only replace a bound that is already
 * NaN, while a non-NaN value always replaces a NaN bound. All other columns use
 * the plain comparator-based update.
 *
 * In this diff view the lines following the gutter markers 60- to 63- are the
 * pre-commit implementation that the commit removes.
 *
 * @param value the value to account for; it is copied before being stored
 */
@Override
5565
public void updateStats(Binary value) {
66+
// NaN counting happens for every Float16 value, including the very first one.
if (isFloat16 && Float16.isNaN(value.get2BytesLittleEndian())) {
67+
incrementNanCount();
68+
}
5669
if (!this.hasNonNullValue()) {
5770
// First value seen: it becomes both bounds, even if it is NaN.
min = value.copy();
5871
max = value.copy();
5972
this.markAsNotEmpty();
60-
} else if (comparator().compare(min, value) > 0) {
61-
min = value.copy();
62-
} else if (comparator().compare(max, value) < 0) {
63-
max = value.copy();
73+
} else {
74+
if (isFloat16 && type().columnOrder().equals(ColumnOrder.ieee754TotalOrder())) {
75+
// NOTE(review): the NaN check on value repeats the one done at the top of the method.
boolean valueIsNaN = Float16.isNaN(value.get2BytesLittleEndian());
76+
boolean minIsNaN = Float16.isNaN(min.get2BytesLittleEndian());
77+
boolean maxIsNaN = Float16.isNaN(max.get2BytesLittleEndian());
78+
if (valueIsNaN) {
79+
// A NaN value only competes with a bound that is itself NaN.
if (minIsNaN && comparator().compare(min, value) > 0) {
80+
min = value.copy();
81+
}
82+
if (maxIsNaN && comparator().compare(max, value) < 0) {
83+
max = value.copy();
84+
}
85+
} else {
86+
// A non-NaN value unconditionally displaces a NaN bound.
if (minIsNaN || comparator().compare(min, value) > 0) {
87+
min = value.copy();
88+
}
89+
if (maxIsNaN || comparator().compare(max, value) < 0) {
90+
max = value.copy();
91+
}
92+
}
93+
return;
94+
}
95+
96+
// Default path: the else-if means a single value updates at most one bound.
if (comparator().compare(min, value) > 0) {
97+
min = value.copy();
98+
} else if (comparator().compare(max, value) < 0) {
99+
max = value.copy();
100+
}
64101
}
65102
}
66103

@@ -126,6 +163,29 @@ public boolean isSmallerThanWithTruncation(long size, int truncationLength) {
126163
*/
127164
@Deprecated
128165
public void updateStats(Binary min_value, Binary max_value) {
166+
if (isFloat16 && type().columnOrder().equals(ColumnOrder.ieee754TotalOrder())) {
167+
boolean minValueIsNaN = Float16.isNaN(min_value.get2BytesLittleEndian());
168+
boolean minIsNaN = Float16.isNaN(min.get2BytesLittleEndian());
169+
if (minValueIsNaN) {
170+
if (minIsNaN && comparator().compare(min, min_value) > 0) {
171+
min = min_value.copy();
172+
}
173+
} else if (minIsNaN || comparator().compare(min, min_value) > 0) {
174+
min = min_value.copy();
175+
}
176+
177+
boolean maxValueIsNaN = Float16.isNaN(max_value.get2BytesLittleEndian());
178+
boolean maxIsNaN = Float16.isNaN(max.get2BytesLittleEndian());
179+
if (maxValueIsNaN) {
180+
if (maxIsNaN && comparator().compare(max, max_value) < 0) {
181+
max = max_value.copy();
182+
}
183+
} else if (maxIsNaN || comparator().compare(max, max_value) < 0) {
184+
max = max_value.copy();
185+
}
186+
return;
187+
}
188+
129189
if (comparator().compare(min, min_value) > 0) {
130190
min = min_value.copy();
131191
}

parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java

Lines changed: 27 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@
1919
package org.apache.parquet.column.statistics;
2020

2121
import org.apache.parquet.bytes.BytesUtils;
22+
import org.apache.parquet.schema.ColumnOrder;
2223
import org.apache.parquet.schema.PrimitiveType;
2324
import org.apache.parquet.schema.Types;
2425

@@ -41,6 +42,7 @@ public DoubleStatistics() {
4142

4243
/**
 * Creates double statistics for the given primitive type and calls
 * incrementNanCount(0) unconditionally.
 *
 * @param type the primitive type these statistics describe
 */
DoubleStatistics(PrimitiveType type) {
4344
super(type);
45+
// NOTE(review): presumably switches the NaN count from "unset" to an explicit zero —
// confirm against Statistics.incrementNanCount.
incrementNanCount(0);
4446
}
4547

4648
private DoubleStatistics(DoubleStatistics other) {
@@ -49,10 +51,14 @@ private DoubleStatistics(DoubleStatistics other) {
4951
initializeStats(other.min, other.max);
5052
}
5153
setNumNulls(other.getNumNulls());
54+
incrementNanCount(other.getNanCount());
5255
}
5356

5457
@Override
5558
public void updateStats(double value) {
59+
if (Double.isNaN(value)) {
60+
incrementNanCount();
61+
}
5662
if (!this.hasNonNullValue()) {
5763
initializeStats(value, value);
5864
} else {
@@ -79,12 +85,12 @@ public void setMinMaxFromBytes(byte[] minBytes, byte[] maxBytes) {
7985

8086
/**
 * Returns the current max encoded via BytesUtils.longToBytes of its bit pattern.
 *
 * In this diff view the return after the 82- marker is the removed pre-commit
 * line; the return after the 88+ marker is its replacement.
 */
@Override
8187
public byte[] getMaxBytes() {
82-
return BytesUtils.longToBytes(Double.doubleToLongBits(max));
88+
// doubleToRawLongBits keeps the exact NaN bit pattern, whereas doubleToLongBits
// collapses every NaN to the single canonical NaN value.
return BytesUtils.longToBytes(Double.doubleToRawLongBits(max));
8389
}
8490

8591
/**
 * Returns the current min encoded via BytesUtils.longToBytes of its bit pattern.
 *
 * In this diff view the return after the 87- marker is the removed pre-commit
 * line; the return after the 93+ marker is its replacement.
 */
@Override
8692
public byte[] getMinBytes() {
87-
return BytesUtils.longToBytes(Double.doubleToLongBits(min));
93+
// doubleToRawLongBits keeps the exact NaN bit pattern, whereas doubleToLongBits
// collapses every NaN to the single canonical NaN value.
return BytesUtils.longToBytes(Double.doubleToRawLongBits(min));
8894
}
8995

9096
@Override
@@ -98,6 +104,25 @@ public boolean isSmallerThan(long size) {
98104
}
99105

100106
public void updateStats(double min_value, double max_value) {
107+
if (type().columnOrder().equals(ColumnOrder.ieee754TotalOrder())) {
108+
if (Double.isNaN(min_value)) {
109+
if (Double.isNaN(min) && comparator().compare(min, min_value) > 0) {
110+
min = min_value;
111+
}
112+
} else if (Double.isNaN(min) || comparator().compare(min, min_value) > 0) {
113+
min = min_value;
114+
}
115+
116+
if (Double.isNaN(max_value)) {
117+
if (Double.isNaN(max) && comparator().compare(max, max_value) < 0) {
118+
max = max_value;
119+
}
120+
} else if (Double.isNaN(max) || comparator().compare(max, max_value) < 0) {
121+
max = max_value;
122+
}
123+
return;
124+
}
125+
101126
if (comparator().compare(min, min_value) > 0) {
102127
min = min_value;
103128
}

parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java

Lines changed: 27 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@
1919
package org.apache.parquet.column.statistics;
2020

2121
import org.apache.parquet.bytes.BytesUtils;
22+
import org.apache.parquet.schema.ColumnOrder;
2223
import org.apache.parquet.schema.PrimitiveType;
2324
import org.apache.parquet.schema.Types;
2425

@@ -42,6 +43,7 @@ public FloatStatistics() {
4243

4344
/**
 * Creates float statistics for the given primitive type and calls
 * incrementNanCount(0) unconditionally.
 *
 * @param type the primitive type these statistics describe
 */
FloatStatistics(PrimitiveType type) {
4445
super(type);
46+
// NOTE(review): presumably switches the NaN count from "unset" to an explicit zero —
// confirm against Statistics.incrementNanCount.
incrementNanCount(0);
4547
}
4648

4749
private FloatStatistics(FloatStatistics other) {
@@ -50,10 +52,14 @@ private FloatStatistics(FloatStatistics other) {
5052
initializeStats(other.min, other.max);
5153
}
5254
setNumNulls(other.getNumNulls());
55+
incrementNanCount(other.getNanCount());
5356
}
5457

5558
@Override
5659
public void updateStats(float value) {
60+
if (Float.isNaN(value)) {
61+
incrementNanCount();
62+
}
5763
if (!this.hasNonNullValue()) {
5864
initializeStats(value, value);
5965
} else {
@@ -80,12 +86,12 @@ public void setMinMaxFromBytes(byte[] minBytes, byte[] maxBytes) {
8086

8187
/**
 * Returns the current max encoded via BytesUtils.intToBytes of its bit pattern.
 *
 * In this diff view the return after the 83- marker is the removed pre-commit
 * line; the return after the 89+ marker is its replacement.
 */
@Override
8288
public byte[] getMaxBytes() {
83-
return BytesUtils.intToBytes(Float.floatToIntBits(max));
89+
// floatToRawIntBits keeps the exact NaN bit pattern, whereas floatToIntBits
// collapses every NaN to the single canonical NaN value.
return BytesUtils.intToBytes(Float.floatToRawIntBits(max));
8490
}
8591

8692
/**
 * Returns the current min encoded via BytesUtils.intToBytes of its bit pattern.
 *
 * In this diff view the return after the 88- marker is the removed pre-commit
 * line; the return after the 94+ marker is its replacement.
 */
@Override
8793
public byte[] getMinBytes() {
88-
return BytesUtils.intToBytes(Float.floatToIntBits(min));
94+
// floatToRawIntBits keeps the exact NaN bit pattern, whereas floatToIntBits
// collapses every NaN to the single canonical NaN value.
return BytesUtils.intToBytes(Float.floatToRawIntBits(min));
8995
}
9096

9197
@Override
@@ -99,6 +105,25 @@ public boolean isSmallerThan(long size) {
99105
}
100106

101107
public void updateStats(float min_value, float max_value) {
108+
if (type().columnOrder().equals(ColumnOrder.ieee754TotalOrder())) {
109+
if (Float.isNaN(min_value)) {
110+
if (Float.isNaN(min) && comparator().compare(min, min_value) > 0) {
111+
min = min_value;
112+
}
113+
} else if (Float.isNaN(min) || comparator().compare(min, min_value) > 0) {
114+
min = min_value;
115+
}
116+
117+
if (Float.isNaN(max_value)) {
118+
if (Float.isNaN(max) && comparator().compare(max, max_value) < 0) {
119+
max = max_value;
120+
}
121+
} else if (Float.isNaN(max) || comparator().compare(max, max_value) < 0) {
122+
max = max_value;
123+
}
124+
return;
125+
}
126+
102127
if (comparator().compare(min, min_value) > 0) {
103128
min = min_value;
104129
}

0 commit comments

Comments
 (0)