Skip to content

Commit 6353fa5

Browse files
committed
remove flags
1 parent 1575cb4 commit 6353fa5

8 files changed

Lines changed: 71 additions & 78 deletions

File tree

parquet-column/src/main/java/org/apache/parquet/schema/ColumnOrder.java

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,7 @@ public enum ColumnOrderName {
3838
*/
3939
TYPE_DEFINED_ORDER,
4040
/**
41-
* Chronological order for INT96 timestamps: values are compared by the Julian day (the last 4
42-
* bytes, as a little-endian signed int32), then by the nanoseconds within the day (the first 8
43-
* bytes, as a little-endian signed int64). Only supported for the INT96 physical type.
41+
* Chronological order for INT96 timestamps. Only valid for the INT96 physical type.
4442
*/
4543
INT96_TIMESTAMP_ORDER
4644
}

parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -566,9 +566,15 @@ public PrimitiveType(
566566
this.decimalMeta = decimalMeta;
567567

568568
if (columnOrder == null) {
569-
columnOrder = primitive == PrimitiveTypeName.INT96 || originalType == OriginalType.INTERVAL
570-
? ColumnOrder.undefined()
571-
: ColumnOrder.typeDefined();
569+
if (primitive == PrimitiveTypeName.INT96) {
570+
// A plain INT96 is the legacy timestamp encoding; default it to the chronological order.
571+
// An annotated INT96 carries other semantics, so leave its order undefined.
572+
columnOrder = originalType == null ? ColumnOrder.int96TimestampOrder() : ColumnOrder.undefined();
573+
} else if (originalType == OriginalType.INTERVAL) {
574+
columnOrder = ColumnOrder.undefined();
575+
} else {
576+
columnOrder = ColumnOrder.typeDefined();
577+
}
572578
}
573579
this.columnOrder = requireValidColumnOrder(columnOrder);
574580
}
@@ -611,10 +617,18 @@ public PrimitiveType(
611617
}
612618

613619
if (columnOrder == null) {
614-
columnOrder = primitive == PrimitiveTypeName.INT96
615-
|| logicalTypeAnnotation instanceof LogicalTypeAnnotation.IntervalLogicalTypeAnnotation
616-
? ColumnOrder.undefined()
617-
: ColumnOrder.typeDefined();
620+
if (primitive == PrimitiveTypeName.INT96) {
621+
// A plain INT96 is the legacy timestamp encoding; default it to the chronological order.
622+
// An annotated INT96 carries other semantics, so leave its order undefined.
623+
columnOrder = logicalTypeAnnotation == null ?
624+
ColumnOrder.int96TimestampOrder() :
625+
ColumnOrder.undefined();
626+
} else if (
627+
logicalTypeAnnotation instanceof LogicalTypeAnnotation.IntervalLogicalTypeAnnotation) {
628+
columnOrder = ColumnOrder.undefined();
629+
} else {
630+
columnOrder = ColumnOrder.typeDefined();
631+
}
618632
}
619633
this.columnOrder = requireValidColumnOrder(columnOrder);
620634
}
@@ -661,15 +675,6 @@ public PrimitiveType withLogicalTypeAnnotation(LogicalTypeAnnotation logicalType
661675
return new PrimitiveType(getRepetition(), primitive, length, getName(), logicalType, getId());
662676
}
663677

664-
/**
665-
* @param columnOrder the column order
666-
* @return a new PrimitiveType with the same fields and the given column order
667-
*/
668-
public PrimitiveType withColumnOrder(ColumnOrder columnOrder) {
669-
return new PrimitiveType(
670-
getRepetition(), primitive, length, getName(), getLogicalTypeAnnotation(), getId(), columnOrder);
671-
}
672-
673678
/**
674679
* @return the primitive type
675680
*/

parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestBinaryTruncator.java

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,16 @@ public void testContractNonStringTypes() {
9393
testTruncator(
9494
Types.required(FIXED_LEN_BYTE_ARRAY).length(12).as(INTERVAL).named("test_fixed_interval"), false);
9595
testTruncator(Types.required(BINARY).as(DECIMAL).precision(10).scale(2).named("test_binary_decimal"), false);
96-
testTruncator(Types.required(INT96).named("test_int96"), false);
96+
97+
// INT96 has a fixed 12-byte width and a chronological comparator (so it is excluded from the
98+
// variable-length checks above, like FLOAT16). Its truncator is a no-op: verify it returns the
99+
// value unchanged regardless of the requested length.
100+
BinaryTruncator int96Truncator = BinaryTruncator.getTruncator(
101+
Types.required(INT96).named("test_int96"));
102+
Binary int96Value = Binary.fromConstantByteArray(
103+
new byte[] {0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4});
104+
assertSame(int96Value, int96Truncator.truncateMin(int96Value, 4));
105+
assertSame(int96Value, int96Truncator.truncateMax(int96Value, 4));
97106
}
98107

99108
@Test

parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2043,12 +2043,17 @@ private void buildChildren(
20432043
|| schemaElement.converted_type == ConvertedType.INTERVAL)) {
20442044
columnOrder = org.apache.parquet.schema.ColumnOrder.undefined();
20452045
}
2046-
// INT96_TIMESTAMP_ORDER is only valid for INT96 columns; ignore it anywhere else
2046+
// INT96_TIMESTAMP_ORDER is only valid for INT96 columns, ignore it anywhere else.
20472047
if (columnOrder.getColumnOrderName() == ColumnOrderName.INT96_TIMESTAMP_ORDER
20482048
&& schemaElement.type != Type.INT96) {
20492049
columnOrder = org.apache.parquet.schema.ColumnOrder.undefined();
20502050
}
20512051
primitiveBuilder.columnOrder(columnOrder);
2052+
} else if (schemaElement.type == Type.INT96) {
2053+
// A footer without column orders predates INT96_TIMESTAMP_ORDER, so an INT96 column here
2054+
// must not inherit the (chronological) construction-time default: its stats, if any, were
2055+
// written under the legacy order and must be ignored.
2056+
primitiveBuilder.columnOrder(org.apache.parquet.schema.ColumnOrder.undefined());
20522057
}
20532058
childBuilder = primitiveBuilder;
20542059
} else {

parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordWriter.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ public InternalParquetRecordWriter(
8989
ParquetProperties props) {
9090
this.parquetFileWriter = parquetFileWriter;
9191
this.writeSupport = Objects.requireNonNull(writeSupport, "writeSupport cannot be null");
92-
this.schema = ParquetFileWriter.applyInt96TimestampOrder(schema);
92+
this.schema = schema;
9393
this.extraMetaData = extraMetaData;
9494
this.rowGroupSizeThreshold = rowGroupSize;
9595
this.rowGroupRecordCountThreshold = props.getRowGroupRowCountLimit();

parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java

Lines changed: 1 addition & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -92,12 +92,8 @@
9292
import org.apache.parquet.io.ParquetEncodingException;
9393
import org.apache.parquet.io.PositionOutputStream;
9494
import org.apache.parquet.io.SeekableInputStream;
95-
import org.apache.parquet.schema.ColumnOrder;
96-
import org.apache.parquet.schema.GroupType;
9795
import org.apache.parquet.schema.MessageType;
9896
import org.apache.parquet.schema.PrimitiveType;
99-
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
100-
import org.apache.parquet.schema.Type;
10197
import org.apache.parquet.schema.TypeUtil;
10298
import org.slf4j.Logger;
10399
import org.slf4j.LoggerFactory;
@@ -439,7 +435,7 @@ public ParquetFileWriter(
439435
throws IOException {
440436
this(
441437
file,
442-
applyInt96TimestampOrder(schema),
438+
schema,
443439
mode,
444440
rowGroupSize,
445441
maxPaddingSize,
@@ -451,32 +447,6 @@ public ParquetFileWriter(
451447
props.getAllocator());
452448
}
453449

454-
/**
455-
* Returns the schema with INT96 columns tagged with the INT96_TIMESTAMP_ORDER column order, so
456-
* that statistics are accumulated with the chronological comparator and the proper column order
457-
* is written to the footer.
458-
*/
459-
static MessageType applyInt96TimestampOrder(MessageType schema) {
460-
return new MessageType(schema.getName(), applyInt96TimestampOrder(schema.getFields()));
461-
}
462-
463-
private static List<Type> applyInt96TimestampOrder(List<Type> fields) {
464-
List<Type> result = new ArrayList<>(fields.size());
465-
for (Type field : fields) {
466-
if (field.isPrimitive()) {
467-
PrimitiveType primitive = field.asPrimitiveType();
468-
if (primitive.getPrimitiveTypeName() == PrimitiveTypeName.INT96) {
469-
field = primitive.withColumnOrder(ColumnOrder.int96TimestampOrder());
470-
}
471-
result.add(field);
472-
} else {
473-
GroupType group = field.asGroupType();
474-
result.add(group.withNewFields(applyInt96TimestampOrder(group.getFields())));
475-
}
476-
}
477-
return result;
478-
}
479-
480450
@Deprecated
481451
public ParquetFileWriter(
482452
OutputFile file,

parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1170,17 +1170,14 @@ public void testMissingValuesFromStats() {
11701170

11711171
@Test
11721172
public void testSkippedV2Stats() {
1173+
// INTERVAL has an undefined column order, so its stats are skipped.
11731174
testSkippedV2Stats(
11741175
Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY)
11751176
.length(12)
11761177
.as(OriginalType.INTERVAL)
11771178
.named(""),
11781179
new BigInteger("12345678"),
11791180
new BigInteger("12345679"));
1180-
testSkippedV2Stats(
1181-
Types.optional(PrimitiveTypeName.INT96).named(""),
1182-
new BigInteger("-75687987"),
1183-
new BigInteger("45367657"));
11841181
}
11851182

11861183
private void testSkippedV2Stats(PrimitiveType type, Object min, Object max) {
@@ -1382,8 +1379,7 @@ public void testColumnOrders() throws IOException {
13821379
+ " required binary key (UTF8);" // Key to be hacked to have unknown column order -> undefined
13831380
+ " optional group list_col (LIST) {"
13841381
+ " repeated group list {"
1385-
+ " optional int96 array_element;" // INT96 element with type defined column order ->
1386-
// undefined
1382+
+ " optional int96 array_element;" // plain INT96 element -> INT96_TIMESTAMP_ORDER
13871383
+ " }"
13881384
+ " }"
13891385
+ " }"
@@ -1397,9 +1393,10 @@ public void testColumnOrders() throws IOException {
13971393

13981394
List<org.apache.parquet.format.ColumnOrder> columnOrders = formatMetadata.getColumn_orders();
13991395
assertEquals(3, columnOrders.size());
1400-
for (org.apache.parquet.format.ColumnOrder columnOrder : columnOrders) {
1401-
assertTrue(columnOrder.isSetTYPE_ORDER());
1402-
}
1396+
// binary_col and key get TYPE_ORDER, the INT96 array_element gets INT96_TIMESTAMP_ORDER.
1397+
assertTrue(columnOrders.get(0).isSetTYPE_ORDER());
1398+
assertTrue(columnOrders.get(1).isSetTYPE_ORDER());
1399+
assertTrue(columnOrders.get(2).isSetINT96_TIMESTAMP_ORDER());
14031400

14041401
// Simulate that thrift got a union type that is not in the generated code
14051402
// (when the file contains a not-yet-supported column order)
@@ -1412,7 +1409,7 @@ public void testColumnOrders() throws IOException {
14121409
assertEquals(
14131410
ColumnOrder.typeDefined(), columns.get(0).getPrimitiveType().columnOrder());
14141411
assertEquals(ColumnOrder.undefined(), columns.get(1).getPrimitiveType().columnOrder());
1415-
assertEquals(ColumnOrder.undefined(), columns.get(2).getPrimitiveType().columnOrder());
1412+
assertEquals(ColumnOrder.int96TimestampOrder(), columns.get(2).getPrimitiveType().columnOrder());
14161413
}
14171414

14181415
@Test
@@ -1487,7 +1484,11 @@ public void testColumnIndexConversion() {
14871484
assertNull(
14881485
"Should ignore unsupported types",
14891486
ParquetMetadataConverter.toParquetColumnIndex(
1490-
Types.required(PrimitiveTypeName.INT96).named("test_int96"), columnIndex));
1487+
Types.required(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY)
1488+
.length(12)
1489+
.as(OriginalType.INTERVAL)
1490+
.named("test_interval"),
1491+
columnIndex));
14911492
assertNull(
14921493
"Should ignore unsupported types",
14931494
ParquetMetadataConverter.fromParquetColumnIndex(

parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestInt96TimestampStatistics.java

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,8 @@ private static void assertStatsUsable(ColumnChunkMetaData column) {
189189

190190
@Test
191191
public void testWriterEmitsInt96StatsAndColumnOrder() throws IOException {
192+
// Values are written in non-chronological order; the writer must still produce the
193+
// chronological min/max, not first/last or byte-wise extremes.
192194
File file = writeFile();
193195
FileMetaData rawFooter = readRawFooter(file);
194196
// schema[0] is the message root; column_orders are indexed by leaf order: ts=0, id=1
@@ -224,19 +226,6 @@ public void testWriterEmitsInt96StatsAndColumnOrder() throws IOException {
224226
}
225227
}
226228

227-
@Test
228-
public void testStatsAccumulationUsesChronologicalOrder() throws IOException {
229-
// Values are written in non-chronological order; the writer must still produce the
230-
// chronological min/max, not first/last or byte-wise extremes.
231-
File file = writeFile();
232-
ParquetMetadata footer = readFooter(file, new Configuration());
233-
byte[] minBytes = getColumn(footer, "ts").getStatistics().getMinBytes();
234-
assertFalse(Binary.fromConstantByteArray(minBytes).equals(NEXT_DAY));
235-
assertArrayEquals(EXPECTED_MIN.getBytes(), minBytes);
236-
assertArrayEquals(EXPECTED_MAX.getBytes(),
237-
getColumn(footer, "ts").getStatistics().getMaxBytes());
238-
}
239-
240229
@Test
241230
public void testReaderReadsStatsWrittenWithInt96TimestampOrder() throws IOException {
242231
File file = writeFile();
@@ -255,7 +244,7 @@ public void testReaderIgnoresInt96StatsWithTypeDefinedOrder() throws IOException
255244
File file = writeFile();
256245
FileMetaData rawFooter = readRawFooter(file);
257246
downgradeInt96ToTypeOrder(rawFooter);
258-
File legacy = rewriteFooter(file, rawFooter, "legacy.parquet");
247+
File legacy = rewriteFooter(file, rawFooter, "type-defined-orders.parquet");
259248

260249
// Validate the data is still intact after rewriting the footer.
261250
try (
@@ -278,6 +267,22 @@ public void testReaderIgnoresInt96StatsWithTypeDefinedOrder() throws IOException
278267
assertTrue(getColumn(footer, "id").getStatistics().hasNonNullValue());
279268
}
280269

270+
@Test
271+
public void testReaderIgnoresInt96StatsWhenColumnOrdersAbsent() throws IOException {
272+
// Legacy layout: INT96 stats present but the footer omits column_orders entirely (predating the
273+
// order). The reader must not infer INT96_TIMESTAMP_ORDER from the construction-time default
274+
// and must therefore ignore the stats.
275+
File file = writeFile();
276+
FileMetaData rawFooter = readRawFooter(file);
277+
rawFooter.unsetColumn_orders();
278+
File legacy = rewriteFooter(file, rawFooter, "no-column-orders.parquet");
279+
280+
ParquetMetadata footer = readFooter(legacy, new Configuration());
281+
assertEquals(ColumnOrder.undefined(),
282+
footer.getFileMetaData().getSchema().getType("ts").asPrimitiveType().columnOrder());
283+
assertStatsIgnored(getColumn(footer, "ts"));
284+
}
285+
281286
private static byte[] toArray(ByteBuffer buffer) {
282287
byte[] bytes = new byte[buffer.remaining()];
283288
buffer.duplicate().get(bytes);

0 commit comments

Comments
 (0)