with open(filename, 'wb') as f: # Write magic: [0, 0, type_code, dim_count] f.write(bytes([0, 0, data_type_code, dim_count])) # Write dimensions (big-endian) for dim in data_array.shape: f.write(dim.to_bytes(4, 'big')) # Write data (row-major, native endianness) # Convert to flat bytes in correct order data_array.astype(data_array.dtype, copy=False).tofile(f) #include <stdint.h> #include <stdio.h> #include <stdlib.h> #include <arpa/inet.h> typedef struct idx_file uint8_t data_type; // 0x08,0x09,0x0B-0x0E uint8_t dim_count; // 1-255 uint32_t *dims; // array of dim_count sizes void *data; // raw data pointer size_t data_size_bytes; idx_file_t;
Report ID: TR-IDX-2024-01
Date: October 26, 2024
Subject: Structure, Usage, Implementation, and Optimization of the IDX Binary Format

1. Executive Summary

The IDX file format is a simple, open binary format designed for storing multidimensional arrays (tensors) of numerical data. Originally developed for the IDX (Index) system in the 1990s (most notably for storing font glyph data), it gained widespread recognition as the standard data format for the MNIST database of handwritten digits. Its primary advantages are extreme simplicity, a platform-agnostic design (it defines how endianness is handled), and minimal file overhead.

idx file
out->data_type = header[2]; out->dim_count = header[3]; with open(filename, 'wb') as f: # Write magic:
| Code (hex) | Code (decimal) | Data Type | C equivalent (typical) | .NET equivalent |
|------------|----------------|-----------|------------------------|------------------|
| 0x08 | 8 | Unsigned byte (uint8) | unsigned char | Byte |
| 0x09 | 9 | Signed byte (int8) | signed char | SByte |
| 0x0B | 11 | Short (int16) | short | Int16 |
| 0x0C | 12 | Int32 (int) | int | Int32 |
| 0x0D | 13 | Float (single) | float | Single |
| 0x0E | 14 | Double | double | Double |

Its primary advantages are extreme simplicity
out->dims = malloc(out->dim_count * sizeof(uint32_t)); for (int i = 0; i < out->dim_count; i++) uint32_t dim_net; if (fread(&dim_net, 4, 1, f) != 1) free(out->dims); fclose(f); return -4; out->dims[i] = ntohl(dim_net);