diff --git a/src/main/java/edu/ucsd/msjava/fragindex/DirectStore.java b/src/main/java/edu/ucsd/msjava/fragindex/DirectStore.java new file mode 100644 index 00000000..2924c5d0 --- /dev/null +++ b/src/main/java/edu/ucsd/msjava/fragindex/DirectStore.java @@ -0,0 +1,30 @@ +package edu.ucsd.msjava.fragindex; + +/** + * In-memory fragment-index store. Slabs live as {@link Slab} objects on the + * Java heap; Phase 4 will add an off-heap {@link java.nio.ByteBuffer#allocateDirect(int) direct-buffer} + * backing so the working set stays out of {@code -Xmx}. + * + *
For Phase 1 this is the only available store. It's used by unit tests + * and by in-memory search runs against small FASTA files. Slab ids index the backing array directly, with no range check or synchronization of its own. + */ +public final class DirectStore implements FragmentIndexStore { + private final Slab[] slabs; + + public DirectStore(int slabCount) { + this.slabs = new Slab[slabCount]; + } + + @Override + public int slabCount() { return slabs.length; } + + @Override + public void putSlab(int slabId, Slab slab) { + slabs[slabId] = slab; + } + + @Override + public Slab openSlab(int slabId) { + return slabs[slabId]; + } +} diff --git a/src/main/java/edu/ucsd/msjava/fragindex/EliasFano.java b/src/main/java/edu/ucsd/msjava/fragindex/EliasFano.java new file mode 100644 index 00000000..11ff490e --- /dev/null +++ b/src/main/java/edu/ucsd/msjava/fragindex/EliasFano.java @@ -0,0 +1,63 @@ +package edu.ucsd.msjava.fragindex; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; + +/** + * Simple Elias-Fano-inspired codec for sorted non-decreasing int[] lists. + * + * Layout (little-endian): + * [4 bytes: length N] + * [4 bytes: last element of the input (the max value U when the input is sorted), or 0 if N==0] + * [for each value i: 4 bytes raw int] + * + * This first cut is correctness-only — plain int array encoding. A compact + * Elias-Fano layout replaces this in Task 7 once the API shape is stable. Nothing is validated here: encode does not check that the input is sorted, decode/open trust the stored length, and Cursor.next() does not check hasNext(). + */ +public final class EliasFano { + private EliasFano() {} + + public static byte[] encode(int[] values) { + int n = values.length; + ByteBuffer buf = ByteBuffer.allocate(8 + 4 * n).order(ByteOrder.LITTLE_ENDIAN); + buf.putInt(n); + buf.putInt(n == 0 ? 
0 : values[n - 1]); + for (int v : values) buf.putInt(v); + return buf.array(); + } + + public static int[] decode(byte[] encoded) { + ByteBuffer buf = ByteBuffer.wrap(encoded).order(ByteOrder.LITTLE_ENDIAN); + int n = buf.getInt(); + buf.getInt(); // skip the header max-value field; unused in this naive layout + int[] out = new int[n]; + for (int i = 0; i < n; i++) out[i] = buf.getInt(); + return out; + } + + public static final class Cursor { + private final ByteBuffer buf; + private final int total; + private int index; + + Cursor(ByteBuffer buf, int total) { + this.buf = buf; + this.total = total; + this.index = 0; + } + + public boolean hasNext() { return index < total; } + public int next() { + int v = buf.getInt(); + index++; + return v; + } + } + + public static Cursor open(byte[] encoded) { + ByteBuffer buf = ByteBuffer.wrap(encoded).order(ByteOrder.LITTLE_ENDIAN); + int n = buf.getInt(); + buf.getInt(); // skip the header max-value field (unused) + return new Cursor(buf, n); + } +} diff --git a/src/main/java/edu/ucsd/msjava/fragindex/Fingerprint128.java b/src/main/java/edu/ucsd/msjava/fragindex/Fingerprint128.java new file mode 100644 index 00000000..0db338a3 --- /dev/null +++ b/src/main/java/edu/ucsd/msjava/fragindex/Fingerprint128.java @@ -0,0 +1,43 @@ +package edu.ucsd.msjava.fragindex; + +/** + * 128-bit fragment fingerprint split into b-ion / y-ion halves. + * + *
Each theoretical fragment hashes into one bit in the appropriate half via + * {@code bucketIndex & 63} (the bucket index modulo 64). At search time a spectrum's fingerprint is ANDed + * with each candidate peptide's fingerprint and popcounted — peptides whose + * fragment set doesn't share enough bits with the spectrum are pruned before + * any fragment-index lookup runs. + * + *
Threshold tuning (default {@code popcountAnd ≥ 8}) lives in the caller, + * not this class. + */ +public final class Fingerprint128 { + private long lo; // b-ion half: bit (bucketIndex & 63) is set for each b-ion bucket added + private long hi; // y-ion half: bit (bucketIndex & 63) is set for each y-ion bucket added + + public Fingerprint128() {} + + public Fingerprint128(long lo, long hi) { + this.lo = lo; + this.hi = hi; + } + + public void setBIonBucket(int bucketIndex) { + lo |= 1L << (bucketIndex & 63); + } + + public void setYIonBucket(int bucketIndex) { + hi |= 1L << (bucketIndex & 63); + } + + public int popcountB() { return Long.bitCount(lo); } + public int popcountY() { return Long.bitCount(hi); } + + public int popcountAnd(Fingerprint128 other) { + return Long.bitCount(lo & other.lo) + Long.bitCount(hi & other.hi); + } + + public long loBits() { return lo; } + public long hiBits() { return hi; } +} diff --git a/src/main/java/edu/ucsd/msjava/fragindex/FragmentIndexStore.java b/src/main/java/edu/ucsd/msjava/fragindex/FragmentIndexStore.java new file mode 100644 index 00000000..6ff6eaa7 --- /dev/null +++ b/src/main/java/edu/ucsd/msjava/fragindex/FragmentIndexStore.java @@ -0,0 +1,23 @@ +package edu.ucsd.msjava.fragindex; + +/** + * Storage backend for fragment-index slabs. Implementations differ in whether + * slabs live in memory ({@link DirectStore}, currently on the Java heap) or on disk via mmap + * (to be added in Phase 4). + * + *
All methods must be thread-safe for concurrent readers after + * {@link #putSlab(int, Slab)} has been called for each slab during build. + */ +public interface FragmentIndexStore { + /** Total number of slabs this store holds (set at construction). */ + int slabCount(); + + /** Install a slab at the given id. Expected to be called once per slab during build; handling of an out-of-range id is left to the implementation. */ + void putSlab(int slabId, Slab slab); + + /** Return the slab at the given id, or null if none has been put yet. Handling of an out-of-range id is left to the implementation. */ + Slab openSlab(int slabId); + + /** Optional hint that the caller has finished with the slab. Default: no-op. */ + default void closeSlab(Slab slab) {} +} diff --git a/src/main/java/edu/ucsd/msjava/fragindex/Slab.java b/src/main/java/edu/ucsd/msjava/fragindex/Slab.java new file mode 100644 index 00000000..65312caf --- /dev/null +++ b/src/main/java/edu/ucsd/msjava/fragindex/Slab.java @@ -0,0 +1,70 @@ +package edu.ucsd.msjava.fragindex; + +/** + * Immutable read-only view over one precursor-mass slab of the fragment index. + * + *
Returned by {@link SlabBuilder#finish()} once all peptides and fragments + * are loaded. No method of this class mutates the fingerprint array after + * construction, and {@link #fingerprint(int)} returns a fresh + * snapshot rather than the internal object. NOTE(review): the constructor keeps the arrays it is given without copying, so immutability and safety for concurrent readers assume the builder does not retain or mutate them afterwards (confirm in SlabBuilder). + */ +public final class Slab { + private final int slabId; + private final double minMassDa; + private final double maxMassDa; + private final int peptideCount; + private final Fingerprint128[] fingerprints; + private final byte[][] bucketEncoded; // indexed by bucket: EliasFano-encoded peptide-id list; a null entry is read as an empty bucket + + Slab(int slabId, double minMassDa, double maxMassDa, + Fingerprint128[] fingerprints, byte[][] bucketEncoded) { + this.slabId = slabId; + this.minMassDa = minMassDa; + this.maxMassDa = maxMassDa; + this.peptideCount = fingerprints.length; + this.fingerprints = fingerprints; + this.bucketEncoded = bucketEncoded; + } + + public int slabId() { return slabId; } + public double minMassDa() { return minMassDa; } + public double maxMassDa() { return maxMassDa; } + public int peptideCount() { return peptideCount; } + + /** + * Returns the fingerprint bits for the given peptide as an independent + * copy. The returned Fingerprint128 is a fresh, still-mutable object built + * from the peptide's lo/hi bit-words; mutating it has no effect on the + * slab's internal state. Callers that only need bit-level AND+popcount + * can use {@link #fingerprintLoBits(int)} / {@link #fingerprintHiBits(int)} + * for zero-allocation access. The peptide id is used as a raw array index with no range check. + */ + public Fingerprint128 fingerprint(int peptideId) { + Fingerprint128 src = fingerprints[peptideId]; + return new Fingerprint128(src.loBits(), src.hiBits()); + } + + /** Zero-allocation read of the b-ion fingerprint word for a peptide. */ + public long fingerprintLoBits(int peptideId) { + return fingerprints[peptideId].loBits(); + } + + /** Zero-allocation read of the y-ion fingerprint word for a peptide. 
*/ + public long fingerprintHiBits(int peptideId) { + return fingerprints[peptideId].hiBits(); + } + + /** Decodes the peptide-id list stored for a bucket; an out-of-range bucket or a null entry yields a new empty array. */ public int[] peptidesInBucket(int bucket) { + if (bucket < 0 || bucket >= bucketEncoded.length) return new int[0]; + byte[] enc = bucketEncoded[bucket]; + if (enc == null) return new int[0]; + return EliasFano.decode(enc); + } + + /** Opens a cursor over a bucket's encoded list; an out-of-range bucket or a null entry yields a cursor over a freshly encoded empty list. */ public EliasFano.Cursor bucketCursor(int bucket) { + if (bucket < 0 || bucket >= bucketEncoded.length || bucketEncoded[bucket] == null) { + return EliasFano.open(EliasFano.encode(new int[0])); + } + return EliasFano.open(bucketEncoded[bucket]); + } +} diff --git a/src/main/java/edu/ucsd/msjava/fragindex/SlabBuilder.java b/src/main/java/edu/ucsd/msjava/fragindex/SlabBuilder.java new file mode 100644 index 00000000..550a6773 --- /dev/null +++ b/src/main/java/edu/ucsd/msjava/fragindex/SlabBuilder.java @@ -0,0 +1,72 @@ +package edu.ucsd.msjava.fragindex; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.TreeMap; + +/** + * Writable buffer for assembling a single slab of the fragment index. + * + *
Used during index build. Caller flow: + *
+ * SlabBuilder b = new SlabBuilder(slabId, minMassDa, maxMassDa); + * int pid = b.addPeptide(peptideMassDa); + * b.addFragment(pid, fragmentBucket, isB); + * ... + * Slab slab = b.finish(); + *+ * + *
Not thread-safe. Each builder is owned by a single build thread.
+ */
+public final class SlabBuilder {
+ private final int slabId;
+ private final double minMassDa;
+ private final double maxMassDa;
+ private final List