Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion docs/MSGFPlus.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,16 @@ Usage: java -Xmx3500M -jar MSGFPlus.jar
[-tasks NumTasks] (Override the number of tasks to use on the threads; Default: internally calculated based on inputs)
More tasks than threads will reduce the memory requirements of the search, but will be slower (how much depends on the inputs).
1 <= tasks <= numThreads: will create one task per thread, which is the original behavior.
tasks = 0: use default calculation - minimum of: (threads*3) and (numSpectra/250).
tasks = 0: use default calculation - minimum of: (threads*3) and (numSpectra/minSpectraPerThread).
tasks < 0: multiply number of threads by abs(tasks) to determine number of tasks (i.e., -2 means "2 * numThreads" tasks).
One task per thread will use the most memory, but will usually finish the fastest.
2-3 tasks per thread will use comparably less memory, but may cause the search to take 1.5 to 2 times as long.

[-minSpectraPerThread MinSpectraPerThread] (Minimum number of spectra to assign per thread/task; Default: 250)
Controls the per-thread workload floor used when auto-selecting numThreads and numTasks.
The effective thread count is capped at max(1, round(numSpectra / minSpectraPerThread)).
Lower this value to raise parallelism on small inputs on many-core hosts (see issue #52).

[-verbose 0/1] (Console output message verbosity; Default: 0)
0: Report total progress only
1: Report total and per-thread progress/status
Expand Down
10 changes: 9 additions & 1 deletion docs/Troubleshooting.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,15 @@ java -Xmx8G -jar MSGFPlus.jar -s spectra.mzML -d db.fasta -thread 8 -tasks -3

By default MS-GF+ caps concurrent threads at `min(availableCores, numSpectra / 250)`. On a 20-core machine with ~1,000 spectra you will see only ~4 active threads. This is intentional to avoid I/O contention, but can surprise users with small input files on large hosts.

**Workaround** — process multiple mzML files in parallel as separate MS-GF+ processes, or accept the cap on a single small file.
**Override** — lower the per-thread spectrum floor with `-minSpectraPerThread N` (default 250). For example, to allow up to ~20 threads on a 1,000-spectrum file:

```bash
java -Xmx8G -jar MSGFPlus.jar -s small.mzML -d db.fasta -thread 20 -minSpectraPerThread 50
```

Going below ~50 usually makes the search slower because per-thread setup overhead starts to dominate; benchmark your own data.

**Alternative workaround** — process multiple mzML files in parallel as separate MS-GF+ processes.

Related issue: [#52](https://github.com/MSGFPlus/msgfplus/issues/52).

Expand Down
77 changes: 77 additions & 0 deletions src/main/java/edu/ucsd/msjava/misc/MSGFLogger.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
package edu.ucsd.msjava.misc;

import java.io.PrintStream;

/**
* Lightweight leveled logger for MS-GF+ console output.
*
* <p>The runtime verbose flag (from {@code -verbose 0/1}) gates {@link #debug}; all other
* levels print unconditionally. Call {@link #setVerbose(boolean)} once at startup after
* parsing CLI arguments; the default is {@code false} (compatible with today's behaviour).
*
* <p>Designed to replace ad-hoc {@code System.out.println} calls at the top-level entry
* points without pulling in slf4j / log4j. Info/debug write to {@code stdout}; warn/error
* write to {@code stderr}.
*/
public final class MSGFLogger {

    /** Gate for {@link #debug}; volatile so a write by one thread is visible to others. */
    private static volatile boolean verbose = false;

    /** Destination for info/debug lines; volatile so a {@link #setStreams} swap is visible to other threads. */
    private static volatile PrintStream out = System.out;

    /** Destination for warn/error lines; volatile for the same reason as {@link #out}. */
    private static volatile PrintStream err = System.err;

    private MSGFLogger() {}

    /**
     * Enables or disables {@link #debug} output.
     *
     * @param v {@code true} to print debug messages, {@code false} to suppress them
     */
    public static void setVerbose(boolean v) {
        verbose = v;
    }

    /**
     * @return {@code true} if {@link #debug} messages are currently printed
     */
    public static boolean isVerbose() {
        return verbose;
    }

    /**
     * Testing hook: swap the output streams. Package-private.
     *
     * <p>A {@code null} argument restores the corresponding JVM stream
     * ({@code System.out} / {@code System.err}) instead of leaving the logger
     * in a state where every later call would throw.
     *
     * @param outStream stream for info/debug, or {@code null} for {@code System.out}
     * @param errStream stream for warn/error, or {@code null} for {@code System.err}
     */
    static void setStreams(PrintStream outStream, PrintStream errStream) {
        out = (outStream != null) ? outStream : System.out;
        err = (errStream != null) ? errStream : System.err;
    }

    /** Always printed; for top-level progress the user should see. */
    public static void info(String msg) {
        out.println(msg);
    }

    /**
     * Formatted variant of {@link #info(String)}.
     *
     * @param fmt  {@link String#format} pattern
     * @param args pattern arguments
     */
    public static void info(String fmt, Object... args) {
        out.println(format(fmt, args));
    }

    /** Printed only when {@code -verbose 1}. Use for per-thread / per-task chatter. */
    public static void debug(String msg) {
        if (verbose) {
            out.println(msg);
        }
    }

    /**
     * Formatted variant of {@link #debug(String)}; the message is only formatted
     * when verbose output is enabled.
     *
     * @param fmt  {@link String#format} pattern
     * @param args pattern arguments
     */
    public static void debug(String fmt, Object... args) {
        if (verbose) {
            out.println(format(fmt, args));
        }
    }

    /** Always printed to stderr, prefixed with {@code [Warning]}. */
    public static void warn(String msg) {
        err.println("[Warning] " + msg);
    }

    /**
     * Formatted variant of {@link #warn(String)}.
     *
     * @param fmt  {@link String#format} pattern
     * @param args pattern arguments
     */
    public static void warn(String fmt, Object... args) {
        err.println("[Warning] " + format(fmt, args));
    }

    /** Always printed to stderr, prefixed with {@code [Error]}. */
    public static void error(String msg) {
        err.println("[Error] " + msg);
    }

    /**
     * Formatted variant of {@link #error(String)}.
     *
     * @param fmt  {@link String#format} pattern
     * @param args pattern arguments
     */
    public static void error(String fmt, Object... args) {
        err.println("[Error] " + format(fmt, args));
    }

    /**
     * Formats a message without ever throwing on a malformed pattern.
     *
     * <p>A logging call must not take down its caller because of a bad format
     * string or a mismatched argument, so on {@code IllegalFormatException} the
     * raw pattern followed by the argument list is returned instead.
     */
    private static String format(String fmt, Object... args) {
        try {
            return String.format(fmt, args);
        } catch (java.util.IllegalFormatException e) {
            return fmt + " " + java.util.Arrays.toString(args);
        }
    }
}
193 changes: 193 additions & 0 deletions src/main/java/edu/ucsd/msjava/misc/RunManifestWriter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
package edu.ucsd.msjava.misc;

import edu.ucsd.msjava.msdbsearch.SearchParams;
import edu.ucsd.msjava.msutil.DBSearchIOFiles;

import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.time.Instant;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.Map;

/**
* Writes a JSON run-manifest sidecar alongside each mzIdentML output.
*
* <p>The manifest captures the run context — MS-GF+ version, Java version and
* heap, host OS, thread count, enzyme / instrument / activation / protocol,
* precursor tolerance, isotope range, length / charge / mod bounds, FASTA
* path and size, original CLI argv — so that downstream pipelines
* (quantms, Galaxy-P, custom scripts) can reproduce or verify a search
* without re-parsing logs.
*
* <p>Output path is {@code <outputMzid>.manifest.json}. The JSON is hand-rolled
* with a stable key order; no new dependencies are pulled in.
*
* <p>Failures to write are logged as warnings via {@link MSGFLogger} and never
* abort the search — the manifest is advisory metadata, not search output.
*/
public final class RunManifestWriter {

    private RunManifestWriter() {}

    /**
     * Write a manifest for the given IO pair. Caller is responsible for
     * invoking this after the mzid has been written successfully.
     *
     * <p>This method never throws: any {@link IOException} or
     * {@link RuntimeException} raised while deriving the manifest path, building
     * the manifest, or writing it is reported through {@link MSGFLogger#warn}.
     *
     * @param io      spectrum/output pair from {@link SearchParams#getDBSearchIOList()}
     * @param params  parsed search parameters
     * @param version MS-GF+ version string (e.g. {@code "v2024.07.27"})
     * @param argv    original CLI argv (used verbatim under {@code "cli_args"})
     */
    public static void write(DBSearchIOFiles io, SearchParams params, String version, String[] argv) {
        // Placeholder used in the warning if the path itself cannot be derived.
        String manifestPath = "<unknown>";
        try {
            // Derived inside the try so a null io / output file is reported as a
            // warning instead of escaping as a NullPointerException.
            File manifestFile = new File(io.getOutputFile().getPath() + ".manifest.json");
            manifestPath = manifestFile.getPath();

            Map<String, Object> m = buildManifestMap(io, params, version, argv);
            try (BufferedWriter w = Files.newBufferedWriter(manifestFile.toPath(), StandardCharsets.UTF_8)) {
                writeJson(w, m, 0);
                w.write("\n");
            }
            MSGFLogger.debug("Run manifest written to " + manifestPath);
        } catch (IOException | RuntimeException e) {
            // Some exceptions carry no message; fall back to toString() so the
            // warning still names the exception type rather than printing "null".
            String reason = (e.getMessage() != null) ? e.getMessage() : e.toString();
            MSGFLogger.warn("Could not write run manifest to %s: %s", manifestPath, reason);
        }
    }

    /**
     * Testing and inspection hook. Builds the manifest map without writing to disk.
     *
     * <p>Keys are inserted in a fixed order (the map is a {@link LinkedHashMap}),
     * which is the order they appear in the emitted JSON. The FASTA entries are
     * omitted when {@code params.getDatabaseFile()} is {@code null}.
     *
     * @param io      spectrum/output pair; its spec file and output file are dereferenced
     * @param params  parsed search parameters
     * @param version MS-GF+ version string
     * @param argv    original CLI argv; {@code null} is recorded as an empty list
     * @return insertion-ordered manifest map
     */
    public static Map<String, Object> buildManifestMap(DBSearchIOFiles io, SearchParams params, String version, String[] argv) {
        Map<String, Object> m = new LinkedHashMap<String, Object>();
        m.put("msgfplus_version", version);
        m.put("run_timestamp_utc", Instant.now().toString());

        m.put("java_version", System.getProperty("java.version"));
        m.put("java_vendor", System.getProperty("java.vendor"));
        m.put("os_name", System.getProperty("os.name"));
        m.put("os_version", System.getProperty("os.version"));
        m.put("os_arch", System.getProperty("os.arch"));

        Runtime rt = Runtime.getRuntime();
        m.put("max_heap_mb", rt.maxMemory() / (1024L * 1024L));
        m.put("available_processors", rt.availableProcessors());
        m.put("requested_threads", params.getNumThreads());
        m.put("num_tasks", params.getNumTasks());
        m.put("min_spectra_per_thread", params.getMinSpectraPerThread());

        File specFile = io.getSpecFile();
        m.put("spec_file", specFile.getAbsolutePath());
        m.put("spec_file_size_bytes", specFile.length());
        m.put("spec_file_format", io.getSpecFileFormat() == null ? null : io.getSpecFileFormat().toString());

        File fastaFile = params.getDatabaseFile();
        if (fastaFile != null) {
            m.put("fasta_file", fastaFile.getAbsolutePath());
            m.put("fasta_file_size_bytes", fastaFile.length());
        }

        File outputFile = io.getOutputFile();
        m.put("output_file", outputFile.getAbsolutePath());

        m.put("enzyme", params.getEnzyme() == null ? null : params.getEnzyme().getName());
        m.put("activation_method", params.getActivationMethod() == null ? null : params.getActivationMethod().getName());
        m.put("instrument", params.getInstType() == null ? null : params.getInstType().getName());
        m.put("protocol", params.getProtocol() == null ? null : params.getProtocol().getName());

        m.put("precursor_tol_left", params.getLeftPrecursorMassTolerance() == null ? null : params.getLeftPrecursorMassTolerance().toString());
        m.put("precursor_tol_right", params.getRightPrecursorMassTolerance() == null ? null : params.getRightPrecursorMassTolerance().toString());
        m.put("isotope_error_min", params.getMinIsotopeError());
        m.put("isotope_error_max", params.getMaxIsotopeError());

        m.put("num_tolerable_termini", params.getNumTolerableTermini());
        m.put("min_peptide_length", params.getMinPeptideLength());
        m.put("max_peptide_length", params.getMaxPeptideLength());
        m.put("min_charge", params.getMinCharge());
        m.put("max_charge", params.getMaxCharge());
        m.put("max_missed_cleavages", params.getMaxMissedCleavages());
        m.put("num_matches_per_spec", params.getNumMatchesPerSpec());
        m.put("min_ms_level", params.getMinMSLevel());
        m.put("max_ms_level", params.getMaxMSLevel());

        m.put("cli_args", argv == null ? new ArrayList<String>() : java.util.Arrays.asList(argv));
        return m;
    }

    // --- tiny hand-rolled JSON writer -----------------------------------
    // Keeps the jar dep-free. Supports String, Number, Boolean, null,
    // List/Iterable of the same, and Map<String, ?> via nested emit.

    /**
     * Recursively emits {@code value} as JSON.
     *
     * <p>Maps are written one entry per line at {@code indent + 1}; iterables are
     * written inline; numbers and booleans use {@code toString()}; anything else
     * is written as a JSON string of its {@code toString()}.
     *
     * @param w      destination writer
     * @param value  value to emit; {@code null} becomes the JSON literal {@code null}
     * @param indent nesting level of the enclosing container
     * @throws IOException if the writer fails
     */
    private static void writeJson(BufferedWriter w, Object value, int indent) throws IOException {
        if (value == null) {
            w.write("null");
            return;
        }
        if (value instanceof Map) {
            // Wildcard view: keys are stringified below, so a non-String key
            // cannot trigger a ClassCastException.
            Map<?, ?> map = (Map<?, ?>) value;
            w.write("{");
            boolean first = true;
            for (Map.Entry<?, ?> e : map.entrySet()) {
                if (!first) w.write(",");
                first = false;
                w.write("\n");
                indent(w, indent + 1);
                w.write(jsonString(String.valueOf(e.getKey())));
                w.write(": ");
                writeJson(w, e.getValue(), indent + 1);
            }
            if (!first) {
                // Only a non-empty object gets its closing brace on its own line.
                w.write("\n");
                indent(w, indent);
            }
            w.write("}");
            return;
        }
        if (value instanceof Iterable) {
            w.write("[");
            boolean first = true;
            for (Object item : (Iterable<?>) value) {
                if (!first) w.write(", ");
                first = false;
                writeJson(w, item, indent + 1);
            }
            w.write("]");
            return;
        }
        if (value instanceof Number || value instanceof Boolean) {
            // JSON has no representation for NaN / Infinity; emit null rather
            // than produce a document that parsers reject.
            w.write(isNonFinite(value) ? "null" : value.toString());
            return;
        }
        w.write(jsonString(value.toString()));
    }

    /** Returns {@code true} for a {@code Double}/{@code Float} holding NaN or an infinity. */
    private static boolean isNonFinite(Object value) {
        if (value instanceof Double) {
            Double d = (Double) value;
            return d.isNaN() || d.isInfinite();
        }
        if (value instanceof Float) {
            Float f = (Float) value;
            return f.isNaN() || f.isInfinite();
        }
        return false;
    }

    /** Writes one indent unit per nesting level. */
    private static void indent(BufferedWriter w, int level) throws IOException {
        for (int i = 0; i < level; i++) w.write(" ");
    }

    /**
     * Quotes {@code s} as a JSON string literal, escaping the quote, the
     * backslash, the named control characters, and any other character below
     * U+0020 as a {@code \\uXXXX} sequence.
     */
    private static String jsonString(String s) {
        StringBuilder sb = new StringBuilder(s.length() + 2);
        sb.append('"');
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            switch (c) {
                case '"': sb.append("\\\""); break;
                case '\\': sb.append("\\\\"); break;
                case '\n': sb.append("\\n"); break;
                case '\r': sb.append("\\r"); break;
                case '\t': sb.append("\\t"); break;
                case '\b': sb.append("\\b"); break;
                case '\f': sb.append("\\f"); break;
                default:
                    if (c < 0x20) {
                        sb.append(String.format("\\u%04x", (int) c));
                    } else {
                        sb.append(c);
                    }
            }
        }
        sb.append('"');
        return sb.toString();
    }
}
6 changes: 6 additions & 0 deletions src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ public class SearchParams {
private int maxCharge;
private int numThreads;
private int numTasks;
private int minSpectraPerThread;
private boolean verbose;
private boolean doNotUseEdgeScore;
private File dbIndexDir;
Expand Down Expand Up @@ -181,6 +182,10 @@ public int getNumTasks() {
return numTasks;
}

/**
 * Returns the minimum number of spectra per thread held by this instance.
 *
 * @return the current value of the {@code minSpectraPerThread} field
 */
public int getMinSpectraPerThread() {
    return this.minSpectraPerThread;
}

/**
 * Returns the verbose flag held by this instance.
 *
 * @return the current value of the {@code verbose} field
 */
public boolean getVerbose() {
    return this.verbose;
}
Expand Down Expand Up @@ -415,6 +420,7 @@ public String parse(ParamManager paramManager) {

numThreads = paramManager.getNumThreads();
numTasks = paramManager.getNumTasks();
minSpectraPerThread = paramManager.getMinSpectraPerThread();
verbose = paramManager.getVerboseFlag() == 1;
doNotUseEdgeScore = paramManager.getEdgeScoreFlag() == 1;

Expand Down
23 changes: 22 additions & 1 deletion src/main/java/edu/ucsd/msjava/params/ParamManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,19 @@ public enum ParamNameEnum {
NUM_TASKS("tasks", "NumTasks", "Override the number of tasks to use on the threads; Default: (internally calculated based on inputs)",
"More tasks than threads will reduce the memory requirements of the search, but will be slower (how much depends on the inputs).\n" +
"\t 1 <= tasks <= numThreads: will create one task per thread, which is the original behavior.\n" +
"\t tasks = 0: use default calculation - minimum of: (threads*3) and (numSpectra/250).\n" +
"\t tasks = 0: use default calculation - minimum of: (threads*3) and (numSpectra/minSpectraPerThread).\n" +
"\t tasks < 0: multiply number of threads by abs(tasks) to determine number of tasks (i.e., -2 means \"2 * numThreads\" tasks).\n" +
"\t One task per thread will use the most memory, but will usually finish the fastest.\n" +
"\t 2-3 tasks per thread will use comparably less memory, but may cause the search to take 1.5 to 2 times as long."),

MIN_SPECTRA_PER_THREAD("minSpectraPerThread", "MinSpectraPerThread",
"Minimum number of spectra to assign per thread/task; Default: 250",
"Controls the per-thread workload floor used when auto-selecting numThreads and numTasks.\n" +
"\t The effective thread count is capped at max(1, round(numSpectra / minSpectraPerThread)).\n" +
"\t Lower this value to raise parallelism on small inputs running on many-core hosts\n" +
"\t (e.g. set to 50 when searching ~1,000 spectra on a 20-core machine).\n" +
"\t Going too low increases per-thread setup overhead and can slow the search."),

// Used by MS-GF+
ISOTOPE_ERROR("ti", "IsotopeErrorRange", "Range of allowed isotope peak errors; Default: 0,1",
"Takes into account the error introduced by choosing a non-monoisotopic peak for fragmentation.\n" +
Expand Down Expand Up @@ -592,6 +600,13 @@ private void addNumTasksParam() {
addParameter(numTasksParam);
}

/**
 * Registers the {@code MIN_SPECTRA_PER_THREAD} integer parameter with a
 * default of 250 and a minimum allowed value of 1.
 */
private void addMinSpectraPerThreadParam() {
    final IntParameter perThreadFloor = new IntParameter(ParamNameEnum.MIN_SPECTRA_PER_THREAD);
    // Same call order as before: default first, then the lower bound.
    perThreadFloor.defaultValue(250);
    perThreadFloor.minValue(1);
    addParameter(perThreadFloor);
}

private void addTdaParam() {
EnumParameter tdaParam = new EnumParameter(ParamNameEnum.TDA_STRATEGY);
tdaParam.registerEntry("Don't search decoy database").setDefault();
Expand Down Expand Up @@ -796,6 +811,7 @@ public void addMSGFPlusParams() {

addNumThreadsParam();
addNumTasksParam();
addMinSpectraPerThreadParam();
addVerboseModeParam();

addTdaParam();
Expand Down Expand Up @@ -909,6 +925,7 @@ public void addMSGFDBParams() {
addOutputFileParam();

addNumThreadsParam();
addMinSpectraPerThreadParam();

addTdaParam();

Expand Down Expand Up @@ -1177,6 +1194,10 @@ public int getNumTasks() {
return getIntValue(ParamNameEnum.NUM_TASKS.key);
}

/**
 * Looks up the integer value stored under the {@code MIN_SPECTRA_PER_THREAD}
 * parameter key.
 *
 * @return the result of {@code getIntValue} for that key
 */
public int getMinSpectraPerThread() {
return getIntValue(ParamNameEnum.MIN_SPECTRA_PER_THREAD.key);
}

/**
 * Looks up the integer value stored under the {@code VERBOSE} parameter key.
 *
 * @return the result of {@code getIntValue} for that key
 */
public int getVerboseFlag() {
return getIntValue(ParamNameEnum.VERBOSE.key);
}
Expand Down
Loading
Loading