Skip to content

Commit d3a14d8

Browse files
authored
Merge pull request #336 from AdamaJava/qp_fastq_read_id_bug
qprofiler1 / qvisualise - handle new fastq header
2 parents 43a8f24 + 3353fd1 commit d3a14d8

File tree

5 files changed

+776
-703
lines changed

5 files changed

+776
-703
lines changed

qprofiler/src/org/qcmg/qprofiler/fastq/FastqSummaryReport.java

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -36,14 +36,12 @@ public class FastqSummaryReport extends SummaryReport {
3636
private static final Integer i = Integer.MAX_VALUE;
3737

3838
//SEQ
39-
private final SummaryByCycleNew2<Character> seqByCycle = new SummaryByCycleNew2<Character>(c, 512);
40-
private Map<Integer, AtomicLong> seqLineLengths = null;
39+
private final SummaryByCycleNew2<Character> seqByCycle = new SummaryByCycleNew2<>(c, 512);
4140
private final QCMGAtomicLongArray seqBadReadLineLengths = new QCMGAtomicLongArray(128);
4241
private final KmersSummary kmersSummary = new KmersSummary( KmersSummary.MAX_KMERS ); // by default, use the largest supported k-mer length
4342

4443
//QUAL
45-
private final SummaryByCycleNew2<Integer> qualByCycleInteger = new SummaryByCycleNew2<Integer>(i, 512);
46-
private Map<Integer, AtomicLong> qualLineLengths = null;
44+
private final SummaryByCycleNew2<Integer> qualByCycleInteger = new SummaryByCycleNew2<>(i, 512);
4745
private final QCMGAtomicLongArray qualBadReadLineLengths = new QCMGAtomicLongArray(128);
4846

4947
// Header info
@@ -114,16 +112,16 @@ public void toXml(Element parent) {
114112
SummaryReportUtils.lengthMapToXml(readNameElement, "QUAL_HEADERS", qualHeaders);
115113

116114
// create the length maps here from the cycles objects
117-
seqLineLengths = SummaryByCycleUtils.getLengthsFromSummaryByCycle(seqByCycle, getRecordsParsed());
118-
qualLineLengths = SummaryByCycleUtils.getLengthsFromSummaryByCycle(qualByCycleInteger, getRecordsParsed());
115+
Map<Integer, AtomicLong> seqLineLengths = SummaryByCycleUtils.getLengthsFromSummaryByCycle(seqByCycle, getRecordsParsed());
116+
Map<Integer, AtomicLong> qualLineLengths = SummaryByCycleUtils.getLengthsFromSummaryByCycle(qualByCycleInteger, getRecordsParsed());
119117

120118
// SEQ
121119
Element seqElement = createSubElement(element, "SEQ");
122120
seqByCycle.toXml(seqElement, "BaseByCycle");
123121
SummaryReportUtils.lengthMapToXmlTallyItem(seqElement, "LengthTally", seqLineLengths);
124122
SummaryReportUtils.lengthMapToXml(seqElement, "BadBasesInReads", seqBadReadLineLengths);
125123

126-
kmersSummary.toXml(seqElement,kmersSummary.MAX_KMERS);
124+
kmersSummary.toXml(seqElement, KmersSummary.MAX_KMERS);
127125
kmersSummary.toXml(seqElement,1); //add 1-mers
128126
kmersSummary.toXml(seqElement,2); //add 2-mers
129127
kmersSummary.toXml(seqElement,3); //add 3-mers
@@ -137,11 +135,6 @@ public void toXml(Element parent) {
137135
}
138136
}
139137

140-
/**
141-
* Reads a row from the text file and returns it as a string
142-
*
143-
* @return next row in file
144-
*/
145138
public void parseRecord(FastqRecord record) {
146139
if (null != record) {
147140

@@ -158,12 +151,19 @@ public void parseRecord(FastqRecord record) {
158151
byte[] readBases = record.getReadString().getBytes();
159152
SummaryByCycleUtils.parseCharacterSummary(seqByCycle, readBases, reverseStrand);
160153
SummaryReportUtils.tallyBadReadsAsString(readBases, seqBadReadLineLengths);
161-
kmersSummary.parseKmers( readBases, false ); //fastq base are all orignal forward
154+
kmersSummary.parseKmers( readBases, false ); //fastq base are all original forward
162155

163156
// header stuff
164-
if (record.getReadName().contains(":")) {
165-
String [] headerDetails = TabTokenizer.tokenize(record.getReadName(), ':');
166-
if (null != headerDetails && headerDetails.length > 0) {
157+
158+
String headerToUse = record.getReadName();
159+
int spaceCount = StringUtils.getCount(headerToUse, ' ');
160+
if (spaceCount == 2) {
161+
headerToUse = TabTokenizer.tokenize(headerToUse, ' ')[1];
162+
}
163+
164+
if (headerToUse.contains(":")) {
165+
String [] headerDetails = TabTokenizer.tokenize(headerToUse, ':');
166+
if (headerDetails.length > 0) {
167167

168168
//if length is equal to 10, we have the classic Casava 1.8 format
169169

@@ -180,7 +180,7 @@ public void parseRecord(FastqRecord record) {
180180
// 13051 - x
181181
// 2071 - y
182182
// 2 - 2nd in pair
183-
if (record.getReadName().contains(" ")) {
183+
if (headerToUse.contains(" ")) {
184184
parseFiveElementHeaderWithSpaces(headerDetails);
185185
} else {
186186
parseFiveElementHeaderNoSpaces(headerDetails);
@@ -231,13 +231,13 @@ public void parseRecord(FastqRecord record) {
231231
filteredN.incrementAndGet();
232232
}
233233

234-
// skip control bit for now
234+
// skip the control bit for now
235235

236236
// indexes
237237
if (headerLength > 9) {
238238
key = headerDetails[9];
239239
updateMap(indexes, key);
240-
} // thats it!!
240+
} // that's it!!
241241
}
242242
}
243243
}
@@ -292,18 +292,18 @@ void parseFiveElementHeaderWithSpaces(String [] params) {
292292
// split by space
293293
String [] firstElementParams = params[0].split(" ");
294294
if (firstElementParams.length != 2) {
295-
throw new UnsupportedOperationException("Incorrect header format encountered in parseFiveElementHeader. Expected '@ERR091788.3104 HSQ955_155:2:1101:13051:2071/2' but recieved: " + Arrays.deepToString(params));
295+
throw new IllegalArgumentException("Incorrect header format encountered in parseFiveElementHeaderWithSpaces. Expected a space (e.g. @ERR091788.3104 HSQ955_155) in the first element in the array, but received: " + Arrays.deepToString(params));
296296
}
297297
String [] machineAndReadPosition = firstElementParams[0].split("\\.");
298298
if (machineAndReadPosition.length != 2) {
299-
throw new UnsupportedOperationException("Incorrect header format encountered in parseFiveElementHeader. Expected '@ERR091788.3104 HSQ955_155:2:1101:13051:2071/2' but recieved: " + Arrays.deepToString(params));
299+
throw new IllegalArgumentException("Incorrect header format encountered in parseFiveElementHeaderWithSpaces. Expected a single dot (e.g. @ERR091788.3104 HSQ955_155) in the first part of the first element in the array, but received: " + Arrays.deepToString(params));
300300
}
301301

302302
updateMap(instruments, machineAndReadPosition[0]);
303303

304304
String [] flowCellAndRunId = firstElementParams[1].split("_");
305305
if (flowCellAndRunId.length != 2) {
306-
throw new UnsupportedOperationException("Incorrect header format encountered in parseFiveElementHeader. Expected '@ERR091788.3104 HSQ955_155:2:1101:13051:2071/2' but recieved: " + Arrays.deepToString(params));
306+
throw new IllegalArgumentException("Incorrect header format encountered in parseFiveElementHeaderWithSpaces. Expected a single underscore (e.g. @ERR091788.3104 HSQ955_155) in the second part of the first element in the array but received: " + Arrays.deepToString(params));
307307
}
308308

309309
updateMap(flowCellIds, flowCellAndRunId[0]);
Lines changed: 107 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,51 +1,141 @@
11
package org.qcmg.qprofiler.fastq;
22

3-
import static org.junit.Assert.assertEquals;
43
import htsjdk.samtools.fastq.FastqRecord;
5-
6-
import org.junit.Ignore;
74
import org.junit.Test;
85

6+
import static org.junit.Assert.assertEquals;
7+
import static org.junit.Assert.assertTrue;
8+
99
public class FastqSummaryReportTest {
1010

1111
@Test
1212
public void parseRecordHeader() {
13-
FastqRecord rec = new FastqRecord("@ERR091788.1 HSQ955_155:2:1101:1473:2037/1",
13+
FastqRecord rec = new FastqRecord("ERR091788.1 HSQ955_155:2:1101:1473:2037/1",
1414
"GGGCANCCAGCAGCCCTCGGGGCTTCTCTGTTTATGGAGTAGCCATTCTCGTATCCTTCTACTTTCTTAAACTTTCTTTCACTTACAAAAAAATAGTGGA",
15-
"+",
15+
"",
1616
"<@@DD#2AFFHHH<FHFF@@FEG@DF?BF4?FFGDIBC?B?=FHIEFHGGG@CGHIIHDHFHFECDEEEECCCCCCAC@CCC>CCCCCCBBBBAC>:@<C");
1717

1818
FastqSummaryReport report = new FastqSummaryReport();
1919
report.parseRecord(rec);
2020
assertEquals(1, report.getRecordsParsed());
21-
assertEquals(1, report.instruments.get("@ERR091788").intValue());
21+
assertEquals(1, report.instruments.get("ERR091788").intValue());
2222
assertEquals(1, report.flowCellIds.get("HSQ955").intValue());
2323
assertEquals(1, report.flowCellLanes.get("2").intValue());
2424
assertEquals(1, report.tileNumbers.get(1101).intValue());
2525
assertEquals(1, report.firstInPair.intValue());
2626
assertEquals(0, report.secondInPair.intValue());
27-
2827
}
2928

30-
@Ignore // may need to cater for this in the future...
31-
public void parseAnotherRecordHeader() {
32-
FastqRecord rec = new FastqRecord("@SRR001666.1 071112_SLXA-EAS1_s_7:5:1:817:345 length=36",
29+
@Test // headers containing two spaces (e.g. a trailing "length=36" field) are now handled by parseRecord
30+
public void parseHeaderDoubleSpace2() {
31+
FastqRecord rec = new FastqRecord("SRR001666.1 071112_SLXA-EAS1_s_7:5:1:817:345 length=36",
3332
"GGGCANCCAGCAGCCCTCGGGGCTTCTCTGTTTATGGAGTAGCCATTCTCGTATCCTTCTACTTTCTTAAACTTTCTTTCACTTACAAAAAAATAGTGGA",
34-
"+",
33+
"",
3534
"<@@DD#2AFFHHH<FHFF@@FEG@DF?BF4?FFGDIBC?B?=FHIEFHGGG@CGHIIHDHFHFECDEEEECCCCCCAC@CCC>CCCCCCBBBBAC>:@<C");
3635

3736
FastqSummaryReport report = new FastqSummaryReport();
3837
report.parseRecord(rec);
3938
assertEquals(1, report.getRecordsParsed());
40-
assertEquals(1, report.instruments.get("@ERR091788").intValue());
41-
assertEquals(1, report.flowCellIds.get("HSQ955").intValue());
42-
assertEquals(1, report.flowCellLanes.get("2").intValue());
39+
assertEquals(1, report.instruments.get("071112").intValue());
40+
assertEquals(1, report.runIds.get("SLXA-EAS1_s_7").intValue());
41+
assertEquals(1, report.flowCellLanes.get("5").intValue());
42+
assertEquals(1, report.tileNumbers.get(1).intValue());
43+
assertEquals(0, report.firstInPair.intValue());
44+
assertEquals(0, report.secondInPair.intValue());
45+
}
46+
@Test // headers containing two spaces (e.g. a trailing "length=101" field) are now handled by parseRecord
47+
public void parseHeaderDoubleSpace() {
48+
49+
/*
50+
@SRR14585604.19092 A00805:41:HMJJWDRXX:1:1101:15329:20181 length=101
51+
TGCATTGTGTCAAAAGAAATTTCCTTATTTTCTACTGCCATTCCCATAAAAGTAAGTAGTCTCATTTTTGACATATTCTGTTCATGTAACAGGCCAAGTTA
52+
+SRR14585604.19092 A00805:41:HMJJWDRXX:1:1101:15329:20181 length=101
53+
:::FF:F,:F,FF:F:FFF:FFF,FFFFF,FF:FFF,F:F:,F,:FFF:FF:FF:F,F,::F::FF,FF,,:F,F,FFF,FFF:,,FFFFF,F:FF,FF,F
54+
*/
55+
FastqRecord rec = new FastqRecord("SRR14585604.19092 A00805:41:HMJJWDRXX:1:1101:15329:20181 length=101",
56+
"TGCATTGTGTCAAAAGAAATTTCCTTATTTTCTACTGCCATTCCCATAAAAGTAAGTAGTCTCATTTTTGACATATTCTGTTCATGTAACAGGCCAAGTTA",
57+
"SRR14585604.19092 A00805:41:HMJJWDRXX:1:1101:15329:20181 length=101",
58+
":::FF:F,:F,FF:F:FFF:FFF,FFFFF,FF:FFF,F:F:,F,:FFF:FF:FF:F,F,::F::FF,FF,,:F,F,FFF,FFF:,,FFFFF,F:FF,FF,F");
59+
60+
FastqSummaryReport report = new FastqSummaryReport();
61+
report.parseRecord(rec);
62+
assertEquals(1, report.getRecordsParsed());
63+
assertEquals(1, report.instruments.get("A00805").intValue());
64+
assertEquals(1, report.flowCellIds.get("HMJJWDRXX").intValue());
65+
assertEquals(1, report.flowCellLanes.get("1").intValue());
66+
assertEquals(1, report.tileNumbers.get(1101).intValue());
67+
assertEquals(0, report.firstInPair.intValue());
68+
assertEquals(0, report.secondInPair.intValue());
69+
}
70+
71+
@Test
72+
public void parseHeaderSingleSpace() {
73+
/*
74+
@VH01336:23:AAC37HWHV:1:1101:18459:1000 1:N:0:GGGGGGGG+AGATCTCG
75+
GTCCAGTTGCATTTTAGTAAGCTCTTTTTGATTCTCAAATCCGGCGTCAACCATACCAGCAGAGGAAGCATCAGCACCAGCACGCTCCCAAGCATTAAGCT
76+
+
77+
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC;CCCCCCCCC;;CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC-CCCCCCCC
78+
*/
79+
FastqRecord rec = new FastqRecord("VH01336:23:AAC37HWHV:1:1101:18459:1000 1:N:0:GGGGGGGG+AGATCTCG",
80+
"GTCCAGTTGCATTTTAGTAAGCTCTTTTTGATTCTCAAATCCGGCGTCAACCATACCAGCAGAGGAAGCATCAGCACCAGCACGCTCCCAAGCATTAAGCT",
81+
"",
82+
"CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC;CCCCCCCCC;;CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC-CCCCCCCC");
83+
84+
FastqSummaryReport report = new FastqSummaryReport();
85+
report.parseRecord(rec);
86+
assertEquals(1, report.getRecordsParsed());
87+
assertEquals(1, report.instruments.get("VH01336").intValue());
88+
assertEquals(1, report.flowCellIds.get("AAC37HWHV").intValue());
89+
assertEquals(1, report.flowCellLanes.get("1").intValue());
4390
assertEquals(1, report.tileNumbers.get(1101).intValue());
4491
assertEquals(1, report.firstInPair.intValue());
4592
assertEquals(0, report.secondInPair.intValue());
46-
4793
}
48-
49-
5094

95+
@Test
96+
public void parseHeaderNoSpace() {
97+
/*
98+
@HWI-ST590:2:1201:12570:134058#0
99+
AATAGTCCTAACGTTCTACATAACTTCAAGTAGTAAAATTCACCATCCTCT
100+
+
101+
:BC8?ABCEBEB9CEBFB@BC;>BFD=DE?B;@DBDED?DCD?BDDDBBBB
102+
*/
103+
FastqRecord rec = new FastqRecord("HWI-ST590:2:1201:12570:134058#0",
104+
"AATAGTCCTAACGTTCTACATAACTTCAAGTAGTAAAATTCACCATCCTCT",
105+
"",
106+
":BC8?ABCEBEB9CEBFB@BC;>BFD=DE?B;@DBDED?DCD?BDDDBBBB");
107+
108+
FastqSummaryReport report = new FastqSummaryReport();
109+
report.parseRecord(rec);
110+
assertEquals(1, report.getRecordsParsed());
111+
assertEquals(1, report.instruments.get("HWI-ST590").intValue());
112+
assertTrue(report.flowCellIds.isEmpty());
113+
assertEquals(1, report.flowCellLanes.get("2").intValue());
114+
assertEquals(1, report.tileNumbers.get(1201).intValue());
115+
assertEquals(0, report.firstInPair.intValue());
116+
assertEquals(0, report.secondInPair.intValue());
117+
}
118+
@Test
119+
public void parseHeaderNoSpace2() {
120+
/*
121+
@V350046278L1C001R00100004433/2
122+
CGCTGAAAATTGAAAGCCCGCTTGGGATAAGTGACATTAAGAACTGGCACCGACTGCAGAACCGCAATTTCCAGTTGACGCTAAGTGGGGGCTTATTTAGCACCCAGCTCTGTTTGCCAACACCCCCTGGGCATGAGAGCTCCCCAAGGG
123+
+
124+
HGGCGEE<DDDH<EAHBFGGGDBHHBB;C:HCGBEEBA@8HEDGAFGECFGG,1BH?@G)-C@EB?D/C6>GDHBBF(EH:7>>G@GH?G?F@?6<CB?B=DEC:>>CBE?G???BBG<.F:E?CFD?@?A:#5E>5BE/>BFFD+$E,>
125+
*/
126+
FastqRecord rec = new FastqRecord("V350046278L1C001R00100004433/2",
127+
"CGCTGAAAATTGAAAGCCCGCTTGGGATAAGTGACATTAAGAACTGGCACCGACTGCAGAACCGCAATTTCCAGTTGACGCTAAGTGGGGGCTTATTTAGCACCCAGCTCTGTTTGCCAACACCCCCTGGGCATGAGAGCTCCCCAAGGG",
128+
"",
129+
"HGGCGEE<DDDH<EAHBFGGGDBHHBB;C:HCGBEEBA@8HEDGAFGECFGG,1BH?@G)-C@EB?D/C6>GDHBBF(EH:7>>G@GH?G?F@?6<CB?B=DEC:>>CBE?G???BBG<.F:E?CFD?@?A:#5E>5BE/>BFFD+$E,>");
130+
131+
FastqSummaryReport report = new FastqSummaryReport();
132+
report.parseRecord(rec);
133+
assertEquals(1, report.getRecordsParsed());
134+
assertTrue(report.instruments.isEmpty());
135+
assertTrue(report.flowCellIds.isEmpty());
136+
assertTrue(report.flowCellLanes.isEmpty());
137+
assertTrue(report.tileNumbers.isEmpty());
138+
assertEquals(0, report.firstInPair.intValue());
139+
assertEquals(0, report.secondInPair.intValue());
140+
}
51141
}

0 commit comments

Comments
 (0)