-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathV4subsetORF.java
More file actions
219 lines (193 loc) · 6.53 KB
/
Copy pathV4subsetORF.java
File metadata and controls
219 lines (193 loc) · 6.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
package scripts;
/*************************************************
* V4 create subset of sTCW annotated with UniProt Dec2021
*
* Set1
* viewSingleTCW:
* Basic Hit Filter: Rank=1, SP-plants, 1E-30, %Sim>=50, %Hit>=50
* Create Seq Table and output columns
* This Filter: Not Multi; not ORF=Hit+; Not TR-plants; Sim>=100 HitCov>=100
*
* Set2 (Osj) & Set3 (NnR)
* viewSingleTCW:
* Seq Filter: BestBits, TR-plants, %Sim>=100, %Hit>=100%
* Seq Table: Osj reverse order
* Create Seq Table and output columns
* This Filter: Not Multi; not ORF=Hit+;
*
* Seq Table Columns:
* #Seq-ID TCW-Remarks #SwissProt BS-Hit-ID BS-DB-type BS-Taxo BS-%Sim BS-%HitCov
*/
import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.PrintWriter;
import java.util.HashMap;
import java.util.HashSet;
public class V4subsetORF {
public static void main(String[] args) {new V4subsetORF();}
int SET=3;
String dir = "paper/ORF/Subset/set";
String fastaFile = "SeqTableSeqs.fa";
String tabFile = "SeqTableColumns.tsv";
String outFaFile, outRmkFile;
int maxSeqs=5000, maxNoSP=-1;
String sp0Rmk = "NoSP";
double simCutoff=100.0, hitCutoff=100.0;
HashMap <String, String> seqHitMap = new HashMap <String, String> (); // initial set with removal except unique
HashMap <String, String> seqRmkMap = new HashMap <String, String> (); // non-unique based on hit & pairs
HashSet <String> uniqueSet = new HashSet <String> ();
HashSet <String> finalSet = new HashSet <String> ();
private void init() {
dir += SET + "/";
outFaFile = dir + "set" + SET + ".fa";
outRmkFile = dir + "names" + SET + ".txt";
if (SET!=1) maxNoSP=2000;
prt(dir + ": " + maxSeqs + " seqs; " + maxNoSP + " sp=0");
}
public V4subsetORF() {
try {
init();
readColumns(); // seqHitMap, seqRmkMap, hitCntMap
createUniqueHit(); // uniqueSeq - keep only one for each hit
finalSet(); // finalSet - keep only those that pass max parameters
writeFasta();
}
catch (Exception e) {e.printStackTrace(); System.exit(0);}
}
/*****************************************************/
private void finalSet() {
try {
int cntSp0=0;
for (String seqid : seqRmkMap.keySet()) { // set2 only
if (!uniqueSet.contains(seqid)) continue;
String remark = seqRmkMap.get(seqid);
if (remark.contains(sp0Rmk)) cntSp0++;
else continue;
finalSet.add(seqid);
if (cntSp0>=maxNoSP) break;
}
for (String seqid : seqRmkMap.keySet()) {
if (uniqueSet.contains(seqid)) {
finalSet.add(seqid);
if (finalSet.size()>=maxSeqs) break;
}
}
prt("Finals:");
prt(finalSet.size(), "Final Set ");
prt(cntSp0, "SP=0");
}
catch (Exception e) {e.printStackTrace(); System.exit(0);}
}
/********************************************************
* All have ORf
*******************************************************/
private void readColumns() {
try {
String line, name=null;
int nSP;
String remark, hit, type, taxo;
double sim, hitcov;
int cntRead=0, cntNoTR=0, cntNoSim=0, cntNoHitCov=0, cntNoPlus=0, cntIsMulti=0, cntNoSP=0;
BufferedReader in = new BufferedReader ( new FileReader (dir + "/" + tabFile));
line = in.readLine();
prt(line);
// #Seq-ID TCW-Remarks #SwissProt BS-Hit-ID BS-DB-type BS-Taxo BS-%Sim BS-%HitCov
while ((line = in.readLine()) != null) {
cntRead++;
String [] tok = line.split("\\t");
int i=0;
name = tok[i++];
remark = tok[i++];
nSP = Integer.parseInt(tok[i++]);
hit = tok[i++];
type = tok[i++];
taxo = tok[i++];
sim = Double.parseDouble(tok[i++]);
hitcov = Double.parseDouble(tok[i++]);
if (!remark.contains("ORF=Hit+")) {cntNoPlus++; continue;}
if (remark.startsWith("Multi")) {cntIsMulti++; continue;}
// should always pass for set2 and set3
if (!type.contentEquals("tr") || !taxo.contentEquals("plants")) {cntNoTR++; continue;}
if (sim<simCutoff) {cntNoSim++; continue;}
if (hitcov<hitCutoff) {cntNoHitCov++; continue;}
if (SET!=1) {
if (nSP==0) {cntNoSP++; remark += "; " + sp0Rmk;}
}
seqHitMap.put(name, hit);
seqRmkMap.put(name, remark);
}
in.close();
prt("Totals:");
prt(seqHitMap.size(), tabFile);
prt(cntRead, "Read");
prt(cntNoSP, "is SP=0");
prt(cntNoTR, "not TR");
prt(cntNoSim, "not Sim");
prt(cntNoHitCov, "not HitCov");
prt(cntIsMulti, "Multi");
prt(cntNoPlus, "not ORF=Hit+");
prt("");
}
catch (Exception e) {e.printStackTrace(); System.exit(0);}
}
/*******************************************************
* This is done after removing similar pairs, so there are not many dup hits
*/
private void createUniqueHit() {
try {
HashSet <String> hitSet = new HashSet <String> ();
for (String seqid : seqHitMap.keySet()) {
String hit = seqHitMap.get(seqid);
if (!hitSet.contains(hit)) {
uniqueSet.add(seqid);
hitSet.add(hit);
}
}
prt(uniqueSet.size(), "remove Duplicate hit seqs ");
}
catch (Exception e) {e.printStackTrace(); System.exit(0);}
}
private void writeFasta() {
try {
int cntLg=0, cntMk=0;
String line, seq="", name=null;
boolean bPrt=false;
PrintWriter fhFa = new PrintWriter(new FileOutputStream(outFaFile, false));
PrintWriter fhRm = new PrintWriter(new FileOutputStream(outRmkFile, false));
BufferedReader inFa = new BufferedReader ( new FileReader (dir + "/" + fastaFile));
while ((line = inFa.readLine()) != null) {
line = line.trim();
if (line.startsWith(">")) {
if (bPrt) {
String remark = seqRmkMap.get(name);
if (remark.contains("!Lg")) cntLg++;
if (remark.contains("!Mk")) cntMk++;
fhRm.println(name + " " + remark);
fhFa.println(">" + name + "\n" + seq);
seq="";
}
line = line.substring(1);
name = line.split(" ")[0];
bPrt = finalSet.contains(name);
}
else if (bPrt) seq += line;
}
if (bPrt) {
String remark = seqRmkMap.get(name);
if (remark.contains("!Lg")) cntLg++;
if (remark.contains("!Mk")) cntMk++;
fhRm.println(name);
fhFa.println(">" + name + "\n" + seq);
seq="";
}
inFa.close(); fhFa.close(); fhRm.close();
prt(cntLg, "!Lg");
prt(cntMk, "!Mk");
prt(SET + " Done " + finalSet.size());
}
catch (Exception e) {e.printStackTrace(); System.exit(0);}
}
private void prt(String msg) {System.out.println(msg);}
private void prt(int x, String msg) {if (x>0) System.out.format("%8d %s\n", x, msg);}
}