-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparseNR.java
More file actions
94 lines (91 loc) · 2.79 KB
/
Copy pathparseNR.java
File metadata and controls
94 lines (91 loc) · 2.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
package scripts;
/*******************************************
* TCW blasted nr.gz against demoTra. Move the demo-nr.tab to blast/nr.tab
* Run from main directory in order to make a nrDemo fasta file of only the hit proteins.
* 1. Read blast .tab file. 2. Read nr.gz. 3. Write a subset of nr.gz fasta file.
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.util.HashSet;
import java.util.zip.GZIPInputStream;
/**
 * Writes a reduced FASTA file holding only the sequences whose identifier
 * appears in the second column of a tab-delimited hit file.
 *
 * <p>Steps: (1) {@link #readTab()} collects the hit identifiers from
 * {@code inTab}; (2) {@link #readGZ()} streams {@code inFasta} and copies every
 * record whose name is in that set to {@code outFasta}. All paths are relative
 * to the working directory.
 */
public class parseNR {
    static private final String inFasta = "projects/DBfasta/nr_Oct2016/nr.gz";
    static private final String inTab = "blast/nr.tab";
    static private final String outFasta = "blast/nrDemo";

    /** Identifiers taken from column 2 of the tab file; filled by readTab(). */
    private static final HashSet<String> nrIDs = new HashSet<String>();

    /**
     * Entry point: loads the hit identifiers, then writes the FASTA subset.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.out.println("Read " + inFasta + " and " + inTab);
        System.out.println("Create " + outFasta);
        readTab();
        readGZ();
        System.out.println("Finished");
    }

    /**
     * Reads {@code inTab} and stores the second tab-separated column of every
     * line in {@code nrIDs}. Failures are reported on stderr and do not abort
     * the program.
     */
    private static void readTab() {
        // try-with-resources so the reader is closed even when a read fails.
        try (BufferedReader in = new BufferedReader(new FileReader(inTab))) {
            String line;
            while ((line = in.readLine()) != null) {
                String[] tok = line.split("\t");
                // Blank or single-column lines have no subject ID; skipping them
                // avoids an ArrayIndexOutOfBoundsException that would otherwise
                // abort the loop and silently drop every remaining ID.
                if (tok.length < 2) {
                    continue;
                }
                nrIDs.add(tok[1]);
            }
            System.out.println("Read " + nrIDs.size());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Streams {@code inFasta} and copies to {@code outFasta} each record whose
     * name (the text between '>' and the first space, or the whole header after
     * '>' when there is no space) is contained in {@code nrIDs}. Existing
     * output is overwritten. Failures are reported on stderr.
     */
    private static void readGZ() {
        BufferedReader source = openGZIP(inFasta);
        if (source == null) {
            // openGZIP has already reported the cause; avoid a NullPointerException.
            System.err.println("Cannot read " + inFasta + "; " + outFasta + " not written");
            return;
        }
        // Both streams are closed on exit. Closing the writer also flushes it;
        // without that the buffered tail of the output never reaches the file.
        try (BufferedReader in = source;
             PrintWriter out = new PrintWriter(new FileOutputStream(outFasta, false))) {
            int cnt = 0;
            boolean prt = false; // true while inside a record that is being copied
            String line;
            while ((line = in.readLine()) != null) {
                if (line.startsWith(">")) {
                    int sp = line.indexOf(' ');
                    String name = (sp >= 0) ? line.substring(1, sp) : line.substring(1);
                    prt = nrIDs.contains(name);
                    if (prt) {
                        cnt++;
                    }
                }
                if (prt) {
                    out.print(line);
                    out.print('\n');
                }
            }
            // PrintWriter never throws on write errors; ask it explicitly.
            if (out.checkError()) {
                System.err.println("Error writing " + outFasta);
            }
            System.out.println("Wrote " + cnt);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Opens a text file for line-oriented reading, decompressing gzip input.
     *
     * <p>If {@code file} does not end in ".gz" and exists, it is opened as plain
     * text. If it does not exist but {@code file + ".gz"} does, the compressed
     * file is opened instead. If neither exists, a message is printed and the
     * JVM exits with status -1.
     *
     * @param file path of the file to open, with or without the ".gz" suffix
     * @return a reader over the (decompressed) content, or {@code null} when
     *         opening fails; the stack trace is printed in that case
     */
    public static BufferedReader openGZIP(String file) {
        try {
            if (!file.endsWith(".gz")) {
                File plain = new File(file);
                if (plain.exists()) {
                    return new BufferedReader(new FileReader(plain));
                }
                if (!new File(file + ".gz").exists()) {
                    System.err.println("Cannot open file " + file);
                    System.exit(-1);
                }
                file = file + ".gz";
            }
            // From here on the path always ends in ".gz".
            FileInputStream fin = new FileInputStream(file);
            try {
                return new BufferedReader(new InputStreamReader(new GZIPInputStream(fin)));
            } catch (Exception e) {
                // The GZIP header check failed: release the file handle before
                // reporting, since nothing else holds a reference to it.
                try {
                    fin.close();
                } catch (Exception ignored) {
                    // Secondary failure while cleaning up; the original error is reported.
                }
                throw e;
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }
}