Skip to content

Commit 4ae8d64

Browse files
committed
Adding jsonstats tool.
1 parent 0ae0c71 commit 4ae8d64

2 files changed

Lines changed: 161 additions & 1 deletion

File tree

Makefile

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ else
2222
endif
2323
endif
2424

25-
MAINEXECUTABLES=parse minify json2json
25+
MAINEXECUTABLES=parse minify json2json jsonstats
2626
TESTEXECUTABLES=jsoncheck numberparsingcheck stringparsingcheck
2727
COMPARISONEXECUTABLES=minifiercompetition parsingcompetition parseandstatcompetition distinctuseridcompetition allparserscheckfile
2828
SUPPLEMENTARYEXECUTABLES=parse_noutf8validation parse_nonumberparsing parse_nostringparsing
@@ -116,6 +116,8 @@ minify: tools/minify.cpp $(HEADERS) $(MINIFIERHEADERS) $(LIBFILES) $(MINIFIERLIB
116116
json2json: tools/json2json.cpp $(HEADERS) $(LIBFILES)
117117
$(CXX) $(CXXFLAGS) -o json2json $ tools/json2json.cpp $(LIBFILES) -I.
118118

119+
jsonstats: tools/jsonstats.cpp $(HEADERS) $(LIBFILES)
120+
$(CXX) $(CXXFLAGS) -o jsonstats $ tools/jsonstats.cpp $(LIBFILES) -I.
119121

120122
ujdecode.o: $(UJSON4C_INCLUDE)
121123
$(CC) $(CFLAGS) -c dependencies/ujson4c/src/ujdecode.c

tools/jsonstats.cpp

Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
#include <iostream>
2+
#include <unistd.h>
3+
4+
#include "simdjson/jsonioutil.h"
5+
#include "simdjson/jsonparser.h"
6+
7+
using namespace std;
8+
9+
size_t count_nonasciibytes(const u8* input, size_t length) {
10+
size_t count = 0;
11+
for(size_t i = 0; i < length; i++) {
12+
count += input[i] >> 7;
13+
}
14+
return count;
15+
}
16+
17+
18+
// Returns how many of the first `length` elements of `input` compare equal to
// the backslash character ('\\').
size_t count_backslash(const u8* input, size_t length) {
  size_t total = 0;
  size_t pos = 0;
  while (pos < length) {
    if (input[pos] == '\\') {
      ++total;
    }
    ++pos;
  }
  return total;
}
25+
26+
27+
// Statistics gathered about a single JSON document by simdjson_computestats().
//
// Every member carries a default initializer so that a default-constructed
// instance is fully defined (all counters zero, `valid` false). Without them,
// a `stat_t` returned early for an invalid document would expose indeterminate
// values to the caller.
struct stat_s {
  size_t integer_count = 0;            // tape entries tagged 'l'
  size_t float_count = 0;              // tape entries tagged 'd'
  size_t string_count = 0;             // tape entries tagged '"'
  size_t backslash_count = 0;          // result of count_backslash() over the raw input
  size_t nonasciibyte_count = 0;       // result of count_nonasciibytes() over the raw input
  size_t object_count = 0;             // tape entries tagged '{'
  size_t array_count = 0;              // tape entries tagged '['
  size_t null_count = 0;               // tape entries tagged 'n'
  size_t true_count = 0;               // tape entries tagged 't'
  size_t false_count = 0;              // tape entries tagged 'f'
  size_t byte_count = 0;               // size of the raw input
  size_t structural_indexes_count = 0; // copied from ParsedJson::n_structural_indexes
  bool valid = false;                  // copied from ParsedJson::isValid()
};

using stat_t = stat_s;
44+
45+
46+
47+
// Parses the JSON text in `p` and tallies statistics about it.
//
// Returns a stat_t whose `valid` member mirrors ParsedJson::isValid(). When the
// document is not valid, every counter in the result is zero. Otherwise the
// byte-level counters are computed over the raw input and the per-type
// counters are computed by scanning the parser's tape.
stat_t simdjson_computestats(const std::string_view &p) {
  // Value-initialize: all counters start at zero and nothing is left
  // indeterminate on the early-return path below.
  stat_t answer{};
  ParsedJson pj = build_parsed_json(p);
  answer.valid = pj.isValid();
  if (!answer.valid) {
    return answer;
  }

  // Byte-level statistics work on the raw input, independently of the parse.
  const u8 *bytes = reinterpret_cast<const u8 *>(p.data());
  answer.backslash_count = count_backslash(bytes, p.size());
  answer.nonasciibyte_count = count_nonasciibytes(bytes, p.size());
  answer.byte_count = p.size();
  answer.structural_indexes_count = pj.n_structural_indexes;

  // Each tape word keeps its type tag in the top 8 bits; the remaining bits
  // (selected by JSONVALUEMASK) are the payload.
  size_t tapeidx = 0;
  u64 tape_val = pj.tape[tapeidx++];
  u8 type = static_cast<u8>(tape_val >> 56);
  assert(type == 'r'); // the first tape word is expected to be the root marker
  // The root word's payload is used as the exclusive upper bound of the scan.
  const size_t howmany = tape_val & JSONVALUEMASK;
  for (; tapeidx < howmany; tapeidx++) {
    tape_val = pj.tape[tapeidx];
    type = static_cast<u8>(tape_val >> 56);
    switch (type) {
    case 'l': // we have a long int
      answer.integer_count++;
      tapeidx++; // skipping the integer stored in the following tape word
      break;
    case 'd': // we have a double
      answer.float_count++;
      tapeidx++; // skipping the double stored in the following tape word
      break;
    case 'n': // we have a null
      answer.null_count++;
      break;
    case 't': // we have a true
      answer.true_count++;
      break;
    case 'f': // we have a false
      answer.false_count++;
      break;
    case '{': // we start an object
      answer.object_count++;
      break;
    case '[': // we start an array
      answer.array_count++;
      break;
    case '"': // we have a string
      answer.string_count++;
      break;
    case '}': // we end an object: already counted at its opening
    case ']': // we end an array: already counted at its opening
    default:  // ignore anything else
      break;
    }
  }
  return answer;
}
113+
114+
115+
116+
117+
118+
119+
120+
int main(int argc, char *argv[]) {
121+
int c;
122+
123+
while ((c = getopt(argc, argv, "")) != -1)
124+
switch (c) {
125+
126+
default:
127+
abort();
128+
}
129+
if (optind >= argc) {
130+
cerr << "Reads json, prints stats. " << endl;
131+
cerr << "Usage: " << argv[0] << " <jsonfile>" << endl;
132+
133+
exit(1);
134+
}
135+
const char *filename = argv[optind];
136+
if (optind + 1 < argc) {
137+
std::cerr << "warning: ignoring everything after " << argv[optind + 1] << std::endl;
138+
}
139+
std::string_view p;
140+
try {
141+
p = get_corpus(filename);
142+
} catch (const std::exception &e) { // caught by reference to base
143+
std::cerr << "Could not load the file " << filename << std::endl;
144+
return EXIT_FAILURE;
145+
}
146+
stat_t s = simdjson_computestats(p);
147+
if(!s.valid) {
148+
std::cerr << "not a valid JSON" << std::endl;
149+
return EXIT_FAILURE;
150+
}
151+
152+
153+
printf("# integer_count float_count string_count backslash_count nonasciibyte_count object_count array_count null_count true_count false_count byte_count structural_indexes_count\n");
154+
printf("%zu %zu %zu %zu %zu %zu %zu %zu %zu %zu %zu %zu\n", s.integer_count, s.float_count,
155+
s.string_count, s.backslash_count, s.nonasciibyte_count, s.object_count, s.array_count,
156+
s.null_count, s.true_count, s.false_count, s.byte_count, s.structural_indexes_count);
157+
return EXIT_SUCCESS;
158+
}

0 commit comments

Comments
 (0)