Skip to content

Commit 610b629

Browse files
committed
[Refactor] Refactor .
1 parent 44a580a commit 610b629

13 files changed

Lines changed: 300 additions & 54 deletions

File tree

README.md

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,18 @@
11
# feather
22
FEATure HashER
33

4+
## Installation
5+
* Python
6+
Run `cd feather && python3 -m pip install ./`; you should see output similar to the following:
7+
```
8+
Processing /Path/To/feather Building wheels for collected packages: pyfeather
9+
Building wheel for pyfeather (setup.py) ... done Created wheel for pyfeather:
10+
filename=pyfeather-0.0.1-cp37-cp37m-macosx_10_15_x86_64.whl size=1284474 sha256=e3f9d0be1e7578274f3fcecb854c1e66336a24985b8e6ff4213375d76463299e
11+
Stored in directory: /private/var/folders/4q/50_2647d1yb47jt9j6plwx2r0000gq/T/pip-ephem-wheel-cache-996awbes/wheels/0f/bd/93/b6936ec0c1169201de264147e21ae7e2bb894720b34bcdce79
12+
Successfully built pyfeather Installing collected packages: pyfeather
13+
Successfully installed pyfeather-0.0.1
14+
```
15+
416
## Feature Hashing
517
### Notions
618
* **Feature Value**:
@@ -50,4 +62,4 @@ by the way, in case we want adjust each feature-slot's hash-bucket size, we can
5062
* Separate the classes' pybind code, so that calling the C++ interface does not require linking the pybind library.
5163
* Support mapping feature-hash to feature-index
5264
* Mapping feature-index back to feature-hash
53-
* Mapping feature-hash back to feature name.
65+
* Mapping feature-hash back to feature name.

conf/feather.conf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,4 @@ fea8 108 10000 0
99
fea9 109 16 0
1010
fea10 110 1 1
1111
fea11 111 4 2
12+
fea12 112 100 0

example.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include "feather/FeaExtractor.h"
1111
#include "feather/LibsvmExtractor.h"
1212
#include "feather/FeaValue.h"
13+
#include "feather/BiDict.h"
1314
#include "feather/utils.h"
1415

1516

@@ -44,11 +45,14 @@ void feavalue_example() {
4445
printf("Vector FeaValue hash of {1.1, 2.2, 3.3} is: ");
4546
for (auto x : vec_fea_val2_hash) { printf("%lld ", x); }
4647
printf("\n");
48+
49+
printf("end...\n\n");
4750
}
4851

4952

5053
void feahash_example() {
5154
feather::FeaHash feahash("../conf/feather.conf");
55+
printf("end...\n\n");
5256
}
5357

5458

@@ -59,12 +63,27 @@ void utils_example() {
5963
std::cout << "56 to code " << bucket_code2 << std::endl;
6064
std::string bucket_code3 = feather::num2str_code(888, 3);
6165
std::cout << "888 to code " << bucket_code3 << std::endl;
66+
printf("end...\n\n");
67+
}
68+
69+
70+
/// Example: build a three-column feather::BiDict, register one record,
/// look a value up across columns, and run the persistence step.
void bidict_example() {
    feather::BiDict bi_dict({"fea_name", "fea_hash", "slot_id"});
    bi_dict.Register({"fea1", "10100000", "101"});

    /// Map test: translate slot_id "101" back to its fea_name.
    const auto mapped_names = bi_dict.Map("slot_id", "fea_name", "101");
    const std::string slot_id2fea_name = mapped_names[0];
    const std::string report = "Map slot_id 101 to fea_name " + slot_id2fea_name;
    printf("%s\n", report.c_str());

    bi_dict.Persistence();
    printf("end...\n\n");
}
6381

6482

6583
int main(int argc, char** argv) {
6684
feavalue_example();
6785
feahash_example();
6886
utils_example();
87+
bidict_example();
6988
return 0;
7089
}

example.py

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,16 +35,38 @@ def main():
3535
fea11val2_id = fea_hash.FeaRegister("fea11", [4.0, 3.0, 2.0, 1.0])
3636
print("fea11#val2", fea11val2_id)
3737

38+
fea12val1_1_id = fea_hash.FeaRegister("fea12", "1")
39+
fea12val1_2_id = fea_hash.FeaRegister("fea12", 1)
40+
print("fea12='1' vs fea12=1: ", fea12val1_1_id, fea12val1_2_id)
41+
3842

3943
libsvm_extractor = pyfeather.LibsvmExtractor("../conf/feather.conf", "ctr", False)
4044
record1 = "{\"fea1\":2, \"fea2\": \"a\", \"fea8\": \"123\", \"fea10\": 3.14, \"fea11\": [2.3, 1.4, 3.5, 6.8], \"ctr\": 1}"
41-
print(record1)
45+
record2 = "{\"fea1\":\"2\", \"fea2\": \"a\", \"fea8\": \"123\", \"fea10\": 3.14, \"fea11\": [2.3, 1.4, 3.5, 6.8], \"ctr\": 1}"
46+
print("record1: \n", record1)
47+
print("record2: \n", record2)
4248
target = \
4349
"1.000000 10100025:1 11000000:3.140000 11100000:2.300000 11100001:1.400000 11100002:3.500000 11100003:6.800000 10200026:1 10805285:1"
44-
output = libsvm_extractor.Extract(record1)
45-
print(target)
46-
print(output)
47-
print(output.strip(" ") == target)
50+
target = target.split(" ")
51+
target = [target[0]] + sorted(target[1:], key=lambda x: x.split(":")[0])
52+
target = " ".join(target)
53+
output1 = libsvm_extractor.Extract(record1)
54+
output2 = libsvm_extractor.Extract(record2)
55+
print("target:\n" + target)
56+
print("output1:\n" + output1)
57+
print("output2:\n" + output2)
58+
print(output1.strip(" ") == target)
59+
print(output2.strip(" ") == target)
60+
61+
# Map fea-hash back to fea-group
62+
hash2name_1 = fea_hash.FeaHash2FeaName(10100025)
63+
print("10100025 is fea-hash of %s" % hash2name_1)
64+
hash2name_11_0 = fea_hash.FeaHash2FeaName(11100000)
65+
print("11100000 is fea-hash of %s" % hash2name_11_0)
66+
hash2name_11_3 = fea_hash.FeaHash2FeaName(11100003)
67+
print("11100003 is fea-hash of %s" % hash2name_11_3)
68+
hash2name_10 = fea_hash.FeaHash2FeaName(11000000)
69+
print("11000000 is fea-hash of %s" % hash2name_10)
4870

4971

5072
if __name__ == "__main__":

include/feather/BiDict.h

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
/// file: BiDict.h
2+
/// Bidirectional dictionary class.
3+
4+
5+
#ifndef FEATHER_BIDICT_H_
6+
#define FEATHER_BIDICT_H_
7+
8+
9+
#include <string>
10+
#include <vector>
11+
#include <unordered_map>
12+
13+
14+
namespace feather {
15+
16+
17+
/// Bidirectional dictionary over a fixed column schema.
///
/// For every ordered pair of distinct schema fields the class keeps one
/// inner dictionary (named "<from>2<to>"), so a registered record can be
/// looked up from any column to any other column.
class BiDict {
public:
    /// @param schema Ordered list of field (column) names.
    /// @param name   Name of this dictionary; defaults to "default".
    BiDict(const std::vector<std::string>& schema,
           const std::string& name = "default");

    /// Adds one record; `record` must hold one value per schema field.
    int32_t Register(const std::vector<std::string>& record);

    /// Persistence step for the dictionary contents.
    /// @param path Target path; an empty string selects a default.
    std::string Persistence(const std::string& path = "");

    /// Looks up all values of column `to` registered for `key` in column `from`.
    std::vector<std::string> Map(const std::string from,
                                 const std::string to,
                                 const std::string& key);

protected:
    /// Builds the inner-dictionary name for a pair of schema column indexes.
    std::string Indexs2DictName(const int32_t index1,
                                const int32_t index2);

private:
    /// One directional lookup table: key -> list of mapped values.
    using InnerDict =
        std::unordered_map<std::string, std::vector<std::string>>;

    std::string name_;                                    ///< Dictionary name.
    std::vector<std::string> schema_;                     ///< Field names, in order.
    std::unordered_map<std::string, int32_t> col_schema_; ///< Field name -> column index.
    std::unordered_map<std::string, InnerDict> dicts_;    ///< "<from>2<to>" -> lookup table.
};
43+
44+
45+
} // namespace feather
46+
47+
48+
#endif

include/feather/FeaHash.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,14 @@
55
#define FEATHER_FEAHASH_H_
66

77

8+
#include <memory>
89
#include <functional>
910
#include <string>
1011
#include <vector>
1112
#include <nlohmann/json.hpp>
1213
#include <pybind11/pybind11.h>
1314

15+
#include "feather/BiDict.h"
1416
#include "feather/FeaSlot.h"
1517
#include "feather/FeaValue.h"
1618

@@ -30,17 +32,23 @@ class FeaHash {
3032

3133
std::vector<int64_t> FeaRegister(
3234
const std::string& fea_name, const std::string& fea_value);
35+
std::vector<int64_t> FeaRegister(
36+
const std::string& fea_name, const int32_t fea_value);
3337
std::vector<int64_t> FeaRegister(
3438
const std::string& fea_name, const std::vector<float>& fea_value);
3539
std::vector<int64_t> FeaRegister(
3640
const std::string& fea_name, const float fea_value);
3741

42+
std::string FeaHash2FeaName(const int64_t fea_hash);
43+
3844
const nlohmann::json& GetMeta();
3945

46+
const std::unordered_map<std::string, FeaSlot>* GetSlots();
47+
4048
int16_t FeaValCheck(const std::string& name);
4149
int16_t FeaValCheck(const std::string& name, FeaValue* val);
4250

43-
void Merge(FeaHash fea_hash);
51+
//void Merge(FeaHash fea_hash);
4452

4553
protected:
4654
std::vector<int32_t> FeaVal2FeaHashBucket(FeaValue* fea_val, FeaSlot* fea_slot);
@@ -52,6 +60,8 @@ class FeaHash {
5260

5361
private:
5462
std::string conf_path;
63+
std::vector<std::string> dict_schema_ = {"fea_name", "slot"};
64+
std::shared_ptr<BiDict> dict_ = std::make_shared<BiDict>(this->dict_schema_);
5565
nlohmann::json fea_hash = {
5666
{ "meta", {} },
5767
{ "slots", {} }

include/feather/FeaSlot.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@ class FeaSlot {
4343

4444
int32_t GetBucketSize();
4545

46+
int8_t GetType() const;
47+
4648
void Merge(FeaSlot fea_slot);
4749

4850
private:

include/feather/FeaValue.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ namespace feather {
1414

1515
class FeaValue {
1616
public:
17+
FeaValue() = default;
1718
FeaValue(const int32_t val, const int16_t type=0);
1819
FeaValue(const float& val, const int16_t type=1);
1920
FeaValue(const double& val, const int16_t type=1);

include/feather/LibsvmExtractor.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include <string>
99
#include <unordered_map>
1010
#include <pybind11/pybind11.h>
11+
#include <nlohmann/json.hpp>
1112

1213
#include "feather/FeaExtractor.h"
1314
#include "feather/FeaHash.h"
@@ -29,6 +30,10 @@ class LibsvmExtractor : public FeaExtractor {
2930
std::string Extract(const std::string& flat_json);
3031
std::string Extract(const nlohmann::json& flat_json);
3132

33+
protected:
34+
//FeaValue* JsonVal2FeaVal(
35+
// const int8_t type, const nlohmann::basic_json& val);
36+
3237
private:
3338
std::string label;
3439
FeaHash fea_hash;

src/BiDict.cpp

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
/// file: BiDict.cpp
2+
3+
4+
#include "feather/BiDict.h"
5+
6+
7+
namespace feather {
8+
9+
10+
/// Builds a bidirectional dictionary over the given column schema.
///
/// Stores the name and schema, records each field's positional column index,
/// and creates one empty inner dictionary named "<from>2<to>" for every
/// ordered pair of schema fields whose names differ.
///
/// @param schema Ordered list of field (column) names.
/// @param name   Name of this dictionary.
BiDict::BiDict(const std::vector<std::string>& schema,
               const std::string& name)
    : name_(name), schema_(schema) {
    const int32_t field_cnt = static_cast<int32_t>(schema.size());
    for (int32_t from_idx = 0; from_idx < field_cnt; ++from_idx) {
        const std::string& from_field = schema[from_idx];
        this->col_schema_[from_field] = from_idx;

        for (int32_t to_idx = 0; to_idx < field_cnt; ++to_idx) {
            const std::string& to_field = schema[to_idx];
            // A field is never mapped onto itself.
            if (from_field == to_field) { continue; }
            this->dicts_[from_field + "2" + to_field] = {};
        }
    }
}
26+
27+
28+
int32_t BiDict::Register(
29+
const std::vector<std::string>& record) {
30+
int32_t field_num = this->schema_.size();
31+
if (field_num != record.size()) {
32+
throw "The element number of record should be same with \
33+
schema field number.";
34+
}
35+
for (int32_t index1 = 0; index1 < field_num; ++index1) {
36+
for (int32_t index2 = 0; index2 < field_num; ++index2) {
37+
if (index1 == index2) { continue; }
38+
std::string dict = this->Indexs2DictName(index1, index2);
39+
/// TODO@202108281125: Check key existence first.
40+
this->dicts_[dict][record[index1]].emplace_back(record[index2]);
41+
}
42+
}
43+
return 0;
44+
}
45+
46+
47+
std::string BiDict::Indexs2DictName(
48+
const int32_t index1, const int32_t index2) {
49+
if (this->schema_.size() == 0 || this->dicts_.size() == 0) {
50+
throw "No schema or dict has been registered/initialized.";
51+
}
52+
return this->schema_[index1] + "2" + this->schema_[index2];
53+
}
54+
55+
56+
std::vector<std::string> BiDict::Map(
57+
const std::string from, const std::string to,
58+
const std::string& key) {
59+
std::vector<std::string> value;
60+
std::string using_dict = from + "2" + to;
61+
if (this->dicts_.find(using_dict) == this->dicts_.end()) {
62+
throw ("No dict can mapping from " + from + " to " + to);
63+
} else if (
64+
this->dicts_[using_dict].find(key) == this->dicts_[using_dict].end()
65+
) {
66+
throw ("In inner dict '" + using_dict + "', no key " + key);
67+
} else {
68+
value = this->dicts_[using_dict][key];
69+
}
70+
return value;
71+
}
72+
73+
74+
/// TODO@202108281800
75+
/// Persistence step for the dictionary contents (still a work in progress).
///
/// Currently nothing is written to disk: the function only prints one debug
/// line per key of every inner dictionary (showing the first mapped value)
/// and returns the path that would be used.
///
/// @param path Target path; when empty, "./<name>.txt" is used.
/// @return The resolved output path.
std::string BiDict::Persistence(const std::string& path) {
    // Honour a caller-supplied path; previously a non-empty path was
    // dropped and an empty string was returned.
    std::string out_path =
        path.empty() ? ("./" + this->name_ + ".txt") : path;

    for (const auto& dict_entry : this->dicts_) {
        const std::string& dict_name = dict_entry.first;
        for (const auto& record_entry : dict_entry.second) {
            const std::vector<std::string>& value = record_entry.second;
            // Guard the value[0] access below against an empty value list.
            if (value.empty()) { continue; }
            printf("dbg: %s: key=%s, val=%s\n",
                   dict_name.c_str(), record_entry.first.c_str(),
                   value[0].c_str());
        }
    }
    return out_path;
}
95+
96+
97+
} // namespace feather

0 commit comments

Comments
 (0)