#include #include #include #include #include #include #include #include #include #ifdef _OPENMP #include #endif #include "unordered_dense.h" // Transforms input string as follows: // '/foo/bar/file1.txt' // => vector{"foo", "bar", "file1.txt"} std::vector splitString(const std::string &input, const char &separator) { std::vector result; std::stringstream ss(input); std::string item; while (std::getline(ss, item, separator)) { if (item.size() > 0) { result.push_back(item); } } return result; } // Convert int64_t to binary string std::string int64ToBinaryString(int64_t num) { std::string result; for (int i = 63; i >= 0; --i) { result += ((num >> i) & 1) ? '1' : '0'; } return result; } // Convert a (8 char) string represented as int64_t to std::string std::string int64ToStr(int64_t key) { int nchars = 8; std::string str; int multip = nchars * 8; for (int i = 0; i <= nchars; i++) { char c = (key >> multip) & 255; str.push_back(c); multip -= 8; } return str; } void printVector(const std::vector &vec) { for (const auto &value : vec) { std::cout << value << " "; } } std::string charToBinaryString(char num) { std::string result; for (int i = 7; i >= 0; --i) { result += ((num >> i) & 1) ? '1' : '0'; } return result; } class Candidate; enum segmentType { Dir, File }; // A segment of a file path // e.g. if path is /foo/bar/baz.txt // segments are [{root}, foo, bar, baz.txt] class PathSegment { public: std::string str; int fileId; // (if FILE) Candidate *cand; PathSegment *parent; ankerl::unordered_dense::map children; segmentType type = Dir; PathSegment() : parent(NULL) {} PathSegment(std::string _str) : str(_str), parent(NULL) {} PathSegment(std::string _str, int _fileId) : str(_str), fileId(_fileId), cand(NULL), parent(NULL) {} int size() { int sz = str.size(); PathSegment *cur = parent; // Sum up length of parent segments (+1 for divisors) while (cur->parent != NULL) { sz += cur->str.size() + 1; cur = cur->parent; } return sz; } }; // Candidate for result in string (filename) search class Candidate { public: std::vector v_charscore; PathSegment *seg; int fileId; // The string that this candidate represents std::string str; int len; // Query string length float minscore; float maxscore; int candLen; // Length of candidate Candidate(){}; Candidate(int _fileId, std::string _str, int _len) : fileId(_fileId), str(_str), len(_len) { // Initialize v_charscores with zeros v_charscore.resize(len, 0); candLen = str.size(); seg = NULL; } Candidate(PathSegment *_seg, int _len) : seg(_seg), len(_len) { // Initialize v_charscores with zeros v_charscore.resize(len, 0); candLen = seg->size(); } float getScore() { int i = 0; float score = 0.0; candLen = seg->size(); for (float &charscore : v_charscore) { score += charscore; i++; } float div = len * len; float div2 = len * candLen; float score1 = score / div; float score2 = score / div2; score = score1 * 0.97 + score2 * 0.03; return score; } float operator[](int idx) { return v_charscore[idx]; } }; // This seems to give 10x speed improvement over std::unordered_map typedef ankerl::unordered_dense::map *> SegMap; // typedef std::unordered_map *> SegMap; typedef std::unordered_map CandMap; class StringIndex { private: int tmp; char dirSeparator = '/'; // Usually '/', '\' or '\0' (no separator) std::vector dirmaps; std::vector filemaps; std::vector segsToClean; std::unordered_map strlist; std::unordered_map seglist; PathSegment *root; int dirId = 0; float dirWeight = 0.7; // Give only 70% of score if match is for a directory public: StringIndex() { root = new PathSegment(); root->parent = NULL; root->str = "[ROOT]"; for (int i = 0; i <= 8; i++) { dirmaps.push_back(new SegMap); filemaps.push_back(new SegMap); } #ifdef _OPENMP std::cout << "OPENMP enabled\n"; #endif } void setDirSeparator(char sep) { dirSeparator = sep; } void setDirWeight(float val) { dirWeight = val; } ~StringIndex() { for (auto x : dirmaps) { for (auto y : *x) { y.second->clear(); delete (y.second); } x->clear(); delete x; } for (auto x : filemaps) { for (auto y : *x) { y.second->clear(); delete (y.second); } x->clear(); delete x; } clearPathSegmentChildren(root); } void addStrToIndex(std::string filePath, int fileId) { addStrToIndex(filePath, fileId, dirSeparator); } /** * Add a string to the index to be search for afterwards * * @param filePath String to index (e.g. /home/user/Project/main.cpp). * @param fileId Unique identifier for filePath. Will be return as result from findSimilar. * @param separator Can be used to split filePath to components (e.g. 'home','user'...). Usually * one of {'\\', '/', '\0' (no separation)}. */ void addStrToIndex(std::string filePath, int fileId, const char &separator) { std::vector segs; if (separator == '\0') { // No separation to directories & files segs = {filePath}; } else { // Split path to segments segs = splitString(filePath, separator); } PathSegment *prev = NULL; prev = root; // Add segments to a tree type data structure // e.g. addStrToIndex('/foo/bar/file1.txt' ..) // addStrToIndex('/foo/faa/file2.txt' ..) // forms structure: // root -> foo |-> bar -> file1,txt // |-> faa -> file2.txt for (auto _x = segs.begin(); _x != segs.end(); ++_x) { auto x = *_x; PathSegment *p; auto it = prev->children.find(x); // this part of the path already exists in the tree if (it != prev->children.end()) { p = it->second; } else { p = new PathSegment(x, fileId); p->parent = prev; // If this is last item in segs if (_x == std::prev(segs.end())) { // therefore, it is a file. p->type = File; seglist[fileId] = p; } else { p->type = Dir; p->fileId = dirId; // Files use user input Id. Directories need to have it generated dirId++; } prev->children[x] = p; addPathSegmentKeys(p); } prev = p; } } /** * The search will find filepaths similar to the input string To be considered a candidate path, the file component of the path (e.g. file.txt) is required to have at least a substring of two characters in common with the query string. If that condition is true, then the directories will also add to the score, although with a smaller weight. The similarity measure between query and PathSegment in index works as follows: For each character c in the query string: - find the largest substring in the query which includes the character c and is also included in the PathSegment - take the lenght of that substring as score sum up the scores for each character c and divide by (string length)^2 For example, if query = "rngnomadriv" and candidate is "./drivers/char/hw_random/nomadik-rng.c", then scores are calculated as follows: rngnomadriv 33355555444 (subscores) FFFFFFFFDDD (F=file component, D=dir component) score1=(3+3+3+5+5+5+5+5+(4+4+4)*0.7) In final score, give a small penalty for larger candidate filenames: Divide main part of score with (query string length)^2 and minor part by (query string length)*(candidate string length) score = score1/(11*11)*0.97 + score1/(11*38)*0.03 = 0.342944 @param query String to search for inside the index */ std::vector> findSimilar(std::string query, int minChars) { CandMap fileCandMap; CandMap dirCandMap; // Find both files and directories that match the input query addToCandMap(fileCandMap, query, filemaps); addToCandMap(dirCandMap, query, dirmaps); /* If parent dir of a file matches the input string add the scores of the direcotry to the scores of the file */ mergeCandidateMaps(fileCandMap, dirCandMap); // Set all candidate pointers to NULL so they won't mess up future searches for (auto seg : segsToClean) { seg->cand = NULL; } segsToClean.clear(); // Form return result, 2d array with file id's and scores std::vector> results; for (auto &[fid, cand] : fileCandMap) { std::pair v; float sc = cand.getScore(); v.first = sc; v.second = fid; results.push_back(v); } // Sort highest score first std::sort(results.begin(), results.end(), [](std::pair a, std::pair b) { return a.first > b.first; }); return results; } // Return int64_t representation of the first nchars in str, starting from index i int64_t getKeyAtIdx(std::string str, int i, int nchars) { int64_t key = 0; for (int i_char = 0; i_char < nchars; i_char++) { key = key | static_cast(str[i + i_char]); if (i_char < nchars - 1) { // Shift 8 bits to the left except on the last iteration key = key << 8; } } return key; } void debug() { int nchars = 3; for (const auto &[key, value] : (*filemaps[nchars])) { int64_t x; x = key; int multip = nchars * 8; for (int i = 0; i <= nchars; i++) { char c = (x >> multip) & 255; std::cout << c; multip -= 8; } std::cout << "\n"; // for (auto y : *value) { // std::cout << y << " "; // } // std::cout << "\n"; } } private: void clearPathSegmentChildren(PathSegment *p) { if (p->children.size() > 0) { for (auto x : p->children) { clearPathSegmentChildren(x.second); } } delete p; } void addPathSegmentKeys(PathSegment *p) { // Input p is part of a path, e.g. 'barxyz' if path is /foo/barxyz/baz.txt // This function generates int64 representations (keys) of all substrings of size 2..8 in that // path segment and stores pointer to p in hash tables using these int values as keys. int maxChars = 8; int minChars = 2; std::string str = p->str; if (p->str.size() < 2) { return; } if (static_cast(p->str.size()) < maxChars) { maxChars = p->str.size(); } #ifdef _OPENMP #pragma omp parallel for #endif for (int sublen = minChars; sublen <= maxChars; sublen++) { SegMap *map; if (p->type == File) { map = filemaps[sublen]; } else { map = dirmaps[sublen]; } int count = str.size() - sublen + 1; for (int i = 0; i <= count; i++) { int64_t key = getKeyAtIdx(str, i, sublen); // Create a new std::set for key if doesn't exist already auto it = map->find(key); if (it == map->end()) { (*map)[key] = new std::set; } (*map)[key]->insert(p); } } } // Find pathsegments from that include the substring of which starts at index and // is of length . std::vector findSimilarForNgram(std::string str, int i, int nchars, SegMap &map) { assert(i + nchars <= static_cast(str.size())); std::vector res; // Take substring of str, starting at i, spanning nchars // transform that to 64 bit integer int64_t key = getKeyAtIdx(str, i, nchars); // Find all path segments in map that have the same substring auto it = map.find(key); if (it != map.end()) { // key found auto set = it->second; for (auto value : *set) { res.push_back(value); } } return res; } void addToCandMap(CandMap &candmap, std::string query, std::vector &map // filemaps or dirmaps ) { int maxChars = 8; int minChars = 2; if (static_cast(query.size()) < maxChars) { maxChars = query.size(); } // Loop all substring lengths between minChars..maxChars for (int sublen = minChars; sublen <= maxChars; sublen++) { int count = query.size() - sublen + 1; // Loop all possible start positions for (int i = 0; i < count; i++) { std::vector res = findSimilarForNgram(query, i, sublen, *(map[sublen])); for (PathSegment *p : res) { addToResults(p, query, i, sublen, candmap); } } } } // Add parent directories scores to files void mergeCandidateMaps(CandMap &fileCandMap, CandMap &dirCandMap) { for (auto &[fid, cand] : fileCandMap) { PathSegment *p = cand.seg->parent; while (p->parent != NULL) { if (p->cand != NULL) { auto &scoreA = cand.v_charscore; auto &scoreB = p->cand->v_charscore; for (int i = 0; i < cand.len; i++) { if (scoreA[i] < scoreB[i] * dirWeight) { scoreA[i] = scoreB[i] * dirWeight; } } } p = p->parent; } } } void addToResults(PathSegment *seg, std::string str, int i, int nchars, CandMap &candmap) { auto it2 = candmap.find(seg->fileId); if (it2 == candmap.end()) { Candidate cand(seg, str.size()); seg->cand = &(candmap[seg->fileId]); segsToClean.push_back(seg); candmap[seg->fileId] = cand; } for (int j = i; j < i + nchars; j++) { if (candmap[seg->fileId][j] < nchars) { candmap[seg->fileId].v_charscore[j] = nchars; } } } };