stridx.hpp in StrIdx-0.1.1 vs stridx.hpp in StrIdx-0.1.4

- old
+ new

@@ -1,23 +1,68 @@ +#ifndef SSSTRIDX_HPP +#define SSSTRIDX_HPP + #include <stdio.h> #include <stdlib.h> #include <cassert> #include <vector> +#include <array> #include <iostream> #include <unordered_map> #include <set> #include <algorithm> #include <sstream> -#ifdef _OPENMP -#include <omp.h> -#endif +#include <vector> +#include <mutex> +#include <thread> +#include "thread_pool.hpp" #include "unordered_dense.h" +namespace StrIdx { + +/* Alternative to using std::cout + Allows to control verbose level */ +class Output { +private: + int verboseLevel; + +public: + Output(int verb) : verboseLevel(verb) {} + Output() : Output(3) {} + ~Output() = default; + static void print() {} + + // When calling as print("xxx ",3, " yyy") outputs "xxx 3 yyy" + template <typename T, typename... Types> static void print(T var1, Types... var2) { + std::cout << var1; + print(var2...); + } + + // When calling as printl("xxx ",3, " yyy") outputs "xxx 3 yyy\n" + template <typename... Types> static void printl(Types... var2) { + print(var2...); + print("\n"); + } + + /* When calling as printv(2, "xxx ",3, " yyy") outputs "xxx 3 yyy\n" + * if verboseLevel >= 2 (first arg) + */ + template <typename... Types> void printv(int vlevel, Types... var2) { + if (verboseLevel < vlevel) { + return; + } + if (verboseLevel >= 3) { + print("[v=", vlevel, "] "); + } + printl(var2...); + } +}; + // Transforms input string as follows: // '/foo/bar/file1.txt' // => vector{"foo", "bar", "file1.txt"} std::vector<std::string> splitString(const std::string &input, const char &separator) { std::vector<std::string> result; @@ -32,20 +77,20 @@ return result; } // Convert int64_t to binary string -std::string int64ToBinaryString(int64_t num) { +[[nodiscard]] std::string int64ToBinaryString(const int64_t &num) { std::string result; for (int i = 63; i >= 0; --i) { result += ((num >> i) & 1) ? '1' : '0'; } return result; } -// Convert a (8 char) string represented as int64_t to std::string -std::string int64ToStr(int64_t key) { +// Debug. Convert a (8 char) string represented as int64_t to std::string +[[nodiscard]] std::string int64ToStr(const int64_t &key) { int nchars = 8; std::string str; int multip = nchars * 8; for (int i = 0; i <= nchars; i++) { char c = (key >> multip) & 255; @@ -53,57 +98,58 @@ multip -= 8; } return str; } +// Debug void printVector(const std::vector<int> &vec) { for (const auto &value : vec) { std::cout << value << " "; } } -std::string charToBinaryString(char num) { +// Debug +[[nodiscard]] std::string charToBinaryString(const char &chr) { std::string result; for (int i = 7; i >= 0; --i) { - result += ((num >> i) & 1) ? '1' : '0'; + result += ((chr >> i) & 1) ? '1' : '0'; } return result; } class Candidate; -enum segmentType { Dir, File }; +enum class segmentType { Dir, File }; // A segment of a file path // e.g. if path is /foo/bar/baz.txt // segments are [{root}, foo, bar, baz.txt] -class PathSegment { -public: +struct PathSegment { std::string str; int fileId; // (if FILE) Candidate *cand; PathSegment *parent; + std::mutex mu; ankerl::unordered_dense::map<std::string, PathSegment *> children; - segmentType type = Dir; - PathSegment() : parent(NULL) {} - PathSegment(std::string _str) : str(_str), parent(NULL) {} + segmentType type = segmentType::Dir; + PathSegment() : parent(nullptr) {} + PathSegment(std::string _str) : str(_str), parent(nullptr) {} PathSegment(std::string _str, int _fileId) - : str(_str), fileId(_fileId), cand(NULL), parent(NULL) {} - int size() { + : str(_str), fileId(_fileId), cand(nullptr), parent(nullptr) {} + [[nodiscard]] int size() const { int sz = str.size(); PathSegment *cur = parent; // Sum up length of parent segments (+1 for divisors) - while (cur->parent != NULL) { + while (cur->parent != nullptr) { sz += cur->str.size() + 1; cur = cur->parent; } return sz; } }; // Candidate for result in string (filename) search -class Candidate { -public: +struct Candidate { std::vector<float> v_charscore; PathSegment *seg; int fileId; // The string that this candidate represents std::string str; @@ -112,29 +158,21 @@ float minscore; float maxscore; int candLen; // Length of candidate Candidate(){}; - Candidate(int _fileId, std::string _str, int _len) : fileId(_fileId), str(_str), len(_len) { - // Initialize v_charscores with zeros - v_charscore.resize(len, 0); - candLen = str.size(); - seg = NULL; - } - Candidate(PathSegment *_seg, int _len) : seg(_seg), len(_len) { // Initialize v_charscores with zeros v_charscore.resize(len, 0); candLen = seg->size(); } - float getScore() { + [[nodiscard]] float getScore() const { int i = 0; float score = 0.0; - candLen = seg->size(); - for (float &charscore : v_charscore) { + for (const float &charscore : v_charscore) { score += charscore; i++; } float div = len * len; float div2 = len * candLen; @@ -143,51 +181,64 @@ score = score1 * 0.97 + score2 * 0.03; return score; } - float operator[](int idx) { return v_charscore[idx]; } + [[nodiscard]] float operator[](int idx) const { return v_charscore[idx]; } }; // This seems to give 10x speed improvement over std::unordered_map typedef ankerl::unordered_dense::map<int64_t, std::set<PathSegment *> *> SegMap; // typedef std::unordered_map<int64_t, std::set<PathSegment *> *> SegMap; -typedef std::unordered_map<float, Candidate> CandMap; +typedef ankerl::unordered_dense::map<int, Candidate *> CandMap; +// typedef std::unordered_map<int, Candidate*> CandMap; class StringIndex { private: int tmp; char dirSeparator = '/'; // Usually '/', '\' or '\0' (no separator) + int numStrings = 0; std::vector<SegMap *> dirmaps; + std::array<std::mutex, 9> mts_d; // for dirmaps std::vector<SegMap *> filemaps; + std::array<std::mutex, 9> mts_f; // for filemaps std::vector<PathSegment *> segsToClean; - std::unordered_map<int, std::string> strlist; std::unordered_map<int, PathSegment *> seglist; PathSegment *root; int dirId = 0; float dirWeight = 0.7; // Give only 70% of score if match is for a directory + std::unique_ptr<ThreadPool> pool; + Output out{1}; // verbose level = 1 + public: - StringIndex() { + StringIndex(char sep) : dirSeparator(sep) { root = new PathSegment(); - root->parent = NULL; + root->parent = nullptr; root->str = "[ROOT]"; for (int i = 0; i <= 8; i++) { dirmaps.push_back(new SegMap); filemaps.push_back(new SegMap); } -#ifdef _OPENMP - std::cout << "OPENMP enabled\n"; -#endif + // Threads between 4 and 6 + // We don't seem to get any benefit from more than 6 threads even if the hardware supports it + int num_threads = std::max((int)std::thread::hardware_concurrency(), 4); + num_threads = std::min(num_threads, 6); + out.printv(2, "Number of threads: ", num_threads); + pool = std::unique_ptr<ThreadPool>(new ThreadPool(num_threads)); } + /* Don't separate path to segments separator=\0. + This is slower, but can be used for other data than files also. */ + StringIndex() : StringIndex('\0') {} + void setDirSeparator(char sep) { dirSeparator = sep; } void setDirWeight(float val) { dirWeight = val; } ~StringIndex() { for (auto x : dirmaps) { @@ -211,31 +262,48 @@ void addStrToIndex(std::string filePath, int fileId) { addStrToIndex(filePath, fileId, dirSeparator); } + void addStrToIndexThreaded(std::string filePath, int fileId) { + pool->enqueue([=] { addStrToIndex(filePath, fileId, dirSeparator); }); + } + void waitUntilReady() const { pool->waitUntilDone(); } + + void waitUntilDone() const { pool->waitUntilDone(); } + + int size() const { return seglist.size(); } + /** - * Add a string to the index to be search for afterwards + * Add a string to the index to be searched for afterwards * * @param filePath String to index (e.g. /home/user/Project/main.cpp). * @param fileId Unique identifier for filePath. Will be return as result from findSimilar. * @param separator Can be used to split filePath to components (e.g. 'home','user'...). Usually * one of {'\\', '/', '\0' (no separation)}. */ + void addStrToIndex(std::string filePath, int fileId, const char &separator) { + out.printv(3, "Add file:", filePath, ",", fileId, ",", separator, ",",dirSeparator); + // If a string with this index has beeen added already + if (seglist.find(fileId) != seglist.end()) { + return; + } + std::vector<std::string> segs; + numStrings += 1; if (separator == '\0') { // No separation to directories & files segs = {filePath}; } else { // Split path to segments segs = splitString(filePath, separator); } - PathSegment *prev = NULL; + PathSegment *prev = nullptr; prev = root; // Add segments to a tree type data structure // e.g. addStrToIndex('/foo/bar/file1.txt' ..) // addStrToIndex('/foo/faa/file2.txt' ..) // forms structure: @@ -243,38 +311,51 @@ // |-> faa -> file2.txt for (auto _x = segs.begin(); _x != segs.end(); ++_x) { auto x = *_x; PathSegment *p; - auto it = prev->children.find(x); + prev->mu.lock(); + // this part of the path already exists in the tree - if (it != prev->children.end()) { + if (auto it = prev->children.find(x); it != prev->children.end()) { p = it->second; + prev->mu.unlock(); } else { p = new PathSegment(x, fileId); p->parent = prev; - // If this is last item in segs + // If this is last item in segs, then it is a file. if (_x == std::prev(segs.end())) { - // therefore, it is a file. - p->type = File; + p->type = segmentType::File; seglist[fileId] = p; - } else { - p->type = Dir; + } else { // otherwise, it is a directory + p->type = segmentType::Dir; p->fileId = dirId; // Files use user input Id. Directories need to have it generated dirId++; } prev->children[x] = p; + prev->mu.unlock(); addPathSegmentKeys(p); } prev = p; } } + std::string getString(int id) { + std::string s = ""; + PathSegment *seg = seglist[id]; + s += seg->str; + while (seg->parent->parent != nullptr) { + seg = seg->parent; + s = seg->str + dirSeparator + s; + } + return s; + } + /** - * The search will find filepaths similar to the input string + The search will find filepaths similar to the input string To be considered a candidate path, the file component of the path (e.g. file.txt) is required to have at least a substring of two characters in common with the query string. If that condition is true, then the directories will also add to the score, although with a smaller weight. @@ -284,65 +365,77 @@ For each character c in the query string: - find the largest substring in the query which includes the character c and is also included in the PathSegment - take the lenght of that substring as score sum up the scores for each character c and divide by (string length)^2 - - For example, if query = "rngnomadriv" + + For example, if query = "rngnomadriv" and candidate is "./drivers/char/hw_random/nomadik-rng.c", then scores are calculated as follows: rngnomadriv 33355555444 (subscores) FFFFFFFFDDD (F=file component, D=dir component) score1=(3+3+3+5+5+5+5+5+(4+4+4)*0.7) In final score, give a small penalty for larger candidate filenames: - Divide main part of score with (query string length)^2 + Divide main part of score with (query string length)^2 and minor part by (query string length)*(candidate string length) score = score1/(11*11)*0.97 + score1/(11*38)*0.03 = 0.342944 @param query String to search for inside the index */ - std::vector<std::pair<float, int>> findSimilar(std::string query, int minChars) { + [[nodiscard]] std::vector<std::pair<float, int>> findSimilar(std::string query) { + return findSimilar(query, 2); + } + + [[nodiscard]] std::vector<std::pair<float, int>> findSimilar(std::string query, int minChars) { CandMap fileCandMap; CandMap dirCandMap; + waitUntilDone(); + // Find both files and directories that match the input query addToCandMap(fileCandMap, query, filemaps); addToCandMap(dirCandMap, query, dirmaps); /* If parent dir of a file matches the input string add the scores of the direcotry to the scores of the file */ mergeCandidateMaps(fileCandMap, dirCandMap); - // Set all candidate pointers to NULL so they won't mess up future searches + // Set all candidate pointers to nullptr so they won't mess up future searches for (auto seg : segsToClean) { - seg->cand = NULL; + seg->cand = nullptr; } segsToClean.clear(); // Form return result, 2d array with file id's and scores std::vector<std::pair<float, int>> results; for (auto &[fid, cand] : fileCandMap) { std::pair<float, int> v; - float sc = cand.getScore(); + float sc = cand->getScore(); v.first = sc; v.second = fid; results.push_back(v); + delete cand; } + + for (auto &[fid, cand] : dirCandMap) { + delete cand; + } + // Sort highest score first std::sort(results.begin(), results.end(), [](std::pair<float, int> a, std::pair<float, int> b) { return a.first > b.first; }); return results; } // Return int64_t representation of the first nchars in str, starting from index i - int64_t getKeyAtIdx(std::string str, int i, int nchars) { + [[nodiscard]] int64_t getKeyAtIdx(const std::string &str, int i, int nchars) const { int64_t key = 0; for (int i_char = 0; i_char < nchars; i_char++) { - key = key | static_cast<int>(str[i + i_char]); + key = key | static_cast<int64_t>(str[i + i_char]); if (i_char < nchars - 1) { // Shift 8 bits to the left except on the last iteration key = key << 8; } } @@ -393,50 +486,58 @@ } if (static_cast<int>(p->str.size()) < maxChars) { maxChars = p->str.size(); } -#ifdef _OPENMP -#pragma omp parallel for -#endif for (int sublen = minChars; sublen <= maxChars; sublen++) { + std::mutex *mu; SegMap *map; - if (p->type == File) { + if (p->type == segmentType::File) { map = filemaps[sublen]; + mu = &mts_f[sublen]; } else { map = dirmaps[sublen]; + mu = &mts_d[sublen]; } int count = str.size() - sublen + 1; + int64_t keys[count + 1]; for (int i = 0; i <= count; i++) { - int64_t key = getKeyAtIdx(str, i, sublen); + keys[i] = getKeyAtIdx(str, i, sublen); + } + mu->lock(); + for (int i = 0; i <= count; i++) { + // int64_t key = getKeyAtIdx(str, i, sublen); + auto key = keys[i]; + // Create a new std::set for key if doesn't exist already auto it = map->find(key); if (it == map->end()) { (*map)[key] = new std::set<PathSegment *>; } (*map)[key]->insert(p); } + mu->unlock(); } } // Find pathsegments from <map> that include the substring of <str> which starts at index <i> and // is of length <nchars>. - std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars, SegMap &map) { + [[nodiscard]] std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars, + SegMap &map) const { assert(i + nchars <= static_cast<int>(str.size())); std::vector<PathSegment *> res; // Take substring of str, starting at i, spanning nchars // transform that to 64 bit integer int64_t key = getKeyAtIdx(str, i, nchars); // Find all path segments in map that have the same substring - auto it = map.find(key); - if (it != map.end()) { // key found + if (auto it = map.find(key); it != map.end()) { // key found auto set = it->second; for (auto value : *set) { res.push_back(value); } } @@ -469,16 +570,16 @@ // Add parent directories scores to files void mergeCandidateMaps(CandMap &fileCandMap, CandMap &dirCandMap) { for (auto &[fid, cand] : fileCandMap) { - PathSegment *p = cand.seg->parent; - while (p->parent != NULL) { - if (p->cand != NULL) { - auto &scoreA = cand.v_charscore; + PathSegment *p = cand->seg->parent; + while (p->parent != nullptr) { + if (p->cand != nullptr) { + auto &scoreA = cand->v_charscore; auto &scoreB = p->cand->v_charscore; - for (int i = 0; i < cand.len; i++) { + for (int i = 0; i < cand->len; i++) { if (scoreA[i] < scoreB[i] * dirWeight) { scoreA[i] = scoreB[i] * dirWeight; } } } @@ -487,20 +588,24 @@ } } void addToResults(PathSegment *seg, std::string str, int i, int nchars, CandMap &candmap) { - auto it2 = candmap.find(seg->fileId); - if (it2 == candmap.end()) { - Candidate cand(seg, str.size()); - seg->cand = &(candmap[seg->fileId]); + if (auto it2 = candmap.find(seg->fileId); it2 == candmap.end()) { + Candidate *cand = new Candidate(seg, str.size()); segsToClean.push_back(seg); candmap[seg->fileId] = cand; + seg->cand = cand; } for (int j = i; j < i + nchars; j++) { - if (candmap[seg->fileId][j] < nchars) { - candmap[seg->fileId].v_charscore[j] = nchars; + Candidate &cand = *(candmap[seg->fileId]); + if (cand[j] < nchars) { + cand.v_charscore[j] = nchars; } } } }; + +} // namespace StrIdx + +#endif