stridx.hpp in StrIdx-0.1.1 vs stridx.hpp in StrIdx-0.1.4
- old
+ new
@@ -1,23 +1,68 @@
+#ifndef SSSTRIDX_HPP
+#define SSSTRIDX_HPP
+
#include <stdio.h>
#include <stdlib.h>
#include <cassert>
#include <vector>
+#include <array>
#include <iostream>
#include <unordered_map>
#include <set>
#include <algorithm>
#include <sstream>
-#ifdef _OPENMP
-#include <omp.h>
-#endif
+#include <vector>
+#include <mutex>
+#include <thread>
+#include "thread_pool.hpp"
#include "unordered_dense.h"
+namespace StrIdx {
+
+/* Alternative to using std::cout
+ Allows to control verbose level */
+class Output {
+private:
+ int verboseLevel;
+
+public:
+ Output(int verb) : verboseLevel(verb) {}
+ Output() : Output(3) {}
+ ~Output() = default;
+ static void print() {}
+
+ // When calling as print("xxx ",3, " yyy") outputs "xxx 3 yyy"
+ template <typename T, typename... Types> static void print(T var1, Types... var2) {
+ std::cout << var1;
+ print(var2...);
+ }
+
+ // When calling as printl("xxx ",3, " yyy") outputs "xxx 3 yyy\n"
+ template <typename... Types> static void printl(Types... var2) {
+ print(var2...);
+ print("\n");
+ }
+
+ /* When calling as printv(2, "xxx ",3, " yyy") outputs "xxx 3 yyy\n"
+ * if verboseLevel >= 2 (first arg)
+ */
+ template <typename... Types> void printv(int vlevel, Types... var2) {
+ if (verboseLevel < vlevel) {
+ return;
+ }
+ if (verboseLevel >= 3) {
+ print("[v=", vlevel, "] ");
+ }
+ printl(var2...);
+ }
+};
+
// Transforms input string as follows:
// '/foo/bar/file1.txt'
// => vector{"foo", "bar", "file1.txt"}
std::vector<std::string> splitString(const std::string &input, const char &separator) {
std::vector<std::string> result;
@@ -32,20 +77,20 @@
return result;
}
// Convert int64_t to binary string
-std::string int64ToBinaryString(int64_t num) {
+[[nodiscard]] std::string int64ToBinaryString(const int64_t &num) {
std::string result;
for (int i = 63; i >= 0; --i) {
result += ((num >> i) & 1) ? '1' : '0';
}
return result;
}
-// Convert a (8 char) string represented as int64_t to std::string
-std::string int64ToStr(int64_t key) {
+// Debug. Convert a (8 char) string represented as int64_t to std::string
+[[nodiscard]] std::string int64ToStr(const int64_t &key) {
int nchars = 8;
std::string str;
int multip = nchars * 8;
for (int i = 0; i <= nchars; i++) {
char c = (key >> multip) & 255;
@@ -53,57 +98,58 @@
multip -= 8;
}
return str;
}
+// Debug
void printVector(const std::vector<int> &vec) {
for (const auto &value : vec) {
std::cout << value << " ";
}
}
-std::string charToBinaryString(char num) {
+// Debug
+[[nodiscard]] std::string charToBinaryString(const char &chr) {
std::string result;
for (int i = 7; i >= 0; --i) {
- result += ((num >> i) & 1) ? '1' : '0';
+ result += ((chr >> i) & 1) ? '1' : '0';
}
return result;
}
class Candidate;
-enum segmentType { Dir, File };
+enum class segmentType { Dir, File };
// A segment of a file path
// e.g. if path is /foo/bar/baz.txt
// segments are [{root}, foo, bar, baz.txt]
-class PathSegment {
-public:
+struct PathSegment {
std::string str;
int fileId; // (if FILE)
Candidate *cand;
PathSegment *parent;
+ std::mutex mu;
ankerl::unordered_dense::map<std::string, PathSegment *> children;
- segmentType type = Dir;
- PathSegment() : parent(NULL) {}
- PathSegment(std::string _str) : str(_str), parent(NULL) {}
+ segmentType type = segmentType::Dir;
+ PathSegment() : parent(nullptr) {}
+ PathSegment(std::string _str) : str(_str), parent(nullptr) {}
PathSegment(std::string _str, int _fileId)
- : str(_str), fileId(_fileId), cand(NULL), parent(NULL) {}
- int size() {
+ : str(_str), fileId(_fileId), cand(nullptr), parent(nullptr) {}
+ [[nodiscard]] int size() const {
int sz = str.size();
PathSegment *cur = parent;
// Sum up length of parent segments (+1 for divisors)
- while (cur->parent != NULL) {
+ while (cur->parent != nullptr) {
sz += cur->str.size() + 1;
cur = cur->parent;
}
return sz;
}
};
// Candidate for result in string (filename) search
-class Candidate {
-public:
+struct Candidate {
std::vector<float> v_charscore;
PathSegment *seg;
int fileId;
// The string that this candidate represents
std::string str;
@@ -112,29 +158,21 @@
float minscore;
float maxscore;
int candLen; // Length of candidate
Candidate(){};
- Candidate(int _fileId, std::string _str, int _len) : fileId(_fileId), str(_str), len(_len) {
- // Initialize v_charscores with zeros
- v_charscore.resize(len, 0);
- candLen = str.size();
- seg = NULL;
- }
-
Candidate(PathSegment *_seg, int _len) : seg(_seg), len(_len) {
// Initialize v_charscores with zeros
v_charscore.resize(len, 0);
candLen = seg->size();
}
- float getScore() {
+ [[nodiscard]] float getScore() const {
int i = 0;
float score = 0.0;
- candLen = seg->size();
- for (float &charscore : v_charscore) {
+ for (const float &charscore : v_charscore) {
score += charscore;
i++;
}
float div = len * len;
float div2 = len * candLen;
@@ -143,51 +181,64 @@
score = score1 * 0.97 + score2 * 0.03;
return score;
}
- float operator[](int idx) { return v_charscore[idx]; }
+ [[nodiscard]] float operator[](int idx) const { return v_charscore[idx]; }
};
// This seems to give 10x speed improvement over std::unordered_map
typedef ankerl::unordered_dense::map<int64_t, std::set<PathSegment *> *> SegMap;
// typedef std::unordered_map<int64_t, std::set<PathSegment *> *> SegMap;
-typedef std::unordered_map<float, Candidate> CandMap;
+typedef ankerl::unordered_dense::map<int, Candidate *> CandMap;
+// typedef std::unordered_map<int, Candidate*> CandMap;
class StringIndex {
private:
int tmp;
char dirSeparator = '/'; // Usually '/', '\' or '\0' (no separator)
+ int numStrings = 0;
std::vector<SegMap *> dirmaps;
+ std::array<std::mutex, 9> mts_d; // for dirmaps
std::vector<SegMap *> filemaps;
+ std::array<std::mutex, 9> mts_f; // for filemaps
std::vector<PathSegment *> segsToClean;
- std::unordered_map<int, std::string> strlist;
std::unordered_map<int, PathSegment *> seglist;
PathSegment *root;
int dirId = 0;
float dirWeight = 0.7; // Give only 70% of score if match is for a directory
+ std::unique_ptr<ThreadPool> pool;
+ Output out{1}; // verbose level = 1
+
public:
- StringIndex() {
+ StringIndex(char sep) : dirSeparator(sep) {
root = new PathSegment();
- root->parent = NULL;
+ root->parent = nullptr;
root->str = "[ROOT]";
for (int i = 0; i <= 8; i++) {
dirmaps.push_back(new SegMap);
filemaps.push_back(new SegMap);
}
-#ifdef _OPENMP
- std::cout << "OPENMP enabled\n";
-#endif
+ // Threads between 4 and 6
+ // We don't seem to get any benefit from more than 6 threads even if the hardware supports it
+ int num_threads = std::max((int)std::thread::hardware_concurrency(), 4);
+ num_threads = std::min(num_threads, 6);
+ out.printv(2, "Number of threads: ", num_threads);
+ pool = std::unique_ptr<ThreadPool>(new ThreadPool(num_threads));
}
+ /* Don't separate path to segments separator=\0.
+ This is slower, but can be used for other data than files also. */
+ StringIndex() : StringIndex('\0') {}
+
void setDirSeparator(char sep) { dirSeparator = sep; }
void setDirWeight(float val) { dirWeight = val; }
~StringIndex() {
for (auto x : dirmaps) {
@@ -211,31 +262,48 @@
void addStrToIndex(std::string filePath, int fileId) {
addStrToIndex(filePath, fileId, dirSeparator);
}
+ void addStrToIndexThreaded(std::string filePath, int fileId) {
+ pool->enqueue([=] { addStrToIndex(filePath, fileId, dirSeparator); });
+ }
+ void waitUntilReady() const { pool->waitUntilDone(); }
+
+ void waitUntilDone() const { pool->waitUntilDone(); }
+
+ int size() const { return seglist.size(); }
+
/**
- * Add a string to the index to be search for afterwards
+ * Add a string to the index to be searched for afterwards
*
* @param filePath String to index (e.g. /home/user/Project/main.cpp).
* @param fileId Unique identifier for filePath. Will be return as result from findSimilar.
* @param separator Can be used to split filePath to components (e.g. 'home','user'...). Usually
* one of {'\\', '/', '\0' (no separation)}.
*/
+
void addStrToIndex(std::string filePath, int fileId, const char &separator) {
+ out.printv(3, "Add file:", filePath, ",", fileId, ",", separator, ",",dirSeparator);
+ // If a string with this index has beeen added already
+ if (seglist.find(fileId) != seglist.end()) {
+ return;
+ }
+
std::vector<std::string> segs;
+ numStrings += 1;
if (separator == '\0') {
// No separation to directories & files
segs = {filePath};
} else {
// Split path to segments
segs = splitString(filePath, separator);
}
- PathSegment *prev = NULL;
+ PathSegment *prev = nullptr;
prev = root;
// Add segments to a tree type data structure
// e.g. addStrToIndex('/foo/bar/file1.txt' ..)
// addStrToIndex('/foo/faa/file2.txt' ..)
// forms structure:
@@ -243,38 +311,51 @@
// |-> faa -> file2.txt
for (auto _x = segs.begin(); _x != segs.end(); ++_x) {
auto x = *_x;
PathSegment *p;
- auto it = prev->children.find(x);
+ prev->mu.lock();
+
// this part of the path already exists in the tree
- if (it != prev->children.end()) {
+ if (auto it = prev->children.find(x); it != prev->children.end()) {
p = it->second;
+ prev->mu.unlock();
} else {
p = new PathSegment(x, fileId);
p->parent = prev;
- // If this is last item in segs
+ // If this is last item in segs, then it is a file.
if (_x == std::prev(segs.end())) {
- // therefore, it is a file.
- p->type = File;
+ p->type = segmentType::File;
seglist[fileId] = p;
- } else {
- p->type = Dir;
+ } else { // otherwise, it is a directory
+ p->type = segmentType::Dir;
p->fileId = dirId;
// Files use user input Id. Directories need to have it generated
dirId++;
}
prev->children[x] = p;
+ prev->mu.unlock();
addPathSegmentKeys(p);
}
prev = p;
}
}
+ std::string getString(int id) {
+ std::string s = "";
+ PathSegment *seg = seglist[id];
+ s += seg->str;
+ while (seg->parent->parent != nullptr) {
+ seg = seg->parent;
+ s = seg->str + dirSeparator + s;
+ }
+ return s;
+ }
+
/**
- * The search will find filepaths similar to the input string
+ The search will find filepaths similar to the input string
To be considered a candidate path, the file component of the path (e.g. file.txt)
is required to have at least a substring of two characters in common with the
query string. If that condition is true, then the directories will also add to the
score, although with a smaller weight.
@@ -284,65 +365,77 @@
For each character c in the query string:
- find the largest substring in the query which includes the character c and
is also included in the PathSegment
- take the lenght of that substring as score
sum up the scores for each character c and divide by (string length)^2
-
- For example, if query = "rngnomadriv"
+
+ For example, if query = "rngnomadriv"
and candidate is "./drivers/char/hw_random/nomadik-rng.c", then scores are calculated
as follows:
rngnomadriv
33355555444 (subscores)
FFFFFFFFDDD (F=file component, D=dir component)
score1=(3+3+3+5+5+5+5+5+(4+4+4)*0.7)
In final score, give a small penalty for larger candidate filenames:
- Divide main part of score with (query string length)^2
+ Divide main part of score with (query string length)^2
and minor part by (query string length)*(candidate string length)
score = score1/(11*11)*0.97 + score1/(11*38)*0.03 = 0.342944
@param query String to search for inside the index
*/
- std::vector<std::pair<float, int>> findSimilar(std::string query, int minChars) {
+ [[nodiscard]] std::vector<std::pair<float, int>> findSimilar(std::string query) {
+ return findSimilar(query, 2);
+ }
+
+ [[nodiscard]] std::vector<std::pair<float, int>> findSimilar(std::string query, int minChars) {
CandMap fileCandMap;
CandMap dirCandMap;
+ waitUntilDone();
+
// Find both files and directories that match the input query
addToCandMap(fileCandMap, query, filemaps);
addToCandMap(dirCandMap, query, dirmaps);
/* If parent dir of a file matches the input string add the scores of the direcotry to the
scores of the file */
mergeCandidateMaps(fileCandMap, dirCandMap);
- // Set all candidate pointers to NULL so they won't mess up future searches
+ // Set all candidate pointers to nullptr so they won't mess up future searches
for (auto seg : segsToClean) {
- seg->cand = NULL;
+ seg->cand = nullptr;
}
segsToClean.clear();
// Form return result, 2d array with file id's and scores
std::vector<std::pair<float, int>> results;
for (auto &[fid, cand] : fileCandMap) {
std::pair<float, int> v;
- float sc = cand.getScore();
+ float sc = cand->getScore();
v.first = sc;
v.second = fid;
results.push_back(v);
+ delete cand;
}
+
+ for (auto &[fid, cand] : dirCandMap) {
+ delete cand;
+ }
+
// Sort highest score first
std::sort(results.begin(), results.end(),
[](std::pair<float, int> a, std::pair<float, int> b) { return a.first > b.first; });
return results;
}
// Return int64_t representation of the first nchars in str, starting from index i
- int64_t getKeyAtIdx(std::string str, int i, int nchars) {
+ [[nodiscard]] int64_t getKeyAtIdx(const std::string &str, int i, int nchars) const {
int64_t key = 0;
for (int i_char = 0; i_char < nchars; i_char++) {
- key = key | static_cast<int>(str[i + i_char]);
+ key = key | static_cast<int64_t>(str[i + i_char]);
if (i_char < nchars - 1) {
// Shift 8 bits to the left except on the last iteration
key = key << 8;
}
}
@@ -393,50 +486,58 @@
}
if (static_cast<int>(p->str.size()) < maxChars) {
maxChars = p->str.size();
}
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
for (int sublen = minChars; sublen <= maxChars; sublen++) {
+ std::mutex *mu;
SegMap *map;
- if (p->type == File) {
+ if (p->type == segmentType::File) {
map = filemaps[sublen];
+ mu = &mts_f[sublen];
} else {
map = dirmaps[sublen];
+ mu = &mts_d[sublen];
}
int count = str.size() - sublen + 1;
+ int64_t keys[count + 1];
for (int i = 0; i <= count; i++) {
- int64_t key = getKeyAtIdx(str, i, sublen);
+ keys[i] = getKeyAtIdx(str, i, sublen);
+ }
+ mu->lock();
+ for (int i = 0; i <= count; i++) {
+ // int64_t key = getKeyAtIdx(str, i, sublen);
+ auto key = keys[i];
+
// Create a new std::set for key if doesn't exist already
auto it = map->find(key);
if (it == map->end()) {
(*map)[key] = new std::set<PathSegment *>;
}
(*map)[key]->insert(p);
}
+ mu->unlock();
}
}
// Find pathsegments from <map> that include the substring of <str> which starts at index <i> and
// is of length <nchars>.
- std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars, SegMap &map) {
+ [[nodiscard]] std::vector<PathSegment *> findSimilarForNgram(std::string str, int i, int nchars,
+ SegMap &map) const {
assert(i + nchars <= static_cast<int>(str.size()));
std::vector<PathSegment *> res;
// Take substring of str, starting at i, spanning nchars
// transform that to 64 bit integer
int64_t key = getKeyAtIdx(str, i, nchars);
// Find all path segments in map that have the same substring
- auto it = map.find(key);
- if (it != map.end()) { // key found
+ if (auto it = map.find(key); it != map.end()) { // key found
auto set = it->second;
for (auto value : *set) {
res.push_back(value);
}
}
@@ -469,16 +570,16 @@
// Add parent directories scores to files
void mergeCandidateMaps(CandMap &fileCandMap, CandMap &dirCandMap) {
for (auto &[fid, cand] : fileCandMap) {
- PathSegment *p = cand.seg->parent;
- while (p->parent != NULL) {
- if (p->cand != NULL) {
- auto &scoreA = cand.v_charscore;
+ PathSegment *p = cand->seg->parent;
+ while (p->parent != nullptr) {
+ if (p->cand != nullptr) {
+ auto &scoreA = cand->v_charscore;
auto &scoreB = p->cand->v_charscore;
- for (int i = 0; i < cand.len; i++) {
+ for (int i = 0; i < cand->len; i++) {
if (scoreA[i] < scoreB[i] * dirWeight) {
scoreA[i] = scoreB[i] * dirWeight;
}
}
}
@@ -487,20 +588,24 @@
}
}
void addToResults(PathSegment *seg, std::string str, int i, int nchars, CandMap &candmap) {
- auto it2 = candmap.find(seg->fileId);
- if (it2 == candmap.end()) {
- Candidate cand(seg, str.size());
- seg->cand = &(candmap[seg->fileId]);
+ if (auto it2 = candmap.find(seg->fileId); it2 == candmap.end()) {
+ Candidate *cand = new Candidate(seg, str.size());
segsToClean.push_back(seg);
candmap[seg->fileId] = cand;
+ seg->cand = cand;
}
for (int j = i; j < i + nchars; j++) {
- if (candmap[seg->fileId][j] < nchars) {
- candmap[seg->fileId].v_charscore[j] = nchars;
+ Candidate &cand = *(candmap[seg->fileId]);
+ if (cand[j] < nchars) {
+ cand.v_charscore[j] = nchars;
}
}
}
};
+
+} // namespace StrIdx
+
+#endif