Archived
1
0
Fork 0
This repository has been archived on 2024-10-19. You can view files and clone it, but cannot push or open issues or pull requests.
emacs/org/notdeft/xapian/notdeft-xapian.cc
2022-08-25 09:58:41 -04:00

806 lines
23 KiB
C++

#include <algorithm>
#include <ctype.h>
#include <dirent.h>
#include <fstream>
#include <iostream>
#include <map>
#include <stdexcept>
#include <string.h>
#include <string>
#include <sys/stat.h>
#include <tclap/CmdLine.h>
#include <unistd.h>
#include <vector>
#include <xapian.h>
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
#if !defined(XAPIAN_AT_LEAST)
#define XAPIAN_AT_LEAST(x,y,z) 0
#endif
#if XAPIAN_AT_LEAST(1,3,4) || XAPIAN_AT_LEAST(1,2,2) && !XAPIAN_AT_LEAST(1,3,0)
#define TG_CJK (Xapian::TermGenerator::FLAG_CJK_NGRAM)
#define QP_CJK (Xapian::QueryParser::FLAG_CJK_NGRAM)
#else
#define TG_CJK ((Xapian::TermGenerator::flags)0)
#define QP_CJK (0)
#endif
using namespace std;
namespace NotDeft {
/** Tag exception thrown by the indexer when a note file could not be
read through to end-of-file; it carries no data. */
struct ReadError {};
}
/** Serializes `v` in a sorting friendly way, similarly to
    `Xapian::sortable_serialise`: the result is always exactly 16
    zero-padded lowercase hexadecimal digits. Should be quite portable
    when the argument is coerced from `time_t`, although C does not
    actually even guarantee an integer type in that case.

    Throws `Xapian::AssertionError` if formatting does not produce
    exactly 16 characters. */
static string time_serialize(const int64_t v) {
  char buf[16 + 1];
  // The `x` conversion takes an unsigned argument, so convert
  // explicitly instead of passing a signed value through varargs.
  const int written = snprintf(buf, sizeof buf, "%016" PRIx64,
                               static_cast<uint64_t>(v));
  if (written != 16) {
    // POSIX requires `errno` to be set, but C does not
    throw Xapian::AssertionError("unexpected snprintf failure", errno);
  }
  return string(buf);
}
/** The inverse of `time_serialize`: parses a hexadecimal string back
    into a 64-bit value.

    Throws `Xapian::InvalidArgumentError` if no hexadecimal number can
    be scanned from `s` (which includes the empty string). */
static int64_t time_deserialize(const string& s) {
  // The `x` scan conversion stores through a pointer to an unsigned
  // type, so scan into `uint64_t` and convert afterwards.
  uint64_t raw = 0;
  if (sscanf(s.c_str(), "%" SCNx64, &raw) != 1) {
    throw Xapian::InvalidArgumentError("bad time_deserialize arg", errno);
  }
  return static_cast<int64_t>(raw);
}
/** Returns the length of the note header marker that `s` starts
 *  with. The recognized markers are "#", "%#", "@;#" and "<!--#".
 *  Returns 0 if `s` does not start with any of them. */
static size_t string_header_marker_len(const string& s) {
  // The markers differ in their first character, so at most one of
  // them can be a prefix of `s` and the order of checks is irrelevant.
  static const char* const markers[] = { "#", "%#", "@;#", "<!--#" };
  for (const char* const marker : markers) {
    const size_t marker_len = strlen(marker);
    if (s.compare(0, marker_len, marker) == 0)
      return marker_len;
  }
  return 0;
}
/** If `s` starts with a note header marker, stores the marker length
 *  in `pos` and returns true. Otherwise returns false and leaves
 *  `pos` untouched. */
static bool line_skip_marker(const string& s, size_t& pos) {
  const size_t marker_len = string_header_marker_len(s);
  const bool has_marker = (marker_len != 0);
  if (has_marker)
    pos = marker_len;
  return has_marker;
}
/** Whether the lowercased string 's' matches 'pfx' starting at
 *  position 'pos'. If so, increment 'pos' to index the position after
 *  'pfx'; otherwise 'pos' is left unchanged. Only 's' is lowercased
 *  for the comparison, so 'pfx' must already be in lowercase. */
static bool string_lc_skip_keyword(const string& s,
                                   size_t& pos,
                                   const string& pfx) {
  const size_t pfx_len = pfx.length();
  // Written so that the bounds check cannot wrap around for large 'pos'.
  if (pos > s.length() || s.length() - pos < pfx_len)
    return false;
  for (size_t i = 0; i < pfx_len; ++i) {
    // `tolower` is only defined for values representable as
    // `unsigned char` (or EOF); plain `char` may be negative for the
    // non-ASCII bytes of UTF-8 input.
    const int lowered = tolower(static_cast<unsigned char>(s[pos + i]));
    if (lowered != static_cast<unsigned char>(pfx[i]))
      return false;
  }
  pos += pfx_len;
  return true;
}
/** Whether `s` ends with the suffix `sfx`. An empty `sfx` matches
    any string. */
static bool string_ends_with(const string& s, const string& sfx) {
  const size_t s_len = s.length();
  const size_t sfx_len = sfx.length();
  // Compare the lengths as `size_t` rather than narrowing their
  // difference to a signed integer.
  if (sfx_len > s_len)
    return false;
  return s.compare(s_len - sfx_len, sfx_len, sfx) == 0;
}
/** Whether `s` ends with at least one of the suffixes in `sfxs`.
    Returns false for an empty suffix list. */
static bool string_ends_with_one_of(const string& s,
                                    const vector<string>& sfxs) {
  return std::any_of(sfxs.begin(), sfxs.end(),
                     [&s](const string& candidate) {
                       return string_ends_with(s, candidate);
                     });
}
/** Removes the last occurrence of `sub` from `s`, if there is one.
    Returns whether an occurrence was found. */
static bool drop_substring(string& s, const string& sub) {
  const string::size_type at = s.rfind(sub);
  const bool present = (at != string::npos);
  if (present)
    s.erase(at, sub.length());
  return present;
}
/** Whether `s` consists solely of whitespace, as classified by
    `isspace`. The empty string counts as whitespace. Scanning stops
    at the first NUL character, should `s` contain one. */
static bool whitespace_p(const string& s) {
  for (const char* p = s.c_str(); *p; p++) {
    // `isspace` is only defined for values representable as
    // `unsigned char` (or EOF); plain `char` may be negative for the
    // non-ASCII bytes of UTF-8 input.
    if (!isspace(static_cast<unsigned char>(*p)))
      return false;
  }
  return true;
}
/** Whether `s` looks like an Org drawer line: optional leading
    blanks, then ':', a name, and a closing ':'.

    If `kw` is given, the name must be exactly `kw` (e.g., "END").
    Otherwise any name of at least one character is accepted, where
    name characters are anything but ':' and whitespace.

    If `req_ws` is true, only whitespace may follow the closing ':';
    otherwise anything may follow it. */
static bool org_drawer_line_p(const string& s,
                              const char* kw = nullptr,
                              bool req_ws = false) {
  // The <ctype.h> classifiers are only defined for values
  // representable as `unsigned char` (or EOF); plain `char` may be
  // negative for the non-ASCII bytes of UTF-8 input.
  const auto blank_p = [](const char c) {
    return isblank(static_cast<unsigned char>(c)) != 0;
  };
  const auto space_p = [](const char c) {
    return isspace(static_cast<unsigned char>(c)) != 0;
  };
  const auto name_char_p = [&space_p](const char c) {
    return c && c != ':' && !space_p(c);
  };

  const char* p = s.c_str();
  while (blank_p(*p)) p++;
  if (*p++ != ':') return false;
  if (kw) {
    /* Skip specified keyword, e.g., "END". */
    const size_t len = strlen(kw);
    if (strncmp(p, kw, len) != 0)
      return false;
    p += len;
  } else {
    /* Require a property name of at least one non-whitespace. */
    if (!name_char_p(*p))
      return false;
    p++;
    while (name_char_p(*p))
      p++;
  }
  if (*p != ':') return false;
  if (req_ws) {
    /* Everything after the closing ':' must be whitespace. */
    while (*++p)
      if (!space_p(*p))
        return false;
  }
  return true;
}
/** Returns a copy of `s` in which every character has been mapped
    through `tolower`. */
static string downcase(const string& s) {
  string data(s.length(), '\0');
  // Go through `unsigned char`, as `tolower` is not defined for
  // negative arguments (other than EOF), which plain `char` may yield
  // for non-ASCII bytes.
  std::transform(s.begin(), s.end(), data.begin(),
                 [](const unsigned char c) {
                   return static_cast<char>(tolower(c));
                 });
  return data;
}
/** Whether `file` can be `stat`ed and is a directory. */
static bool file_directory_p(const string& file) {
  struct stat info;
  if (stat(file.c_str(), &info) != 0)
    return false;
  return S_ISDIR(info.st_mode);
}
/** Lists the entry names of directory `file`, leaving out any names
    that are empty, that begin with '.', '_', or '#', or that contain
    a '/'. Returns an empty list if the directory cannot be opened. */
static vector<string> ls(const string& file) {
  vector<string> names;
  DIR* const handle = opendir(file.c_str());
  if (handle != NULL) {
    for (struct dirent* ent = readdir(handle);
         ent != NULL;
         ent = readdir(handle)) {
      const string name(ent->d_name);
      if (name.empty())
        continue;
      const char first = name[0];
      if (first == '.' || first == '_' || first == '#')
        continue;
      if (name.find('/') != string::npos)
        continue;
      names.push_back(name);
    }
    closedir(handle);
  }
  return names;
}
/** Joins directory `x` and name `y` into one pathname. A directory
    of "." yields just `y`, and a "/" separator is inserted only if
    `x` does not already end with one. */
static string file_join(const string& x, const string& y) {
  if (x == ".")
    return y;
  const char* const separator = string_ends_with(x, "/") ? "" : "/";
  return x + separator + y;
}
/** Return the part of `s` that precedes its last '/', i.e., the
    pathname of its parent directory. Return "" if `s` contains no
    '/', or if its only '/' is the leading one (as in "/" or "/x"). */
static string file_directory_path(const string& s) {
  const string::size_type slash = s.rfind('/');
  const bool has_parent = (slash != string::npos) && (slash != 0);
  return has_parent ? s.substr(0, slash) : string();
}
/** Return whatever follows the last '/' in pathname `s`, or return
    `s` itself if it contains no '/'. */
static string file_non_directory(const string& s) {
  const string::size_type slash = s.rfind('/');
  return (slash == string::npos) ? s : s.substr(slash + 1);
}
/** Return the non-directory component of `s`, minus its last
    extension (if any). A leading '.' does not start an extension, so
    a filename that is "all extension" is returned whole. */
static string file_basename(const string& s) {
  string name = file_non_directory(s);
  const string::size_type dot = name.rfind('.');
  if ((dot != string::npos) && (dot != 0))
    name.erase(dot);
  return name;
}
/** Return the last filename extension of `s`, including its leading
    ".", or return "" if there is none. A leading '.' does not start
    an extension, so a filename that is "all extension" has none. */
static string file_extension(const string& s) {
  const string name = file_non_directory(s);
  const string::size_type dot = name.rfind('.');
  const bool has_ext = (dot != string::npos) && (dot != 0);
  return has_ext ? name.substr(dot) : string();
}
/** Recursively collects note files under `root`/`dir`, appending
    their `root`-relative pathnames to `res`. An entry whose name
    ends with one of `exts` is collected if it is not a directory;
    any other entry is descended into if it is a directory. Note that
    a directory whose name ends with one of `exts` is neither
    collected nor descended into. */
static void ls_org(vector<string>& res, const string& root,
                   const string& dir, const vector<string>& exts) {
  const string absDir = file_join(root, dir);
  for (const string& entry : ls(absDir)) {
    const string relPath = file_join(dir, entry);
    const bool isDir = file_directory_p(file_join(absDir, entry));
    const bool hasNoteExt = string_ends_with_one_of(entry, exts);
    if (hasNoteExt && !isDir) {
      res.push_back(relPath);
    } else if (!hasNoteExt && isDir) {
      ls_org(res, root, relPath, exts);
    }
  }
}
/** Whether the character `ch` separates keywords: that is the case
    for ':', ';', ',', and anything for which
    `Xapian::Unicode::is_whitespace` holds. */
static bool uni_keyword_separator_p(const unsigned ch) {
  switch (ch) {
  case ':':
  case ';':
  case ',':
    return true;
  default:
    return Xapian::Unicode::is_whitespace(ch);
  }
}
/** Splits the line `s` into keywords at keyword separators, and
    indexes each keyword with the prefix "K", bumping the term
    position after each one. Expects `s` to be UTF-8 encoded; it is
    decoded with `Xapian::Utf8Iterator`, which (per the original note
    here) reverts to octets for the remaining input if non-UTF-8
    encoding is detected. */
static void uni_index_keywords(Xapian::TermGenerator& indexer,
                               const string& s) {
  Xapian::Utf8Iterator it(s);
  while (true) {
    // Skip over any run of separators preceding the next keyword.
    while (it.left() && uni_keyword_separator_p(*it))
      it++;
    if (!it.left())
      return;
    // The keyword spans the raw bytes up to the next separator or
    // the end of input.
    const char* const kw_begin = it.raw();
    while (it.left() && !uni_keyword_separator_p(*it))
      it++;
    const string keyword(kw_begin, it.raw());
    indexer.index_text(keyword, 0, "K");
    indexer.increase_termpos();
  }
}
/** An indexing instruction concerning one directory `dir`: either to
    index everything found under it (`whole_dir`), or to index just
    the listed `files`. */
struct Op {
  // Initialized here so that a default-constructed `Op` never holds
  // an indeterminate flag.
  bool whole_dir = false;
  string dir;
  vector<string> files;
  /** Constructs an instruction with no directory or files set, and
      `whole_dir` false. */
  Op() = default;
  /** Constructs an instruction covering all of directory `d`. */
  explicit Op(const string& d) : whole_dir(true), dir(d) {}
};
/** Reads indexing instructions from `in`, one item per line, and
    appends them to `lst`. Two commands are understood:

      :idir     followed by a directory name
      :ifiles   followed by a directory name, a file count, and
                then that many file names

    Returns true if all input up to end-of-stream was parsed, and
    false on the first malformed or incomplete command. Instructions
    parsed before a failure remain in `lst`. */
static bool parse_ops(istream& in, vector<Op>& lst) {
  string opcode;
  while (getline(in, opcode)) {
    if (opcode == ":idir") {
      string dir;
      if (!getline(in, dir))
        return false; // expected directory name
      lst.push_back(Op(dir));
    } else if (opcode == ":ifiles") {
      string dir;
      if (!getline(in, dir))
        return false; // expected directory name
      string count_s;
      if (!getline(in, count_s))
        return false; // expected file count
      int count = 0;
      try {
        count = std::stoi(count_s);
      } catch (const std::logic_error&) {
        // `std::stoi` throws `std::invalid_argument` or
        // `std::out_of_range` (both are `std::logic_error`s) for a
        // count that is not a number or does not fit; treat that as a
        // parse failure rather than letting it terminate the program.
        return false; // expected non-negative integer
      }
      if (count < 0)
        return false; // expected non-negative integer
      Op op;
      op.whole_dir = false;
      op.dir = dir;
      string file;
      for ( ; count > 0; count--) {
        if (!getline(in, file))
          return false; // expected count filenames
        op.files.push_back(file);
      }
      lst.push_back(op);
    } else {
      return false; // unknown command
    }
  }
  return true;
}
/** Prints a brief summary of the command-line usage to STDERR. */
static void usage()
{
  cerr << "notdeft-xapian" << endl
       << "USAGE:" << endl
       << "To build/refresh search indices" << endl
       << "(for specified directories):" << endl
       << " notdeft-xapian index [options] directory..." << endl
       << "To find text documents" << endl
       << "(matching the specified query):" << endl
       << " notdeft-xapian search [options] directory..." << endl;
}
/** Document value slot holding the file modification time, in the
    format produced by `time_serialize`. */
static constexpr Xapian::valueno DOC_MTIME = 0;
/** Document value slot holding the non-directory part of the file
    pathname. */
static constexpr Xapian::valueno DOC_FILENAME = 1;
/** Implements the "index" subcommand. Parses `subArgs` as command
    line options, and then creates or refreshes a ".notdeft-db"
    database within each directory to index. The directories to index
    are those given as positional arguments, plus any instructions
    read from STDIN if the input option is given. A directory that is
    not a readable and writable directory is passed over without a
    message. For every indexed directory, files new to the file
    system get added to its database, files whose modification time
    differs get re-indexed, and files no longer found get removed.

    Returns 0 on success, and 1 if changing the working directory
    fails, if the STDIN instructions fail to parse, or if a
    `Xapian::Error` gets thrown. */
static int doIndex(vector<string> subArgs) {
TCLAP::CmdLine cmdLine
("Specify any indexing commands via STDIN."
" For each command, specify its database index directory."
" All paths are used and stored as given."
" Search results are reported with the stored paths,"
" regardless of the search-time working directory.");
TCLAP::ValueArg<string>
langArg("l", "lang", "stemming language (e.g., 'en' or 'fi')",
false, "en", "language");
cmdLine.add(langArg);
TCLAP::MultiArg<string>
extArg("x", "extension", "filename extension (default: '.org')",
false, "extension");
cmdLine.add(extArg);
TCLAP::ValueArg<string>
chdirArg("c", "chdir", "change working directory first",
false, ".", "directory");
cmdLine.add(chdirArg);
TCLAP::SwitchArg
resetArg("r", "recreate", "recreate database", false);
cmdLine.add(resetArg);
TCLAP::ValueArg<int>
titleArg("t", "title-wdf", "title importance (default: 10)",
false, 10, "wdf_inc");
cmdLine.add(titleArg);
TCLAP::SwitchArg
verboseArg("v", "verbose", "be verbose", false);
cmdLine.add(verboseArg);
TCLAP::SwitchArg
inputArg("i", "input", "read instructions from STDIN", false);
cmdLine.add(inputArg);
TCLAP::SwitchArg
skipDrawersArg("", "allow-org-property-drawers",
"allow Org :PROPERTIES: drawers in header", false);
cmdLine.add(skipDrawersArg);
TCLAP::UnlabeledMultiArg<string>
dirsArg("directory...", "index specified dirs", false, "directory");
cmdLine.add(dirsArg);
cmdLine.parse(subArgs);
// Change the working directory first, before any of the given
// paths get used.
if (chdirArg.getValue() != ".") {
if (chdir(chdirArg.getValue().c_str()) == -1) {
// Save `errno` before doing any output.
auto e = errno;
cerr << "could not change into directory " <<
chdirArg.getValue() << " (errno: " << e << ")" << endl;
return 1;
}
}
// Default to ".org" when no extensions were specified.
vector<string> exts = extArg.getValue();
if (exts.empty())
exts.push_back(".org");
auto verbose = verboseArg.getValue();
// A ":cjk" within the language argument is not part of the stemmer
// name; it is removed, and it selects `TG_CJK` for the indexer.
string lang(langArg.getValue());
bool cjk = drop_substring(lang, ":cjk");
// The instructions: first a whole-directory one for each positional
// argument, and then any that are read from STDIN.
vector<Op> opList;
{
auto dirs = dirsArg.getValue();
for (auto dir : dirs) {
opList.push_back(Op(dir));
}
}
if (inputArg.getValue()) {
if (!parse_ops(cin, opList)) {
cerr << "option -i / --input given, "
"but failed to parse instructions from STDIN" << endl;
if (verbose) { // print out parsed instructions
cerr << "successfully parsed:" << endl;
ostream& out(cerr);
for (auto op : opList) {
out << op.dir;
if (op.whole_dir) {
out << endl << " (ALL)" << endl;
} else {
for (auto file : op.files) {
out << endl << " " << file;
}
out << endl;
}
}
}
return 1;
}
}
try {
// A single indexer is shared by all directories and documents.
Xapian::TermGenerator indexer;
Xapian::Stem stemmer(lang);
indexer.set_stemmer(stemmer);
indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_SOME);
if (cjk)
indexer.set_flags(TG_CJK);
for (auto op : opList) {
auto dir = op.dir;
struct stat sb;
// Whether a readable and writable directory.
if ((stat(dir.c_str(), &sb) == 0) && S_ISDIR(sb.st_mode) &&
(access(dir.c_str(), R_OK|W_OK) != -1)) {
if (verbose) {
cerr << "indexing directory " << dir << endl;
}
// Each directory has its own database, stored within it.
string dbFile(file_join(dir, ".notdeft-db"));
Xapian::WritableDatabase db(dbFile,
resetArg.getValue() ?
Xapian::DB_CREATE_OR_OVERWRITE :
Xapian::DB_CREATE_OR_OPEN);
// These maps are keyed by pathname (`dir` joined with the `dir`
// relative name), and iterate in key order, as relied upon by
// the merge loop further below.
map<string, int64_t> fsFiles; // mtimes for files in file system
map<string, int64_t> dbFiles; // mtimes for files in database
map<string, Xapian::docid> dbIds;
vector<string> orgFiles;
if (op.whole_dir) {
ls_org(orgFiles, dir, ".", exts);
} else {
// Sparse directory paths must be specified relative to
// their database root.
orgFiles = op.files;
}
// Record the mtime of each file that can be `stat`ed; any file
// that cannot be is left out, as if it did not exist.
for (const string& file : orgFiles) { // `dir` relative `file`
auto filePath = file_join(dir, file);
struct stat sb;
if (stat(filePath.c_str(), &sb) == 0) {
fsFiles[filePath] = sb.st_mtime;
}
}
db.begin_transaction(false);
// Collect the pathname, mtime, and document ID of every document
// in the database; the posting list of the empty term is iterated
// in order to visit them all.
for (Xapian::PostingIterator it = db.postlist_begin("");
it != db.postlist_end(""); ++it) {
auto docId = *it;
auto doc = db.get_document(docId);
auto filePath = doc.get_data();
auto t = time_deserialize(doc.get_value(DOC_MTIME));
// Overwrites any existing value of the same key, and thus
// there will be no dupes in `dbFiles`, even if the database
// should have some.
dbFiles[filePath] = t;
dbIds[filePath] = docId;
}
{
// Builds a document for the file `x.first`, whose mtime is
// `x.second`. The pathname is stored as the document data. Throws
// `NotDeft::ReadError` unless the file gets read through to
// end-of-file.
auto makeDoc = [&] (const pair<string, int64_t>& x) {
const string& filePath = x.first;
ifstream infile(filePath);
Xapian::Document doc;
doc.set_data(filePath);
doc.add_value(DOC_MTIME, time_serialize(x.second));
const string fileNonDir = file_non_directory(filePath);
doc.add_value(DOC_FILENAME, fileNonDir);
indexer.set_document(doc);
{
// The directory part of the pathname, with prefix "P".
const string fileDir = file_directory_path(filePath);
indexer.index_text(fileDir, 1, "P");
}
{
// The filename without its extension, with prefix "F".
const string fileBase = file_basename(fileNonDir);
indexer.index_text(fileBase, 1, "F");
}
{
/* As for Omega, lowercase, without dot, and just "E"
for the no extension case. */
string fileExt = file_extension(fileNonDir);
if (!fileExt.empty()) {
fileExt = downcase(fileExt.substr(1));
}
//doc.add_boolean_term("E" + fileExt);
indexer.index_text_without_positions(fileExt, 0, "E");
//cerr << "ext: '" << fileExt << "'" << endl;
}
{
// Process any header lines at the top of the file, and from the
// first non-header line onward treat all lines as body text.
string line;
bool titleDone = false;
// Set by `line_skip_marker` to index past the header marker.
size_t pos = 0;
while (getline(infile, line)) {
if (whitespace_p(line)) {
// skip blank line
} else if (skipDrawersArg.getValue() &&
org_drawer_line_p(line, "PROPERTIES", true)) {
while (getline(infile, line)) {
// skip Org drawer
if (org_drawer_line_p(line, "END"))
break;
if (!org_drawer_line_p(line))
break; // unclosed drawer
}
} else if (!line_skip_marker(line, pos)) {
// non Org header mode
if (!titleDone) {
// With no title keyword seen, the first non-header line gets
// indexed as the title: with prefix "S", and also without a
// prefix, using the title weight.
indexer.index_text(line, 1, "S");
indexer.index_text(line, titleArg.getValue());
indexer.increase_termpos();
} else {
indexer.index_text(line);
}
// The remainder of the file is all body text.
while (getline(infile, line)) {
//cerr << "body line: '" << line << "'" << endl;
indexer.index_text(line);
}
break;
} else if (string_lc_skip_keyword(line, pos, "+title:")) {
// A title keyword line: index the rest of the line with prefix
// "S", and also without a prefix, using the title weight.
const string s = line.substr(pos);
indexer.index_text(s, 1, "S");
indexer.index_text(s, titleArg.getValue());
indexer.increase_termpos();
titleDone = true;
} else if (string_lc_skip_keyword(line, pos, "+keywords:") ||
string_lc_skip_keyword(line, pos, "+filetags:")) {
// A keyword line: index the rest of the line both as "K"
// prefixed keywords and as ordinary text.
const string s = line.substr(pos);
uni_index_keywords(indexer, s);
indexer.index_text(s);
indexer.increase_termpos();
} else {
// skip comment (or unknown property) line
}
}
}
// Reading must have ended due to end-of-file; this also fails for
// a file that could not be opened at all.
if (!infile.eof())
throw NotDeft::ReadError();
return doc;
}; // end makeDoc
// Indexes a file that has no document in the database.
auto addFile = [&] (const pair<string, int64_t>& x) {
if (verbose)
cerr << "indexing file " << x.first << endl;
try {
Xapian::Document doc = makeDoc(x);
db.add_document(doc);
} catch (const NotDeft::ReadError& e) {
// File not (fully) readable, so don't index.
}
};
// Replaces the document `docId` of a file with a fresh one.
auto updateFile = [&] (const pair<string, int64_t>& x,
Xapian::docid docId) {
if (verbose)
cerr << "re-indexing file " << x.first << endl;
try {
Xapian::Document doc = makeDoc(x);
db.replace_document(docId, doc);
} catch (const NotDeft::ReadError& e) {
// File no longer (fully) readable, so remove from index.
db.delete_document(docId);
}
};
// Removes the document of a file from the database.
auto rmFile = [&] (const pair<string, int64_t>& x) {
if (verbose)
cerr << "de-indexing file " << x.first << endl;
auto docId = dbIds[x.first];
db.delete_document(docId);
};
// Walk the two key-ordered maps in parallel, in the manner of a
// merge, comparing pathnames in order to tell apart the added,
// deleted, and retained files.
auto fi = fsFiles.cbegin();
auto di = dbFiles.cbegin();
for (;;) {
if (fi == fsFiles.cend()) {
// The remaining files have been deleted.
for ( ; di != dbFiles.cend(); ++di) {
rmFile(*di);
}
break;
} else if (di == dbFiles.cend()) {
// The remaining files are new.
for ( ; fi != fsFiles.cend(); ++fi) {
addFile(*fi);
}
break;
} else if ((*fi).first == (*di).first) {
// The same file in both; any difference in the mtimes (not only
// a newer file system mtime) counts as a modification.
if ((*fi).second != (*di).second) {
// The file has been modified.
updateFile(*fi, dbIds[(*di).first]);
}
fi++;
di++;
} else if ((*fi).first < (*di).first) {
// The file has been added.
addFile(*fi);
fi++;
} else if ((*fi).first > (*di).first) {
// The file has been deleted.
rmFile(*di);
di++;
} else {
throw Xapian::AssertionError("unexpected condition");
}
} // end `for`
}
db.commit_transaction();
}
}
} catch (const Xapian::Error &e) {
cerr << e.get_description() << endl;
return 1;
}
return 0;
}
static int doSearch(vector<string> subArgs) {
TCLAP::CmdLine cmdLine("Specify a query expression as a string.");
TCLAP::ValueArg<string>
langArg("l", "lang", "stemming language (e.g., 'en' or 'fi')",
false, "en", "language");
cmdLine.add(langArg);
TCLAP::ValueArg<string>
queryArg("q", "query", "specifies a query string", false, "", "string");
cmdLine.add(queryArg);
TCLAP::ValueArg<int>
countArg("c", "max-count", "maximum number of results", false, 0, "number");
cmdLine.add(countArg);
TCLAP::SwitchArg
timeArg("t", "time-sort", "sort by modification time", false);
cmdLine.add(timeArg);
TCLAP::SwitchArg
nameArg("f", "name-sort", "sort by file name (overrides '-t')", false);
cmdLine.add(nameArg);
TCLAP::SwitchArg
verboseArg("v", "verbose", "be verbose", false);
cmdLine.add(verboseArg);
TCLAP::SwitchArg
flag_pure_not("n", "pure-not", "allow NOT", false);
cmdLine.add(flag_pure_not);
TCLAP::SwitchArg
flag_boolean_any_case("a", "boolean-any-case",
"allow lowercase operators", false);
cmdLine.add(flag_boolean_any_case);
TCLAP::UnlabeledMultiArg<string>
dirsArg("dir...", "specifies directories to search", false, "directory");
cmdLine.add(dirsArg);
cmdLine.parse(subArgs);
auto maxDocCount = countArg.getValue();
bool nameSort = nameArg.getValue();
bool timeSort = timeArg.getValue();
auto verbose = verboseArg.getValue();
string lang(langArg.getValue());
bool cjk = drop_substring(lang, ":cjk");
try {
Xapian::Database db;
auto dirs = dirsArg.getValue();
int numDbFiles = 0;
for (auto dir : dirs) {
string dbFile(file_join(dir, ".notdeft-db"));
if (access(dbFile.c_str(), R_OK) != -1) {
Xapian::Database dirDb(dbFile);
db.add_database(dirDb);
numDbFiles++;
//cout << "Added database: " << db.get_description() << endl;
}
}
if (numDbFiles == 0)
return 0;
Xapian::Enquire enquire(db);
if (nameSort) // by filename, descending
enquire.set_sort_by_value(DOC_FILENAME, true);
else if (timeSort) // by modification time, descending
enquire.set_sort_by_value(DOC_MTIME, true);
Xapian::QueryParser qp;
qp.add_prefix("path", "P");
qp.add_prefix("file", "F");
qp.add_prefix("ext", "E");
qp.add_prefix("title", "S");
qp.add_prefix("tag", "K");
Xapian::Stem stemmer(lang);
Xapian::Query query;
if (queryArg.getValue() == "") {
query = Xapian::Query::MatchAll;
} else {
qp.set_stemmer(stemmer);
qp.set_database(db);
qp.set_stemming_strategy(Xapian::QueryParser::STEM_SOME);
unsigned flags =
Xapian::QueryParser::FLAG_DEFAULT |
(flag_pure_not.getValue() ?
Xapian::QueryParser::FLAG_PURE_NOT : 0) |
(flag_boolean_any_case.getValue() ?
Xapian::QueryParser::FLAG_BOOLEAN_ANY_CASE : 0) |
(cjk ? QP_CJK : 0);
query = qp.parse_query(queryArg.getValue(), flags);
if (verbose)
cerr << "parsed query is: " << query.get_description() << endl;
}
enquire.set_query(query);
int maxItems = (maxDocCount ? maxDocCount : db.get_doccount());
Xapian::MSet matches = enquire.get_mset(0, maxItems);
for (Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i) {
cout << i.get_document().get_data() << endl;
}
} catch (const Xapian::Error &e) {
cerr << e.get_description() << endl;
return 1;
}
return 0;
}
/** Program entry point. Dispatches on the first argument: "index"
    and "search" run the corresponding subcommand with the remaining
    arguments, whereas "-h" and "--help" print usage and exit with 0.
    With no arguments or an unknown first argument, prints usage and
    exits with 1. */
int main(int argc, const char* argv[])
{
  if (argc < 2) {
    usage();
    return 1;
  }
  const string cmd(argv[1]);
  // The subcommand's own argument list starts with the program name
  // joined with the subcommand name, in place of `argv[0]`.
  vector<string> args;
  args.push_back(string(argv[0]) + " " + cmd);
  for (int i = 2; i < argc; i++)
    args.emplace_back(argv[i]);
  if (cmd == "index")
    return doIndex(args);
  if (cmd == "search")
    return doSearch(args);
  usage();
  const bool helpRequested = (cmd == "-h") || (cmd == "--help");
  return helpRequested ? 0 : 1;
}
/*
Copyright (C) 2017 Tero Hasu
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
See the file GPL-2 for the full text of the GNU GPL.
*/