fix(SQLParser): move to Data dir; add extradirs, remove vs 140,150 build scripts generation

This commit is contained in:
Alex Fabijanic
2024-02-13 14:04:23 +01:00
parent 3261ebbd42
commit ee39b611f2
167 changed files with 780 additions and 792 deletions

View File

@@ -0,0 +1,14 @@
# Benchmark
This directory contains the scripts to execute benchmarks of the parser. We use [Google Benchmark](https://github.com/google/benchmark) to define and run benchmarks.
## Install Google Benchmark
```bash
cmake -DCMAKE_BUILD_TYPE=Release .
make
make install
```

View File

@@ -0,0 +1,28 @@
#include "benchmark/benchmark.h"
#include "benchmark_utils.h"
#include "queries.h"
// Entry point of the benchmark binary.
//
// Registers a "<name>-parse" and a "<name>-tokenize" benchmark for every query
// returned by getTPCHQueries() and for every entry of sql_queries, then hands
// the command line to Google Benchmark and runs the selected benchmarks.
int main(int argc, char** argv) {
  // Registers the parse benchmark followed by the tokenize benchmark for one query.
  const auto registerQueryBenchmarks = [](const std::string& name, const std::string& sql) {
    const std::string parse_name = name + "-parse";
    benchmark::RegisterBenchmark(parse_name.c_str(), &BM_ParseBenchmark, sql);

    const std::string tokenize_name = name + "-tokenize";
    benchmark::RegisterBenchmark(tokenize_name.c_str(), &BM_TokenizeBenchmark, sql);
  };

  // Create parse and tokenize benchmarks for TPC-H queries.
  const auto tpch_queries = getTPCHQueries();
  for (const auto& entry : tpch_queries) {
    registerQueryBenchmarks(entry.first, entry.second);
  }

  // Create parse and tokenize benchmarks for all queries in sql_queries array.
  for (unsigned idx = 0; idx < sql_queries.size(); ++idx) {
    registerQueryBenchmarks(getQueryName(idx), sql_queries[idx].second);
  }

  benchmark::Initialize(&argc, argv);
  benchmark::RunSpecifiedBenchmarks();
}

View File

@@ -0,0 +1,44 @@
#include "benchmark_utils.h"
#include <fstream>
#include <iostream>
#include "SQLParser.h"
// Runs hsql::SQLParser::tokenize on `query` with an initially empty output
// vector and returns the size of that vector afterwards.
size_t getNumTokens(const std::string& query) {
  std::vector<int16_t> token_buffer;
  hsql::SQLParser::tokenize(query, &token_buffer);

  const size_t token_count = token_buffer.size();
  return token_count;
}
// Google Benchmark body that measures hsql::SQLParser::tokenize on `query`.
//
// Before the timed loop it publishes two counters: "num_tokens" (result of
// getNumTokens(query)) and "num_chars" (query.size()).
void BM_TokenizeBenchmark(benchmark::State& st, const std::string& query) {
  st.counters["num_tokens"] = getNumTokens(query);
  st.counters["num_chars"] = query.size();

  while (st.KeepRunning()) {
    // getNumTokens() hands tokenize an empty vector and reads its size
    // afterwards, so tokenize is expected to grow the vector itself. Reserve
    // capacity up front instead of size-constructing 512 zero-valued
    // placeholder elements that are not tokens.
    std::vector<int16_t> tokens;
    tokens.reserve(512);
    hsql::SQLParser::tokenize(query, &tokens);
  }
}
// Google Benchmark body that measures hsql::SQLParser::parse on `query`.
//
// Publishes the "num_tokens" and "num_chars" counters before the timed loop.
// When a parse result is not valid, the query and the parser's error message
// are written to stdout and the benchmark is flagged via SkipWithError.
void BM_ParseBenchmark(benchmark::State& st, const std::string& query) {
  st.counters["num_tokens"] = getNumTokens(query);
  st.counters["num_chars"] = query.size();

  while (st.KeepRunning()) {
    hsql::SQLParserResult parsed;
    hsql::SQLParser::parse(query, &parsed);

    if (parsed.isValid()) {
      continue;
    }

    // Report which query failed and why before flagging the run.
    std::cout << query << std::endl;
    std::cout << parsed.errorMsg() << std::endl;
    st.SkipWithError("Parsing failed!");
  }
}
// Reads the whole file at `file_path` and returns its contents as a string.
// A file that cannot be opened yields an empty string; no error is reported.
std::string readFileContents(const std::string& file_path) {
  std::ifstream input(file_path.c_str());

  const std::istreambuf_iterator<char> first(input);
  const std::istreambuf_iterator<char> last;
  return std::string(first, last);
}

View File

@@ -0,0 +1,41 @@
#ifndef __BENCHMARK_UTILS_H__
#define __BENCHMARK_UTILS_H__
#include "benchmark/benchmark.h"
// Returns the number of tokens hsql::SQLParser::tokenize produces for `query`.
size_t getNumTokens(const std::string& query);
// Benchmark body: repeatedly tokenizes `query`; reports the "num_tokens" and
// "num_chars" counters on `st`.
void BM_TokenizeBenchmark(benchmark::State& st, const std::string& query);
// Benchmark body: repeatedly parses `query`; reports the "num_tokens" and
// "num_chars" counters on `st` and flags the run with SkipWithError when the
// parse result is not valid.
void BM_ParseBenchmark(benchmark::State& st, const std::string& query);
// Returns the full contents of the file at `file_path` (empty string when the
// file cannot be opened).
std::string readFileContents(const std::string& file_path);
// Expands to `end - start` converted to std::chrono::duration<double>
// (i.e. seconds as a double).
// NOTE: the expansion already ends in ';' and the arguments are not
// parenthesized, so the macro can only be used as the tail of a statement and
// with simple operands. Requires <chrono> at the point of use.
#define TIME_DIFF(end, start)\
std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
// Expands to a call of std::chrono::high_resolution_clock::now().
// NOTE: the expansion already ends in ';', so the macro cannot be used inside
// a larger expression. Requires <chrono> at the point of use.
#define NOW()\
std::chrono::high_resolution_clock::now();
// Defines a static benchmark function called `name` that forwards to
// BM_ParseBenchmark(st, query) and registers it with BENCHMARK(name).
// The expansion ends in ';', so no semicolon is needed after the macro call.
#define PARSE_QUERY_BENCHMARK(name, query)\
static void name(benchmark::State& st) {\
BM_ParseBenchmark(st, query);\
}\
BENCHMARK(name);
// Defines a static benchmark function called `name` that forwards to
// BM_TokenizeBenchmark(st, query) and registers it with BENCHMARK(name).
// The expansion ends in ';', so no semicolon is needed after the macro call.
#define TOKENIZE_QUERY_BENCHMARK(name, query)\
static void name(benchmark::State& st) {\
BM_TokenizeBenchmark(st, query);\
}\
BENCHMARK(name);
// Defines and registers both benchmarks for `query`: a tokenize benchmark
// named `<test_name>Tokenize` and a parse benchmark named `<test_name>Parse`.
#define BENCHMARK_QUERY(test_name, query)\
TOKENIZE_QUERY_BENCHMARK(test_name##Tokenize, query)\
PARSE_QUERY_BENCHMARK(test_name##Parse, query)
#endif

View File

@@ -0,0 +1,87 @@
#include <chrono>
#include <sstream>
#include "benchmark/benchmark.h"
#include "SQLParser.h"
#include "parser/bison_parser.h"
#include "parser/flex_lexer.h"
#include "benchmark_utils.h"
// Benchmark the influence of increasing size of the query, while
// the number of tokens remains unchanged.
//
// st.range(0) is the total length of the generated query in characters. The
// query is "SELECT <filler> FROM test;" where <filler> is a run of 'a'
// characters long enough to reach that length.
static void BM_CharacterCount(benchmark::State& st) {
  const size_t querySize = st.range(0);

  // Base query has size of 18 characters once the 6-character "%name%"
  // placeholder is taken out.
  const size_t baseSize = 18;
  std::string query = "SELECT %name% FROM test;";

  // The subtraction below is unsigned: a size at or below the base size would
  // either wrap around or leave an empty column name, so reject it up front.
  if (querySize <= baseSize) {
    st.SkipWithError("Query size too small for the base query!");
    return;
  }

  // size_t rather than the non-standard `uint` typedef, which is not provided
  // by every toolchain.
  const size_t pad = querySize - baseSize;
  const std::string filler(pad, 'a');
  // Swap the 6-character placeholder starting at index 7 for the filler.
  query.replace(7, 6, filler);

  st.counters["num_tokens"] = getNumTokens(query);
  st.counters["num_chars"] = query.size();

  while (st.KeepRunning()) {
    hsql::SQLParserResult result;
    hsql::SQLParser::parse(query, &result);
  }
}
// Register BM_CharacterCount. The first argument range (read through
// st.range(0) as the query size) runs from 1 << 5 to 1 << 15 with a range
// multiplier of 1 << 2. The second range is fixed at 5 and is not read by
// BM_CharacterCount.
BENCHMARK(BM_CharacterCount)
->RangeMultiplier(1 << 2)
->Ranges({{1 << 5, 1 << 15},
{5, 5}});
// Benchmark the influence of increasing number of tokens, while
// the number of characters remains unchanged.
//
// st.range(0) is the target query length in characters and st.range(1) the
// target number of tokens. Tokens are added as " WHERE a" / " AND a" clauses
// (each counted as two tokens); the remaining characters are filled by
// widening the selected column name.
static void BM_ConditionalTokens(benchmark::State& st) {
  const size_t targetSize = st.range(0);
  const size_t numTokens = st.range(1);

  // The base query is accounted for as 4 tokens (its four words).
  const size_t baseTokens = 4;
  std::string query = "SELECT * FROM test";

  // Create conditional. All arithmetic is unsigned, so never subtract below
  // zero: a token count under the base count adds no clause, and an odd
  // remainder is dropped instead of wrapping around.
  std::stringstream condStream;
  size_t missingTokens = numTokens > baseTokens ? numTokens - baseTokens : 0;
  if (missingTokens > 0) {
    condStream << " WHERE a";
    missingTokens = missingTokens >= 2 ? missingTokens - 2 : 0;
    while (missingTokens >= 2) {
      condStream << " AND a";
      missingTokens -= 2;
    }
  }
  query += condStream.str();

  if (targetSize < query.size()) {
    // Query can't be the same length as in the other benchmarks.
    // Running this will result in unusable data.
    fprintf(stderr, "Too many tokens. Query too long for benchmark char limit (%zu > %zu).\n",
            query.size(), targetSize);
    // Google Benchmark expects a benchmark that returns without running its
    // loop to have reported an error.
    st.SkipWithError("Too many tokens for the benchmark char limit!");
    return;
  }

  // The filler replaces the single '*' at index 7, so one extra character is
  // needed for the final query to be exactly targetSize characters long. This
  // also guarantees a non-empty column name.
  const size_t pad = targetSize - query.size() + 1;
  const std::string filler(pad, 'a');
  query.replace(7, 1, filler);

  st.counters["num_tokens"] = getNumTokens(query);
  st.counters["num_chars"] = query.size();

  while (st.KeepRunning()) {
    hsql::SQLParserResult result;
    hsql::SQLParser::parse(query, &result);
    if (!result.isValid()) st.SkipWithError("Parsing failed!");
  }
}
// Register BM_ConditionalTokens. The first argument range (target query
// length, st.range(0)) is fixed at 1 << 14; the second (target token count,
// st.range(1)) runs from 1 << 2 to 1 << 11 with a range multiplier of 1 << 2.
BENCHMARK(BM_ConditionalTokens)
->RangeMultiplier(1 << 2)
->Ranges({{1 << 14, 1 << 14},
{1 << 2, 1 << 11}});

View File

@@ -0,0 +1,47 @@
#include "queries.h"
#include <filesystem>
#include <algorithm>
#include <iostream>
#include <regex>
#include "benchmark_utils.h"
namespace filesystem = std::filesystem;
// Returns the display name of sql_queries[i]: the stored name when it is not
// empty, otherwise "#<n>" where n is the 1-based position of the entry.
// `i` is not range-checked.
std::string getQueryName(unsigned i) {
  const std::string& label = sql_queries[i].first;
  return label.empty() ? "#" + std::to_string(i + 1) : label;
}
std::vector<SQLQuery> getQueriesFromDirectory(const std::string& dir_path) {
std::regex query_file_regex("\\.sql$");
std::vector<std::string> files;
for (auto& entry : filesystem::directory_iterator(dir_path)) {
if (filesystem::is_regular_file(entry)) {
std::string path_str = filesystem::path(entry);
if (std::regex_search(path_str, query_file_regex)) {
files.push_back(path_str);
}
}
}
std::sort(files.begin(), files.end());
std::vector<SQLQuery> queries;
for (const std::string& file_path : files) {
const filesystem::path p(file_path);
const std::string query = readFileContents(file_path);
queries.emplace_back(p.filename(), query);
}
return queries;
}
// Returns the queries found in the "test/queries/" directory. The path is
// relative, so it is resolved against the current working directory.
std::vector<SQLQuery> getTPCHQueries() {
  const std::string tpch_dir = "test/queries/";
  return getQueriesFromDirectory(tpch_dir);
}

View File

@@ -0,0 +1,56 @@
#ifndef __QUERIES_H__
#define __QUERIES_H__
#include <string>
#include <vector>
// A benchmark query: `first` is its name, `second` the SQL text.
using SQLQuery = std::pair<std::string, std::string>;
// Built-in benchmark queries as {name, query} pairs.
// An entry whose name is empty is labelled by its 1-based position (see
// getQueryName). Because the vector is declared `static` in a header, every
// translation unit that includes this header gets its own copy.
static std::vector<SQLQuery> sql_queries = {
{"Q1", "SELECT * FROM test;"},
{"Q2", "SELECT a, b AS address FROM (SELECT * FROM test WHERE c < 100 AND b > 3) t1 WHERE a < 10 AND b < 100;"},
{"Q3", "SELECT \"left\".a, \"left\".b, \"right\".a, \"right\".b FROM table_a AS \"left\" JOIN table_b AS \"right\" ON \"left\".a = \"right\".a;"},
// Multi-line query assembled from adjacent string literals; the '%s' markers
// are part of the query text as written here.
{"Q4", ""
"SELECT"
" l_orderkey,"
" SUM(l_extendedprice * (1 - l_discount)) AS revenue,"
" o_orderdate,"
" o_shippriority"
" FROM"
" customer,"
" orders,"
" lineitem"
" WHERE"
" c_mktsegment = '%s'"
" and c_custkey = o_custkey"
" and l_orderkey = o_orderkey"
" and o_orderdate < '%s'"
" and l_shipdate > '%s'"
" GROUP BY"
" l_orderkey,"
" o_orderdate,"
" o_shippriority"
" ORDER BY"
" revenue DESC,"
" o_orderdate;"
},
{"LongSelectList26", "SELECT a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z FROM test;"},
{"LongSelectElement26", "SELECT abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxy FROM test;"},
{"LongSelectList52", "SELECT a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z FROM test;"},
{"LongSelectElement52", "SELECT abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxy FROM test;"},
{"TwoSelects", "SELECT * FROM test; SELECT age, street AS address FROM data;"},
{"ExecuteNoParams", "EXECUTE procedure;"},
{"Execute2Params", "EXECUTE procedure(11, 'test');"},
{"Execute10Params", "EXECUTE procedure(11, 'test', 5.6, 4.2, 'abc', 6, 7, 8, 9, 10000);"},
// Template for adding a new entry:
// {"name", "query"},
};
// Returns the name of sql_queries[i], or "#<i + 1>" when that entry has no name.
std::string getQueryName(unsigned i);
// Returns one {file name, file contents} pair for every regular file in
// `dir_path` whose path ends in ".sql", sorted by path.
std::vector<SQLQuery> getQueriesFromDirectory(const std::string& dir_path);
// Returns the queries loaded from the "test/queries/" directory.
std::vector<SQLQuery> getTPCHQueries();
#endif