fix(SQLParser): move to Data dir; add extradirs, remove vs 140,150 build scripts generation

2025-10-24 09:12:28 +02:00 · 2024-02-13 14:04:23 +01:00
parent 3261ebbd42
commit ee39b611f2
167 changed files with 780 additions and 792 deletions
--- a/Data/SQLParser/benchmark/README.md
+++ b/Data/SQLParser/benchmark/README.md
@@ -0,0 +1,14 @@
+# Benchmark
+
+This directory contains the scripts to execute benchmarks of the parser. We use [Google Benchmark](https://github.com/google/benchmark) to define and run benchmarks.
+
+## Install Google Benchmark
+
+```bash
+cmake -DCMAKE_BUILD_TYPE=Release .
+
+make
+
+make install
+```
+
--- a/Data/SQLParser/benchmark/benchmark.cpp
+++ b/Data/SQLParser/benchmark/benchmark.cpp
@@ -0,0 +1,28 @@
+#include "benchmark/benchmark.h"
+
+#include "benchmark_utils.h"
+#include "queries.h"
+
+int main(int argc, char** argv) {
+  // Registers "<name>-parse" and "<name>-tokenize" benchmarks that both run on `sql`.
+  const auto registerPair = [](const std::string& name, const std::string& sql) {
+    const std::string parse_name = name + "-parse";
+    benchmark::RegisterBenchmark(parse_name.c_str(), &BM_ParseBenchmark, sql);
+    const std::string tokenize_name = name + "-tokenize";
+    benchmark::RegisterBenchmark(tokenize_name.c_str(), &BM_TokenizeBenchmark, sql);
+  };
+
+  // Queries returned by getTPCHQueries(), registered under their own names.
+  const auto tpch_queries = getTPCHQueries();
+  for (const auto& entry : tpch_queries) {
+    registerPair(entry.first, entry.second);
+  }
+
+  // Every entry of the sql_queries array, named via getQueryName().
+  for (unsigned idx = 0; idx < sql_queries.size(); ++idx) {
+    registerPair(getQueryName(idx), sql_queries[idx].second);
+  }
+
+  benchmark::Initialize(&argc, argv);
+  benchmark::RunSpecifiedBenchmarks();
+}
--- a/Data/SQLParser/benchmark/benchmark_utils.cpp
+++ b/Data/SQLParser/benchmark/benchmark_utils.cpp
@@ -0,0 +1,44 @@
+#include "benchmark_utils.h"
+
+#include <fstream>
+#include <iostream>
+
+#include "SQLParser.h"
+
+size_t getNumTokens(const std::string& query) {
+  std::vector<int16_t> token_ids;  // handed to the tokenizer; only its final size is reported
+  hsql::SQLParser::tokenize(query, &token_ids);
+  return token_ids.size();
+}
+
+void BM_TokenizeBenchmark(benchmark::State& st, const std::string& query) {
+  st.counters["num_chars"] = query.size();
+  st.counters["num_tokens"] = getNumTokens(query);
+  // Timed section: tokenize `query` into a freshly constructed vector on every iteration.
+  while (st.KeepRunning()) {
+    std::vector<int16_t> token_buffer(512);  // NOTE(review): 512 zero elements, not a reserve()
+    hsql::SQLParser::tokenize(query, &token_buffer);
+  }
+}
+
+void BM_ParseBenchmark(benchmark::State& st, const std::string& query) {
+  st.counters["num_chars"] = query.size();
+  st.counters["num_tokens"] = getNumTokens(query);
+  // Timed section: one full parse per iteration; an invalid result is printed and reported.
+  while (st.KeepRunning()) {
+    hsql::SQLParserResult parsed;
+    hsql::SQLParser::parse(query, &parsed);
+    if (parsed.isValid()) continue;
+
+    std::cout << query << std::endl;
+    std::cout << parsed.errorMsg() << std::endl;
+    st.SkipWithError("Parsing failed!");
+  }
+}
+
+std::string readFileContents(const std::string& file_path) {
+  // Reads the stream to its end; a file that cannot be opened yields an empty string.
+  std::ifstream input(file_path.c_str());
+  using CharIter = std::istreambuf_iterator<char>;
+  return std::string(CharIter(input), CharIter());
+}
--- a/Data/SQLParser/benchmark/benchmark_utils.h
+++ b/Data/SQLParser/benchmark/benchmark_utils.h
@@ -0,0 +1,41 @@
+#ifndef __BENCHMARK_UTILS_H__
+#define __BENCHMARK_UTILS_H__
+
+#include "benchmark/benchmark.h"
+
+// Returns the size of the token vector after hsql::SQLParser::tokenize() ran on `query`.
+size_t getNumTokens(const std::string& query);
+
+// Benchmark bodies: publish "num_tokens"/"num_chars" counters for `query`, then run the
+// tokenizer (resp. the full parser) on it once per benchmark iteration.
+void BM_TokenizeBenchmark(benchmark::State& st, const std::string& query);
+void BM_ParseBenchmark(benchmark::State& st, const std::string& query);
+
+// Returns the complete contents of the file at `file_path` as one string.
+std::string readFileContents(const std::string& file_path);
+
+// Timing helpers. NOTE(review): both expand with a trailing ';', so they cannot be nested in an expression.
+#define TIME_DIFF(end, start)\
+  std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+#define NOW()\
+  std::chrono::high_resolution_clock::now();
+
+// Each macro defines a static benchmark function `name` that runs `query` and registers it via BENCHMARK().
+#define PARSE_QUERY_BENCHMARK(name, query)\
+  static void name(benchmark::State& st) {\
+    BM_ParseBenchmark(st, query);\
+  }\
+  BENCHMARK(name);
+
+#define TOKENIZE_QUERY_BENCHMARK(name, query)\
+  static void name(benchmark::State& st) {\
+    BM_TokenizeBenchmark(st, query);\
+  }\
+  BENCHMARK(name);
+
+// Defines and registers both the <test_name>Tokenize and <test_name>Parse benchmarks for `query`.
+#define BENCHMARK_QUERY(test_name, query)\
+  TOKENIZE_QUERY_BENCHMARK(test_name##Tokenize, query)\
+  PARSE_QUERY_BENCHMARK(test_name##Parse, query)
+
+#endif
--- a/Data/SQLParser/benchmark/parser_benchmark.cpp
+++ b/Data/SQLParser/benchmark/parser_benchmark.cpp
@@ -0,0 +1,87 @@
+
+#include <chrono>
+#include <sstream>
+#include "benchmark/benchmark.h"
+
+#include "SQLParser.h"
+#include "parser/bison_parser.h"
+#include "parser/flex_lexer.h"
+
+#include "benchmark_utils.h"
+
+// Benchmark the influence of increasing size of the query, while
+// the number of tokens remains unchanged.
+static void BM_CharacterCount(benchmark::State& st) {
+  const size_t querySize = st.range(0);
+
+  // The template is 18 characters long once the 6-character "%name%" placeholder is removed.
+  std::string query = "SELECT %name% FROM test;";
+
+  // size_t rather than the non-standard `uint` typedef, which not every toolchain declares.
+  const size_t pad = querySize - 18;
+  // Swap the placeholder (offset 7, length 6) for an identifier made of `pad` 'a' characters.
+  query.replace(7, 6, std::string(pad, 'a'));
+  st.counters["num_tokens"] = getNumTokens(query);
+  st.counters["num_chars"] = query.size();
+  while (st.KeepRunning()) {
+    hsql::SQLParserResult result;
+    hsql::SQLParser::parse(query, &result);
+  }
+}
+BENCHMARK(BM_CharacterCount)
+  ->RangeMultiplier(1 << 2)
+  ->Ranges({{1 << 5, 1 << 15},  // range(0): query length in characters
+            {5, 5}});           // range(1): fixed at 5, not read by BM_CharacterCount
+
+// Benchmark the influence of increasing number of tokens, while
+// the number of characters remains unchanged.
+static void BM_ConditionalTokens(benchmark::State& st) {
+  // range(0): target query length in characters; range(1): requested number of tokens.
+  const size_t targetSize = st.range(0);
+  const size_t numTokens = st.range(1);
+
+  // The base query is accounted for as 4 tokens below; the rest comes from the condition.
+  std::string query = "SELECT * FROM test";
+
+  // Every clause (" WHERE a", then " AND a") contributes two tokens. Deriving the clause
+  // count up front avoids the size_t wrap-around (and with it a practically endless loop)
+  // that repeated "-= 2" causes when numTokens is below 4 or the remainder is odd.
+  const size_t missingTokens = numTokens > 4 ? numTokens - 4 : 0;
+  const size_t numClauses = (missingTokens + 1) / 2;
+  std::stringstream condStream;
+  for (size_t clause = 0; clause < numClauses; ++clause) {
+    condStream << (clause == 0 ? " WHERE a" : " AND a");
+  }
+  query += condStream.str();
+
+  if (targetSize < query.size()) {
+    // Query can't be the same length as in the other benchmarks.
+    // Running this will result in unusable data.
+    fprintf(stderr, "Too many tokens. Query too long for benchmark char limit (%zu > %zu).\n",
+      query.size(), targetSize);
+    // Flag the run as skipped: a benchmark function must not simply return without either
+    // finishing the KeepRunning() loop or reporting an error to the framework.
+    st.SkipWithError("Query too long for benchmark char limit!");
+    return;
+  }
+
+  // Stretch the query by replacing '*' (offset 7, length 1) with an identifier of 'a's.
+  const size_t pad = targetSize - query.size();
+  query.replace(7, 1, std::string(pad, 'a'));
+
+  st.counters["num_tokens"] = getNumTokens(query);
+  st.counters["num_chars"] = query.size();
+  // Timed section: one full parse per iteration.
+  while (st.KeepRunning()) {
+    hsql::SQLParserResult result;
+    hsql::SQLParser::parse(query, &result);
+    if (!result.isValid()) st.SkipWithError("Parsing failed!");
+  }
+}
+BENCHMARK(BM_ConditionalTokens)
+  ->RangeMultiplier(1 << 2)
+  ->Ranges({{1 << 14, 1 << 14},   // range(0): target query length in characters
+            {1 << 2, 1 << 11}});  // range(1): number of tokens
+
+
+
--- a/Data/SQLParser/benchmark/queries.cpp
+++ b/Data/SQLParser/benchmark/queries.cpp
@@ -0,0 +1,47 @@
+#include "queries.h"
+
+#include <filesystem>
+#include <algorithm>
+#include <iostream>
+#include <regex>
+
+#include "benchmark_utils.h"
+
+namespace filesystem = std::filesystem;
+
+std::string getQueryName(unsigned i) {
+  // Entries without a name are labelled by their 1-based position, e.g. "#3".
+  const std::string& label = sql_queries[i].first;
+  if (!label.empty()) {
+    return label;
+  }
+  return "#" + std::to_string(i + 1);
+}
+
+std::vector<SQLQuery> getQueriesFromDirectory(const std::string& dir_path) {
+  // Collects every regular file in dir_path whose path ends in ".sql", sorted by path,
+  // and returns one (file name, file contents) pair per file.
+  const std::regex query_file_regex("\\.sql$");
+  std::vector<std::string> files;
+
+  for (const auto& entry : filesystem::directory_iterator(dir_path)) {
+    if (!filesystem::is_regular_file(entry)) continue;
+    // Use path::string() rather than the implicit conversion: the latter only yields a
+    // std::string where the native path character type is char (so not on Windows).
+    const std::string path_str = entry.path().string();
+    if (std::regex_search(path_str, query_file_regex)) files.push_back(path_str);
+  }
+
+  std::sort(files.begin(), files.end());
+
+  std::vector<SQLQuery> queries;
+  for (const std::string& file_path : files) {
+    // Name each query after its file name, again converted explicitly.
+    const std::string file_name = filesystem::path(file_path).filename().string();
+    queries.emplace_back(file_name, readFileContents(file_path));
+  }
+  return queries;
+}
+
+std::vector<SQLQuery> getTPCHQueries() {
+  return getQueriesFromDirectory("test/queries/");  // relative path: resolved against the working directory
+}
--- a/Data/SQLParser/benchmark/queries.h
+++ b/Data/SQLParser/benchmark/queries.h
@@ -0,0 +1,56 @@
+#ifndef __QUERIES_H__
+#define __QUERIES_H__
+
+#include <string>
+#include <vector>
+
+using SQLQuery = std::pair<std::string, std::string>;  // first: name, second: SQL text
+
+// Built-in benchmark queries as {name, query} pairs; `static`, so each including file gets its own copy.
+static std::vector<SQLQuery> sql_queries = {
+  {"Q1", "SELECT * FROM test;"},
+  {"Q2", "SELECT a, b AS address FROM (SELECT * FROM test WHERE c < 100 AND b > 3) t1 WHERE a < 10 AND b < 100;"},
+  {"Q3", "SELECT \"left\".a, \"left\".b, \"right\".a, \"right\".b FROM table_a AS \"left\" JOIN table_b AS \"right\" ON \"left\".a = \"right\".a;"},
+  {"Q4", ""  // the literals below are concatenated as written; no separators are inserted
+"SELECT"
+"    l_orderkey,"
+"    SUM(l_extendedprice * (1 - l_discount)) AS revenue,"
+"    o_orderdate,"
+"    o_shippriority"
+"  FROM"
+"    customer,"
+"    orders,"
+"    lineitem"
+"  WHERE"
+"    c_mktsegment = '%s'"
+"    and c_custkey = o_custkey"
+"    and l_orderkey = o_orderkey"
+"    and o_orderdate < '%s'"
+"    and l_shipdate > '%s'"
+"  GROUP BY"
+"    l_orderkey,"
+"    o_orderdate,"
+"    o_shippriority"
+"  ORDER BY"
+"    revenue DESC,"
+"    o_orderdate;"
+},
+  // Synthetic queries: long select lists and identifiers, two statements, EXECUTE calls.
+  {"LongSelectList26",    "SELECT a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z FROM test;"},
+  {"LongSelectElement26", "SELECT abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxy FROM test;"},
+  {"LongSelectList52",    "SELECT a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z FROM test;"},
+  {"LongSelectElement52", "SELECT abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxy FROM test;"},
+  {"TwoSelects",          "SELECT * FROM test; SELECT age, street AS address FROM data;"},
+  {"ExecuteNoParams",     "EXECUTE procedure;"},
+  {"Execute2Params",      "EXECUTE procedure(11, 'test');"},
+  {"Execute10Params",     "EXECUTE procedure(11, 'test', 5.6, 4.2, 'abc', 6, 7, 8, 9, 10000);"},
+  // Template for further entries: {"name", "query"},
+};
+
+// Name of sql_queries[i]; entries with an empty name are reported as "#<i + 1>".
+std::string getQueryName(unsigned i);
+// One (file name, contents) query per regular file in dir_path whose path ends in ".sql", sorted by path.
+std::vector<SQLQuery> getQueriesFromDirectory(const std::string& dir_path);
+// Queries loaded from the relative directory "test/queries/".
+std::vector<SQLQuery> getTPCHQueries();
+#endif