Source code for gpudb_sql_context

# ---------------------------------------------------------------------------
# gpudb_sql_context.py - Kinetica API class for creating and formatting 
# LLM SQL contexts.
#
# Copyright (c) 2024 Kinetica DB Inc.
# ---------------------------------------------------------------------------

import textwrap
from collections import OrderedDict
from typing import Optional, List, Tuple, Dict


class GPUdbBaseClause:
    """ Base class for SQL-GPT context clauses. """

    def __str__(self) -> str:
        return self.format_sql()

    def format_sql(self) -> str:
        """ Format the clause as a SQL string. """
        raise NotImplementedError("format_sql must be implemented by subclass.")


[docs] class GPUdbSamplesClause(GPUdbBaseClause): """ A :class:`GPUdbSamplesClause` represents the samples clause of a SQL-GPT context and can convert its contents to the SQL form expected by a ``CREATE CONTEXT`` statement. Example:: samples_ctx = GPUdbSamplesClause(samples = [ ("What are the shortest, average, and longest trip lengths for each taxi vendor?", \"\"\" SELECT th.vendor_id, MIN(th.trip_distance) AS shortest_trip_length, AVG(th.h.trip_distance) AS average_trip_length, MAX(th.trip_distance) AS longest_trip_length FROM sa_quickstart.taxi_data_historical AS th GROUP BY th.vendor_id; \"\"\"), ("How many trips did each taxi vendor make to JFK International Airport?", \"\"\" SELECT th.vendor_id, COUNT(*) AS trip_count FROM sa_quickstart.taxi_data_historical AS th JOIN sa_quickstart.nyct2020 AS n_dropoff ON KI_FN.STXY_WITHIN(th.dropoff_longitude, th.dropoff_latitude, n_dropoff.geom) AND n_dropoff.NTAName = 'John F. Kennedy International Airport' GROUP BY th.vendor_id; \"\"\"), ]) """ def __init__(self, samples: Optional[List[Tuple[str, str]]] = None) -> None: """ Parameters: samples (list[tuple[str,str]]): A list of tuples containing question/sql pairs. """ if (samples is None): samples = [] self.samples = samples
[docs] def format_sql(self) -> str: ctx_dict = { 'SAMPLES': _GPUdbSqlContextFormatter._quote_dict(OrderedDict(self.samples)) } return _GPUdbSqlContextFormatter._format_clause(ctx_dict)
[docs] class GPUdbTableClause(GPUdbBaseClause): """ A :class:`GPUdbTableClause` represents the table clause of a SQL-GPT context and can convert its contents to the SQL form expected by a ``CREATE CONTEXT`` statement. Example:: table_ctx = GPUdbTableClause( table = "sa_quickstart.nyct2020", comment = "This table contains spatial boundaries and attributes of the New York City.", col_comments = dict( gid = "This is the unique identifier for each record in the table.", geom = "The spatial boundary in WKT format of each NTA neighborhood.", BoroCode = "The code of the borough to which the neighborhood belongs to."), rules = ["Join this table using STXY_WITHIN() = 1", "Another rule here"]) """ def __init__(self, table: str, comment: Optional[str] = None, rules: Optional[List[str]] = None, col_comments: Optional[Dict[str, str]] = None) -> None: """ Parameters: table (str) Fully qualified table name (e.g. "schema.table") comment (str) Comment for the table. rules (list[str]) A list of rules that apply to the table. col_comments (dict[str,str]) A dictionary with mapping of column names to column comments. """ if (comment is None): comment = "" if (rules is None): rules = [] if (col_comments is None): col_comments = {} self.table = table self.comment = comment self.col_comments = col_comments self.rules = rules
[docs] def format_sql(self) -> str: ctx_dict = { 'TABLE': _GPUdbSqlContextFormatter._quote_sql_obj(self.table), 'COMMENT': _GPUdbSqlContextFormatter._quote_text(self.comment), 'RULES': _GPUdbSqlContextFormatter._quote_list(self.rules), 'COMMENTS': _GPUdbSqlContextFormatter._quote_dict(self.col_comments) } return _GPUdbSqlContextFormatter._format_clause(ctx_dict)
class _GPUdbSqlContextFormatter: """ Formatter to generate a SQL-GPT context from clauses. """ @classmethod def format_sql(cls, name: str, ctx_list: List[GPUdbBaseClause]) -> str: """ Format a list of SQL clauses as a ``CREATE CONTEXT`` statement. The result can be passed to :meth:`GPUDB.execute` to create the context. Args: name (str): Fully qualified name of the context. (e.g. "schema.context") ctx_list (list[GPUdbBaseClause]): List of SQL clauses to include in the context. Returns (str): The formatted ``CREATE CONTEXT`` statement. """ str_list = [ctx.format_sql() for ctx in ctx_list] sql_context = ",\n".join(str_list) name = cls._quote_sql_obj(name) return f"CREATE OR REPLACE CONTEXT {name} {sql_context}" @classmethod def _quote_sql_obj(cls, obj: str) -> str: parts = obj.split(".") parts = [f'"{p}"' for p in parts] return ".".join(parts) @classmethod def _quote_text(cls, text: str) -> str: if len(text) == 0: return "" text = text.replace("'", "''").strip() return f"'{text}'" @classmethod def _parens(cls, lines: List[str]) -> str: if len(lines) == 0: return "" params_str = ','.join(lines) return f"( {params_str} )" @classmethod def _quote_list(cls, items: List[str]) -> str: lines = [] for item in items: item_text = cls._quote_text(item) lines.append(f"\n {item_text}") return cls._parens(lines) @classmethod def _quote_dict(cls, params: Dict[str, str]) -> str: lines = [] for question, sql in params.items(): question = cls._quote_text(question) sql = textwrap.dedent(sql) sql = cls._quote_text(sql) lines.append(f"\n {question} = {sql}") return cls._parens(lines) @classmethod def _format_clause(cls, params: Dict[str, str]) -> str: lines = [] for param, val in params.items(): if len(val) == 0: continue lines.append(f" {param} = {val}") context = ',\n'.join(lines) return f"(\n{context}\n)"
[docs] class GPUdbSqlContext: """ Represents a collection of clauses that comprise a SQL-GPT context. Example:: sql_context = GPUdbSqlContext( name = "sa_quickstart.nyc_ctx", tables = [table_ctx], samples = samples_ctx) create_context_sql = sql_context.build_sql() db.execute(create_context_sql) """ def __init__(self, name: str, tables: List[GPUdbTableClause], samples: GPUdbSamplesClause) -> None: """ Args: name (str): Fully qualified name of the context. (e.g. "schema.context") tables (list[GPUdbTableClause]): A list of :class:`GPUdbTableClause` objects representing the Kinetica tables (and metadata) referenced in this context. samples (GPUdbSamplesClause): A :class:`GPUdbSamplesClause` object representing query/answer pairs referenced in this context. """ self.name = name self.samples = samples self.tables = tables def __repr__(self) -> str: return f"SqlContext(name={self.name})"
[docs] def build_sql(self) -> str: """ Format this :class:`GPUdbSqlContext` as a SQL ``CREATE CONTEXT`` statement. The result can be passed to :meth:`GPUDB.execute` to create the context. Returns (str): The SQL-formatted ``CREATE CONTEXT`` statement. """ ctx_list: List[GPUdbBaseClause] = [] if self.tables is not None: ctx_list.extend(self.tables) if self.samples is not None: ctx_list.append(self.samples) return _GPUdbSqlContextFormatter.format_sql(name=self.name, ctx_list=ctx_list)