Loading Data

Copy-paste examples of loading data with SQL
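These examples assume the target tables and data sources already exist. As a minimal sketch (the column names come from the examples below; the types and sizes are assumptions), the example.product table used throughout could be created like so:
CREATE TABLE example.product
(
      id INT NOT NULL,
      name VARCHAR(64),
      category VARCHAR(32),
      description VARCHAR(256),
      stock INT
)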

LOAD INTO

Single & Multiple KiFS Files

Single File
LOAD DATA INTO example.product
FROM FILE PATHS 'kifs://data/products.csv'
Multiple Files by List
LOAD DATA INTO example.product
FROM FILE PATHS 'kifs://data/products.csv', 'kifs://data/products.kh.csv'
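If the source files share a naming pattern, a wildcard may be usable in place of an explicit list; a sketch, assuming wildcard support in KiFS file paths:
LOAD DATA INTO example.product
FROM FILE PATHS 'kifs://data/products*.csv'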

Data Sources

File/Queue-Based

One-Time Load
LOAD DATA INTO example.product
FROM FILE PATHS 'products.csv'
WITH OPTIONS (DATA SOURCE = 'example.product_ds')
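The DATA SOURCE option above refers to a data source created beforehand. A minimal sketch of one backed by an Amazon S3 bucket (the bucket, region, and credential values are placeholders, and the option names are assumptions based on Kinetica's CREATE DATA SOURCE syntax):
CREATE DATA SOURCE example.product_ds
LOCATION = 'S3'
USER = '<aws access key id>'
PASSWORD = '<aws secret access key>'
WITH OPTIONS
(
      BUCKET NAME = 'example-data-bucket',
      REGION = 'us-east-1'
)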
Change Data Capture
-- Load files in the orders directory with timestamps newer than the last poll;
--   poll every 5 minutes
LOAD DATA INTO example.orders
FROM FILE PATHS 'orders/'
WITH OPTIONS
(
      DATA SOURCE = 'example.order_ds',
      SUBSCRIBE = TRUE,
      POLL_INTERVAL = 300
)
Streaming
LOAD DATA INTO example.orders
FORMAT JSON
WITH OPTIONS (DATA SOURCE = 'kafka_ds', SUBSCRIBE = TRUE)
Streaming w/ Schema Registry
LOAD DATA INTO example.orders
FORMAT AVRO
WITH OPTIONS
(
      DATA SOURCE = 'kafka_ds',
      SCHEMA_REGISTRY_SCHEMA_NAME = 'order_schema',
      SUBSCRIBE = TRUE
)
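The streaming examples above reference a Kafka data source named kafka_ds. A minimal sketch of creating it (the broker address and topic are placeholders; KAFKA_TOPIC_NAME is assumed to be the relevant option):
CREATE DATA SOURCE kafka_ds
LOCATION = 'kafka://broker.example.com:9092'
WITH OPTIONS
(
      KAFKA_TOPIC_NAME = 'orders'
)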

Query-Based

One-Time Load
LOAD INTO example.employee_dept2
FROM REMOTE QUERY 'SELECT * FROM example.employee WHERE dept_id = 2'
WITH OPTIONS (DATA SOURCE = 'example.jdbc_ds')
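The remote query examples rely on a JDBC data source named example.jdbc_ds. A minimal sketch of creating one against a PostgreSQL database (the connection URL and credentials are placeholders):
CREATE DATA SOURCE example.jdbc_ds
LOCATION = 'jdbc:postgresql://db.example.com:5432/example'
USER = 'jdbc_user'
PASSWORD = 'jdbc_password'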
Change Data Capture
-- Continuously load new orders for product 42 into a table;
--   order_id is an ever-increasing ID assigned to each new order
LOAD INTO example.order_product42
FROM REMOTE QUERY 'SELECT * FROM example.orders WHERE product_id = 42'
WITH OPTIONS
(
      DATA SOURCE = 'example.jdbc_ds',
      SUBSCRIBE = TRUE,
      REMOTE_QUERY_INCREASING_COLUMN = 'order_id'
)
Init Options
LOAD INTO example.ts
FROM REMOTE QUERY 'SELECT ts FROM example.ts_source'
WITH OPTIONS
(
      DATA SOURCE = 'example.jdbc_ds',
      JDBC_SESSION_INIT_STATEMENT = 'SET TIME ZONE ''EST'''
)
Sizing Options
LOAD INTO example.employee_dept2
FROM REMOTE QUERY 'SELECT * FROM example.employee WHERE dept_id = 2'
WITH OPTIONS
(
      DATA SOURCE = 'example.jdbc_ds',
      BATCH SIZE = 10000,
      JDBC_FETCH_SIZE = 10000
)

File Types

Delimited Text Files

CSV
LOAD DATA INTO example.product
FROM FILE PATHS 'kifs://data/products.csv'
Forced Format
LOAD DATA INTO example.product
FROM FILE PATHS 'kifs://data/products.dat'
FORMAT TEXT
No Header
LOAD DATA INTO example.product
FROM FILE PATHS 'kifs://data/products.nh.csv'
FORMAT TEXT (INCLUDES HEADER = FALSE)
Columns Mapped by Name
LOAD DATA INTO example.product_name_stock
FROM FILE PATHS 'kifs://data/products.title-case.csv'
WITH OPTIONS (FIELDS MAPPED BY NAME(ID, Name, Stock))
Columns Ignored by Position
LOAD DATA INTO example.product_name_stock
FROM FILE PATHS 'kifs://data/products.title-case.csv'
WITH OPTIONS (FIELDS IGNORED BY POSITION(2, 4))
Text Options
LOAD DATA INTO example.product
FROM FILE PATHS 'kifs://data/products.ssv'
FORMAT TEXT
(
      COMMENT = '--',
      DELIMITER = ';',
      ESCAPE = '`',
      INCLUDES HEADER = TRUE,
      NULL = '<null>',
      QUOTE = ''''
)

JSON/GeoJSON Files

JSON File
LOAD DATA INTO example.product
FROM FILE PATHS 'kifs://data/products.json'
GeoJSON File
LOAD DATA INTO example.events
FROM FILE PATHS 'kifs://data/events.json'
Forced Format
LOAD DATA INTO example.product
FROM FILE PATHS 'kifs://data/products.js'
FORMAT JSON
Properties Mapped by Name
LOAD DATA INTO example.product_name_stock
FROM FILE PATHS 'kifs://data/products.json'
WITH OPTIONS (FIELDS MAPPED BY NAME(id, name, stock))
Properties Ignored by Name
LOAD DATA INTO example.product_name_stock
FROM FILE PATHS 'kifs://data/products.json'
WITH OPTIONS (FIELDS IGNORED BY NAME(category, description))

Parquet Files

Implicit Format
LOAD DATA INTO example.ext_employee
FROM FILE PATHS 'kifs://data/employee.parquet'
Forced Format
CREATE EXTERNAL TABLE example.ext_employee
FILE PATHS 'kifs://data/employee.pq'
FORMAT PARQUET
Properties Mapped by Name
LOAD DATA INTO example.product_name_stock
FROM FILE PATHS 'kifs://data/products.parquet'
WITH OPTIONS (FIELDS MAPPED BY NAME(id, name, stock))
Properties Ignored by Name
LOAD DATA INTO example.product_name_stock
FROM FILE PATHS 'kifs://data/products.parquet'
WITH OPTIONS (FIELDS IGNORED BY NAME(category, description))

Shapefiles

Implicit Format
LOAD DATA INTO example.events
FROM FILE PATHS 'shapefile/events.shp'
WITH OPTIONS (DATA SOURCE = 'example.events_ds')
Forced Format
LOAD DATA INTO example.events
FROM FILE PATHS 'kifs://data/events.shp'
FORMAT SHAPEFILE
Properties Mapped by Name
LOAD DATA INTO example.events
FROM FILE PATHS 'kifs://data/events.shp'
FORMAT SHAPEFILE
WITH OPTIONS (FIELDS MAPPED BY NAME(ID, kiGeometry))
Properties Ignored by Name
LOAD DATA INTO example.events
FROM FILE PATHS 'kifs://data/events.shp'
FORMAT SHAPEFILE
WITH OPTIONS (FIELDS IGNORED BY NAME(type))

Avro Files

Embedded Schema
LOAD DATA INTO example.orders
FORMAT AVRO
WITH OPTIONS
(
      DATA SOURCE = 'kafka_ds',
      SUBSCRIBE = TRUE
)
Schema Registry Lookup
LOAD DATA INTO example.orders
FORMAT AVRO
WITH OPTIONS
(
      DATA SOURCE = 'kafka_ds',
      SCHEMA_REGISTRY_SCHEMA_NAME = 'order_schema',
      SUBSCRIBE = TRUE
)

Primary/Shard Keys

Primary Key
LOAD INTO example.employee
FROM FILE PATHS 'kifs://data/employee.parquet'
WITH OPTIONS (PRIMARY KEY = (id))
Primary & Shard Key
LOAD INTO example.employee
FROM FILE PATHS 'kifs://data/employee.csv'
WITH OPTIONS
(
      PRIMARY KEY = (id, dept_id),
      SHARD KEY = (id)
)

Query Partitioning Options

Partition Column
LOAD INTO example.employee_local
FROM REMOTE QUERY 'SELECT * FROM example.employee'
WITH OPTIONS
(
      DATA SOURCE = 'example.jdbc_ds',
      REMOTE_QUERY_PARTITION_COLUMN = 'id'
)
Ordered Blocks
LOAD INTO example.employee_local
FROM REMOTE QUERY 'SELECT * FROM example.employee'
WITH OPTIONS
(
      DATA SOURCE = 'example.jdbc_ds',
      REMOTE_QUERY_ORDER_BY = 'hire_date, dept_id'
)
Disable Partitioning
LOAD INTO example.employee_local
FROM REMOTE QUERY 'SELECT * FROM example.employee'
WITH OPTIONS
(
      DATA SOURCE = 'example.jdbc_ds',
      REMOTE_QUERY_NO_SPLIT = TRUE
)
Rank/Split Processing
LOAD INTO example.employee_local
FROM REMOTE QUERY 'SELECT * FROM example.employee'
WITH OPTIONS
(
      DATA SOURCE = 'example.jdbc_ds',
      NUM_SPLITS_PER_RANK = 16,
      NUM_TASKS_PER_RANK = 4
)

Other Options

Column Formats
LOAD INTO example.employee
FROM FILE PATHS 'kifs://data/employee.csv'
WITH OPTIONS
(
      COLUMN FORMATS = '
      {
            "hire_date": {"date": "YYYY-MM-DD"}
      }'
)
Table Options
LOAD DATA INTO example.product_create_options
FROM FILE PATHS 'kifs://data/products.csv'
USING TABLE PROPERTIES (CHUNK SIZE = 1000000, TTL = 5)
Load Options
LOAD DATA INTO example.product_load_options
FROM FILE PATHS 'kifs://data/products.csv'
WITH OPTIONS (BATCH SIZE = 20000)

External Tables

Refresh on Start
CREATE EXTERNAL TABLE example.ext_product
FILE PATHS 'products.csv'
WITH OPTIONS (DATA SOURCE = 'example.product_ds', REFRESH ON START = TRUE)
File Change Data Capture
CREATE EXTERNAL TABLE example.ext_order
FILE PATHS 'orders/'
WITH OPTIONS (DATA SOURCE = 'example.order_ds', SUBSCRIBE = TRUE)
JDBC Data Source Remote Query
CREATE EXTERNAL TABLE example.ext_employee_dept2
REMOTE QUERY 'SELECT * FROM example.employee WHERE dept_id = 2'
WITH OPTIONS (DATA SOURCE = 'example.jdbc_ds')
Query Change Data Capture
-- Continuously load new orders for product 42 into a table;
--   order_id is an ever-increasing ID assigned to each new order
CREATE EXTERNAL TABLE example.ext_order_product42
REMOTE QUERY 'SELECT * FROM example.orders WHERE product_id = 42'
WITH OPTIONS
(
      DATA SOURCE = 'example.jdbc_ds',
      SUBSCRIBE = TRUE,
      REMOTE_QUERY_INCREASING_COLUMN = 'order_id'
)
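Once created, an external table is queried like any other table; for example:
SELECT *
FROM example.ext_order_product42
ORDER BY order_id DESC
LIMIT 10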