GPUdb C++ API  Version 6.2.0.3
GPUdbMultiHeadIOUtils.h
Go to the documentation of this file.
1 #ifndef __GPUDB_MULTIHEAD_IO_UTILS_H__
2 #define __GPUDB_MULTIHEAD_IO_UTILS_H__
3 
4 #include "gpudb/GPUdb.hpp"
5 #include "gpudb/Http.hpp"
6 #include "gpudb/Type.hpp"
7 
8 #include <regex>
9 
10 
11 
12 namespace gpudb
13 {
14 
15 // Forward declaration
16 class GPUdb;
17 
18 
19 /*
20  * A list of worker URLs to use for multi-head ingest.
21  */
23 {
24 private:
25 
26  typedef std::vector<gpudb::HttpUrl> worker_list;
27 
28 
29 public:
30 
31  /* <summary>Creates a <see cref="WorkerList"/> object and automatically populates it with the
32  * worker URLs from GPUdb to support multi-head ingest. ( If the
33  * specified GPUdb instance has multi-head ingest disabled, the worker
34  * list will be empty and multi-head ingest will not be used.) Note that
35  * in some cases, workers may be configured to use more than one IP
36  * address, not all of which may be accessible to the client; this
37  * constructor uses the first IP returned by the server for each worker.
38  *
39  * If multi-head ingestion is turned off, then returns the server's head
40  * node address.
41  * </summary>
42  *
43  * <param name="gpudb">The <see cref="GPUdb"/> instance from which to
44  * obtain the worker URLs.</param>
45  */
46  WorkerList( const GPUdb &gpudb );
47 
48  /* <summary>Creates a <see cref="WorkerList"/> object and automatically populates it with the
49  * worker URLs from GPUdb to support multi-head ingest. ( If the
50  * specified GPUdb instance has multi-head ingest disabled, the worker
51  * list will be empty and multi-head ingest will not be used.) Note that
52  * in some cases, workers may be configured to use more than one IP
53  * address, not all of which may be accessible to the client; this
54  * constructor uses the provided regular expression to match the workers in each
55  * group, and only uses matching workers, if any.
56  *
57  * If multi-head ingestion is turned off, then returns the server's head
58  * node address.
59  * </summary>
60  *
61  * <param name="gpudb">The <see cref="GPUdb"/> instance from which to
62  * obtain the worker URLs.</param>
63  * <param name="ip_regex_str">A regular expression pattern for the IPs to match.</param>
64  */
65  WorkerList( const GPUdb &gpudb, const std::string& ip_regex_str );
66 
67 // ~WorkerList();
68 
69 
70  // Return the size of this WorkerList
71  size_t size() const { return m_worker_urls.size(); }
72 
73  // Iterator related stuff
74  typedef worker_list::const_iterator const_iterator;
75  const_iterator begin() const { return m_worker_urls.begin(); }
76  const_iterator end() const { return m_worker_urls.end(); }
77 
78  // Returns whether this WorkerList is empty
79  bool empty() const { return m_worker_urls.empty(); }
80 
81  // Returns a string representation of the workers contained within
82  std::string toString() const;
83 
84 
85 private:
86 
87  worker_list m_worker_urls;
88 
89  static void split_string( const std::string &in_string,
90  char delim,
91  std::vector<std::string> &elements );
92 }; // end class WorkerList
93 
94 
95 
96 /*
97  * A key based on a given record that serves as either a primary key
98  * or a shard key. The <see cref="RecordKeyBuilder"/> class creates
99  * these record keys.
100  */
102 {
103 public:
104 
105  RecordKey();
106  RecordKey( size_t buffer_size );
107  RecordKey( const RecordKey &other );
108  ~RecordKey();
109 
110  // Returns whether the key is valid at the moment
111  bool is_valid() const { return m_is_valid; }
112 
113  // Return the key's hash code
114  int32_t get_hash_code() const { return m_hash_code; }
115 
116  // Resets the key to be an empty one with the new buffer size
117  void reset( size_t buffer_size );
118 
119 
120  // Adds a char1 to the buffer
121  void add_char1( const std::string& value, bool is_null );
122 
123  // Adds a char2 to the buffer
124  void add_char2( const std::string& value, bool is_null );
125 
126  // Adds a char4 to the buffer
127  void add_char4( const std::string& value, bool is_null );
128 
129  // Adds a char8 to the buffer
130  void add_char8( const std::string& value, bool is_null );
131 
132  // Adds a char16 to the buffer
133  void add_char16( const std::string& value, bool is_null );
134 
135  // Adds a char32 to the buffer
136  void add_char32( const std::string& value, bool is_null );
137 
138  // Adds a char64 to the buffer
139  void add_char64( const std::string& value, bool is_null );
140 
141  // Adds a char128 to the buffer
142  void add_char128( const std::string& value, bool is_null );
143 
144  // Adds a char256 to the buffer
145  void add_char256( const std::string& value, bool is_null );
146 
147  // Adds a date to the buffer
148  void add_date( const std::string& value, bool is_null );
149 
150  // Adds a datetime to the buffer
151  void add_datetime( const std::string& value, bool is_null );
152 
153  // Adds a decimal to the buffer
154  void add_decimal( const std::string& value, bool is_null );
155 
156  // Adds a double to the buffer
157  void add_double( double value, bool is_null );
158 
159  // Adds a float to the buffer
160  void add_float( float value, bool is_null );
161 
162  // Adds an int8 to the buffer
163  void add_int8( int8_t value, bool is_null );
164 
165  // Adds an int16 to the buffer
166  void add_int16( int16_t value, bool is_null );
167 
168  // Adds an integer to the buffer
169  void add_int( int32_t value, bool is_null );
170 
171  // Adds an IPv4 address to the buffer
172  void add_ipv4( const std::string& value, bool is_null );
173 
174  // Adds a long to the buffer
175  void add_long( int64_t value, bool is_null );
176 
177  // Adds a time to the buffer
178  void add_time( const std::string& value, bool is_null );
179 
180  // Adds a timestamp (long) to the buffer
181  void add_timestamp( int64_t value, bool is_null );
182 
183  // Adds (the hash value of) a string to the buffer
184  void add_string( const std::string& value, bool is_null );
185 
186 
188  void compute_hash();
189 
192  size_t route( const std::vector<int32_t>& routing_table ) const;
193 
195  RecordKey& operator=(const RecordKey& other);
196 
198  bool operator==(const RecordKey& rhs) const;
199  bool operator!=(const RecordKey& rhs) const { return !(*this == rhs); }
200 
202  bool operator<(const RecordKey& rhs) const;
203  bool operator>(const RecordKey& rhs) const { return ( !(*this < rhs)
204  && !(*this == rhs) ); }
205 
206  std::string toString( const std::string& separator = " " ) const;
207 private:
208 
209  // Copy contents of another key into this one
210  void copy( const RecordKey& other );
211 
212  // Returns whether the buffer is full or not
213  bool is_buffer_full( bool throw_if_full = true ) const;
214 
215  // Check whether the buffer will overflow if we attempt to add n more bytes
216  bool will_buffer_overflow( int n, bool throw_if_overflow = true ) const;
217 
218  // Adds a single byte to the buffer; does the accounting, too
219  void add( uint8_t b );
220 
221  std::vector<unsigned char> m_buffer;
222  size_t m_buffer_size;
223  size_t m_current_size;
224  int32_t m_hash_code;
225  int64_t m_routing_hash;
226  bool m_is_valid;
227  bool m_key_is_complete;
228 
229 }; // end class RecordKey
230 
231 
232 
234 {
235 private:
236 
237  enum ColumnType_T
238  {
239  CHAR1,
240  CHAR2,
241  CHAR4,
242  CHAR8,
243  CHAR16,
244  CHAR32,
245  CHAR64,
246  CHAR128,
247  CHAR256,
248  DATE,
249  DATETIME,
250  DECIMAL,
251  DOUBLE,
252  FLOAT,
253  INT,
254  INT8,
255  INT16,
256  IPV4,
257  LONG,
258  STRING,
259  TIME,
260  TIMESTAMP
261  };
262 
263  // Some typedefs for nullable types
264  typedef boost::optional<int32_t> nullableInt;
265 
266  gpudb::Type m_record_type;
267  std::vector<int32_t> m_pk_shard_key_indices;
268  std::vector<ColumnType_T> m_column_types;
269  size_t m_key_buffer_size;
270 
271  RecordKeyBuilder() : m_record_type( gpudb::Type( "empty_type" ) ) {}
272 
273 public:
274 
275  // Constructs a RecordKey builder
276  RecordKeyBuilder( bool is_primary_key, const gpudb::Type& record_type );
277 
278  // Build a RecordKey object based on a generic record
279  bool build( const gpudb::GenericRecord& record, RecordKey& record_key ) const;
280 
281  /*
282  * Build a key-lookup expression based on a generic record.
283  *
284  * Returns true if expression building succeeded, false otherwise.
285  */
286  bool buildExpression( const gpudb::GenericRecord& record,
287  std::string& result ) const;
288 
289  // Returns whether this builder builds any routing keys. That is,
290  // if there are any routing columns in the relevant record type
291  bool has_key() const { return !m_pk_shard_key_indices.empty(); }
292 
293  // Returns true if the other RecordKeyBuilder is equivalent to this builder
294  bool operator==(const RecordKeyBuilder& rhs) const;
295 
296  bool operator!=(const RecordKeyBuilder& rhs) const { return !(*this == rhs); }
297 }; // end class RecordKeyBuilder
298 
299 
300 
301 /*
302  * The WorkerQueue class maintains queues of records to be inserted
303  * into GPUdb. It is templated on the type of the record that is to
304  * be ingested into the DB server.
305  */
307 {
308 public:
309 
310  // Vector type used to move batches of records around
311  typedef std::vector<gpudb::GenericRecord> recordVector_T;
312 
313 private:
314 
315  gpudb::HttpUrl m_url;
316  size_t m_capacity;
317  bool m_has_primary_key;
318  bool m_update_on_existing_pk;
319  recordVector_T m_queue;
320 
322 // typedef std::map<const RecordKey&, size_t> primary_key_map_t;
323  typedef std::map<RecordKey, size_t> primary_key_map_t;
324  primary_key_map_t m_primary_key_map;
325 
326  WorkerQueue();
327 
328 public:
329 
330  // Takes a string for the url; capacity 1, no PK, no update on existing PK
331  // (this is used by the RecordRetriever class which doesn't care about
332  // the other arguments)
333  WorkerQueue( const std::string& url );
334 
335  // Takes a string for the url
336  WorkerQueue( const std::string& url, size_t capacity, bool has_primary_key,
337  bool update_on_existing_pk );
338  ~WorkerQueue();
339 
341  const gpudb::HttpUrl& get_url() const { return m_url; }
342 
344  void flush( recordVector_T& flushed_records );
345 
347  bool insert( const gpudb::GenericRecord& record,
348  const RecordKey& key,
349  recordVector_T& flushed_records );
350 
351 
352 }; // end class WorkerQueue
353 
354 
355 
356 
357 
358 } // namespace gpudb
359 
360 
361 
362 #endif // __GPUDB_MULTIHEAD_IO_UTILS_H__
363 
364 
const std::string CHAR1
This property provides optimized memory, disk and query performance for string columns.
const std::string CHAR4
This property provides optimized memory, disk and query performance for string columns.
WorkerList(const GPUdb &gpudb)
const std::string IPV4
This property provides optimized memory, disk and query performance for string columns representing I...
const std::string TIMESTAMP
Valid only for 'long' columns.
const std::string CHAR16
This property provides optimized memory, disk and query performance for string columns.
const std::string TIME
Valid only for 'string' columns.
int32_t get_hash_code() const
const std::string CHAR32
This property provides optimized memory, disk and query performance for string columns.
std::string toString() const
std::vector< gpudb::GenericRecord > recordVector_T
const std::string CHAR64
This property provides optimized memory, disk and query performance for string columns.
const_iterator begin() const
bool operator!=(const RecordKey &rhs) const
bool operator>(const RecordKey &rhs) const
const std::string INT8
This property provides optimized memory and query performance for int columns.
const std::string INT16
This property provides optimized memory and query performance for int columns.
const std::string DATE
Valid only for 'string' columns.
const std::string CHAR128
This property provides optimized memory, disk and query performance for string columns.
const std::string CHAR256
This property provides optimized memory, disk and query performance for string columns.
const std::string DECIMAL
Valid only for 'string' columns.
bool operator!=(const RecordKeyBuilder &rhs) const
worker_list::const_iterator const_iterator
const_iterator end() const
const std::string DATETIME
Valid only for 'string' columns.
const std::string CHAR2
This property provides optimized memory, disk and query performance for string columns.
const std::string CHAR8
This property provides optimized memory, disk and query performance for string columns.
const gpudb::HttpUrl & get_url() const
Returns the URL in string format for this worker.