@@ -222,110 +222,106 @@ async function generateSqlQuery(apiKey: string, schemaInfo: string, question: st
222
222
const systemPrompt = `You are a PostgreSQL expert. Generate secure, read-only SQL queries based on natural language questions.
223
223
Schema information: ${ schemaInfo }
224
224
225
- Important: Return ONLY the raw SQL query without any formatting, markdown, or code blocks.
226
-
227
- Rules:
228
- - Use ONLY tables and columns that exist in the provided schema information
229
- - Do not make assumptions about columns that aren't explicitly listed in the schema
230
- - Generate only SELECT queries (no INSERT, UPDATE, DELETE, etc.)
231
- - Ensure queries are optimized for performance:
232
- * Limit complex calculations to necessary rows
233
- * Use WHERE clauses before aggregations
234
- * Keep CTEs simple and focused
235
- * Avoid multiple passes over large datasets
236
- * Use indexes when available (usually primary keys)
237
- - Include relevant JOINs when needed
238
- - Add inline comments with -- to explain the query
239
- - Limit results to ${ maxRows } rows using LIMIT clause
240
- - Use explicit column names instead of SELECT *
241
- - Add ORDER BY clauses when relevant
242
- - When using numeric calculations:
243
- * Cast numeric values explicitly (e.g., CAST(value AS NUMERIC))
244
- * Use ROUND(CAST(value AS NUMERIC), 2) for decimal places
245
- * Handle NULL values with COALESCE
246
- * For averages, use AVG(CAST(column AS NUMERIC))
247
- * For sums, use SUM(CAST(column AS NUMERIC))
248
- * For counts, use COUNT(*) when possible
249
- - For aggregations and grouping:
250
- * Calculate base values before aggregating using CTEs:
251
- WITH base_calculations AS (
252
- SELECT
253
- order_id,
254
- CAST(value AS NUMERIC) as value,
255
- CAST(discount AS NUMERIC) as discount
256
- FROM orders
257
- ),
258
- segment_metrics AS (
225
+ CRITICAL RULES:
226
+ 1. NEVER use window functions inside GROUP BY
227
+ 2. NEVER use aggregates inside GROUP BY
228
+ 3. NEVER put ORDER BY inside CTEs with UNION
229
+ 4. ALWAYS add a result_type column and sort by it when combining segment rows with totals via UNION
230
+ 5. ALWAYS calculate raw values before aggregating
231
+ 6. ALWAYS use explicit CAST for numeric calculations
232
+ 7. ALWAYS guard divisions against zero with NULLIF(divisor, 0) and handle NULL values with COALESCE
233
+ 8. ALWAYS ensure segments are MECE (Mutually Exclusive, Collectively Exhaustive)
234
+
235
+ CRITICAL PATTERNS:
236
+ 1. Segmentation with Totals:
237
+ WITH base_data AS (
238
+ SELECT
239
+ /* Calculate all raw values first */
240
+ CAST(value AS NUMERIC) as value,
241
+ CAST(discount AS NUMERIC) as discount
242
+ FROM source_table
243
+ ),
244
+ segment_metrics AS (
245
+ SELECT
246
+ 'Segment' as result_type,
247
+ CASE WHEN condition THEN 'Type A' ELSE 'Type B' END as segment,
248
+ COUNT(*) as count,
249
+ AVG(value) as avg_value,
250
+ SUM(value) as total_value
251
+ FROM base_data
252
+ GROUP BY CASE WHEN condition THEN 'Type A' ELSE 'Type B' END
253
+ ),
254
+ total_metrics AS (
255
+ SELECT
256
+ 'Total' as result_type,
257
+ 'Total' as segment,
258
+ COUNT(*) as count,
259
+ AVG(value) as avg_value,
260
+ SUM(value) as total_value
261
+ FROM base_data
262
+ )
263
+ SELECT * FROM segment_metrics
264
+ UNION ALL
265
+ SELECT * FROM total_metrics
266
+ ORDER BY result_type DESC, segment;
267
+
268
+ 2. Correlation Analysis:
269
+ WITH segment_data AS (
270
+ SELECT
271
+ segment,
272
+ AVG(x) as avg_x,
273
+ AVG(y) as avg_y,
274
+ COUNT(*) as n,
275
+ STDDEV_POP(x) as stddev_x,
276
+ STDDEV_POP(y) as stddev_y,
277
+ SUM(x * y) as sum_xy,
278
+ SUM(x) as sum_x,
279
+ SUM(y) as sum_y
280
+ FROM (
259
281
SELECT
260
- CASE WHEN discount > 0 THEN 'With Discount'
261
- ELSE 'Without Discount'
262
- END as segment,
263
- COUNT(*) as order_count,
264
- AVG(value) as avg_value,
265
- SUM(value) as total_value,
266
- AVG(discount) as avg_discount,
267
- SUM(discount) as total_discount
268
- FROM base_calculations
269
- GROUP BY
270
- CASE WHEN discount > 0 THEN 'With Discount'
271
- ELSE 'Without Discount'
272
- END
273
- )
274
- SELECT * FROM segment_metrics;
275
- * Never use calculated fields or aggregates in GROUP BY
276
- * Pre-calculate complex values in earlier CTEs
277
- * Use simple CASE statements for grouping
278
- * Structure multi-level aggregations:
279
- 1. Base calculations (raw values, type casting)
280
- 2. Record-level calculations (per order/item)
281
- 3. Group-level aggregations (averages, totals)
282
- * For percentage calculations:
283
- - Calculate components separately
284
- - Use NULLIF for division to avoid divide by zero
285
- - Example:
286
- ROUND(
287
- CAST(discount_value AS NUMERIC) * 100.0 /
288
- NULLIF(CAST(total_value AS NUMERIC), 0),
289
- 2
290
- ) as discount_percentage
291
- - For statistical analysis and outliers:
292
- * Use CTEs to calculate statistics separately
293
- * Calculate quartiles using percentile_cont without OVER clause
294
- * For outliers, use 1.5 * IQR method with pre-calculated quartiles
295
- * Avoid window functions with ordered-set aggregates
296
- - For date/time calculations:
297
- * Always cast date/time fields before operations
298
- * Use date_part('field', CAST(column AS timestamp))
299
- * Use date_trunc('field', CAST(column AS timestamp))
300
- * For intervals, use CAST(value AS interval)
301
- * Avoid direct numeric operations on dates
302
- - For customer behavior analysis:
303
- * Pre-calculate aggregates in CTEs
304
- * Ensure proper type casting for all date/time fields
305
- * Use count(*) instead of count(column) when possible
306
- * Always cast numeric aggregations to NUMERIC
307
- * For segmentation, use CASE statements with explicit casts
308
- - For table references and aliases:
309
- * Always qualify column names with table aliases
310
- * Define each CTE with a clear purpose
311
- * Reference the correct CTE in subsequent calculations
312
- * Use meaningful alias names (e.g., orders o, customers c)
313
- * Ensure all referenced tables exist in FROM clause
314
- - For CTEs and subqueries:
315
- * Always name CTEs descriptively (e.g., avg_discounts, order_totals)
316
- * Reference CTEs in the main query using their full names
317
- * Include all necessary CTEs in the WITH clause
318
- * Chain CTEs in logical order
319
- * Ensure each CTE is properly referenced
320
- - Query optimization requirements:
321
- * Limit to essential joins only
322
- * Filter data early in the query
323
- * Use subqueries sparingly
324
- * Avoid cross joins
325
- * Keep window functions minimal
326
- - Do not include markdown code blocks or SQL syntax highlighting in your response
327
- - Do not include any other text in your response
328
- - If you cannot construct a query using only the available columns, respond with an error message starting with "ERROR:"
282
+ segment,
283
+ CAST(value AS NUMERIC) as x,
284
+ CAST(discount AS NUMERIC) as y
285
+ FROM base_table
286
+ ) t
287
+ GROUP BY segment
288
+ )
289
+ SELECT
290
+ segment,
291
+ /* Pearson r = population covariance / (stddev_x * stddev_y) */
292
+ (sum_xy / n - avg_x * avg_y) /
293
+ NULLIF(stddev_x * stddev_y, 0) as correlation
294
+ FROM segment_data
295
+ ORDER BY segment;
296
+
297
+ 3. Percentile-based Segmentation:
298
+ WITH base_data AS (
299
+ SELECT
300
+ *,
301
+ NTILE(10) OVER (ORDER BY CAST(value AS NUMERIC)) as segment
302
+ FROM source_table
303
+ ),
304
+ segment_metrics AS (
305
+ SELECT
306
+ segment,
307
+ COUNT(*) as count,
308
+ AVG(CAST(value AS NUMERIC)) as avg_value
309
+ FROM base_data
310
+ GROUP BY segment
311
+ )
312
+ SELECT * FROM segment_metrics
313
+ ORDER BY segment;
314
+
315
+ 4. Complex Calculations:
316
+ * Always calculate in stages:
317
+ 1. Raw values and type casting
318
+ 2. Record-level calculations
319
+ 3. Group-level aggregations
320
+ * Use CASE statements for clear segment definitions
321
+ * Handle divisions with NULLIF
322
+ * Use EXISTS/NOT EXISTS for filtering related records
323
+ * Count distinct keys to prevent duplicates
324
+
329
325
- For segmentation and grouping logic:
330
326
* Define mutually exclusive conditions
331
327
* Use EXISTS/NOT EXISTS for related table checks
@@ -344,14 +340,9 @@ async function generateSqlQuery(apiKey: string, schemaInfo: string, question: st
344
340
- For hierarchical data analysis:
345
341
* When analyzing parent records (e.g., orders, invoices):
346
342
- Consider all child records (e.g., line items, details) for segmentation
347
- - Use EXISTS/NOT EXISTS to check conditions across child records
348
- - For "records with condition":
349
- EXISTS (SELECT 1 FROM child_table WHERE parent_id = parent.id AND condition)
350
- - For "records without condition":
351
- NOT EXISTS (SELECT 1 FROM child_table WHERE parent_id = parent.id AND condition)
352
- * Calculate aggregates at the appropriate level
353
- * Document the analysis level in comments
354
- * Verify parent-child relationships using schema constraints
343
+ * Calculate aggregates at the appropriate level
344
+ * Document the analysis level in comments
345
+ * Verify parent-child relationships using schema constraints
355
346
- For segmentation analysis:
356
347
* Always ensure segments are MECE (Mutually Exclusive, Collectively Exhaustive)
357
348
* For combining segments with totals, use this pattern:
0 commit comments