# CockroachDB Performance Monitoring Rule
## Overview
This rule provides comprehensive guidance for monitoring CockroachDB performance, covering cluster health metrics, query performance analysis, resource utilization tracking, and alerting strategies. It focuses on CockroachDB-specific monitoring approaches using built-in observability features and external tools.
## Implementation
### 1. Built-in Monitoring Queries
#### Cluster Health Assessment
```sql
-- Check overall cluster status
SELECT 
    node_id,
    address,
    build_tag,
    started_at,
    updated_at,
    locality,
    is_available,
    is_live
FROM crdb_internal.gossip_nodes
ORDER BY node_id;
-- Monitor node liveness
SELECT 
    node_id,
    epoch,
    expiration,
    draining,
    decommissioning,
    membership_status
FROM crdb_internal.gossip_liveness
ORDER BY node_id;
-- Check range distribution
SELECT 
    store_id,
    node_id,
    range_count,
    lease_count,
    bytes_used,
    bytes_available,
    used_capacity_percent
FROM crdb_internal.kv_store_status
ORDER BY used_capacity_percent DESC;
```
#### Query Performance Monitoring
```sql
-- Top slowest queries
SELECT 
    query,
    application_name,
    count,
    mean_lat,
    p50_lat,
    p90_lat,
    p99_lat,
    max_lat,
    max_mem_usage,
    max_disk_usage
FROM crdb_internal.statement_statistics
WHERE mean_lat > 0.1  -- Queries slower than 100ms
ORDER BY mean_lat DESC
LIMIT 20;
-- Most resource-intensive queries
SELECT 
    query,
    application_name,
    count,
    mean_lat,
    total_lat,
    max_mem_usage,
    max_disk_usage,
    rows_read,
    rows_written
FROM crdb_internal.statement_statistics
ORDER BY total_lat DESC
LIMIT 20;
-- Query retry statistics
SELECT 
    query,
    application_name,
    count,
    max_retries,
    CASE 
        WHEN count > 0 THEN (max_retries::FLOAT / count::FLOAT) * 100
        ELSE 0
    END as retry_rate_percent
FROM crdb_internal.statement_statistics
WHERE max_retries > 0
ORDER BY retry_rate_percent DESC;
```
#### Connection and Transaction Monitoring
```sql
-- Active connections per application
SELECT 
    application_name,
    COUNT(*) as active_connections,
    COUNT(DISTINCT user_name) as unique_users
FROM crdb_internal.cluster_sessions
WHERE status = 'ACTIVE'
GROUP BY application_name
ORDER BY active_connections DESC;
-- Long-running transactions
SELECT 
    session_id,
    query,
    start,
    NOW() - start as duration,
    application_name,
    user_name,
    client_address
FROM crdb_internal.cluster_queries
WHERE NOW() - start > INTERVAL '30 seconds'
ORDER BY duration DESC;
-- Transaction lock waits
SELECT 
    waiting_txn_id,
    waiting_query,
    blocking_txn_id,
    blocking_query,
    lock_key,
    lock_strength,
    wait_start,
    NOW() - wait_start as wait_duration
FROM crdb_internal.cluster_locks
WHERE NOW() - wait_start > INTERVAL '5 seconds'
ORDER BY wait_duration DESC;
```
### 2. Resource Utilization Monitoring
#### Memory Usage Tracking
```sql
-- Node memory usage
SELECT 
    node_id,
    component,
    level,
    value,
    unit
FROM crdb_internal.feature_usage
WHERE component = 'sql.mem'
ORDER BY node_id, level;
-- SQL memory usage by query
SELECT 
    query,
    application_name,
    count,
    mean_lat,
    max_mem_usage,
    (max_mem_usage / (1024*1024))::INT as max_mem_mb
FROM crdb_internal.statement_statistics
WHERE max_mem_usage > 0
ORDER BY max_mem_usage DESC
LIMIT 20;
-- KV store memory metrics
SELECT 
    store_id,
    node_id,
    capacity_bytes,
    available_bytes,
    used_bytes,
    (used_bytes::FLOAT / capacity_bytes::FLOAT * 100)::INT as usage_percent
FROM crdb_internal.kv_store_status
ORDER BY usage_percent DESC;
```
#### Disk I/O Monitoring
```sql
-- Storage usage by table
SELECT 
    table_schema,
    table_name,
    range_count,
    approximate_disk_bytes,
    (approximate_disk_bytes / (1024*1024*1024))::INT as size_gb
FROM crdb_internal.table_spans
WHERE approximate_disk_bytes > 0
ORDER BY approximate_disk_bytes DESC;
-- Range splits and merges
SELECT 
    table_name,
    range_id,
    start_key,
    end_key,
    range_size_mb,
    lease_holder,
    replicas
FROM crdb_internal.ranges
WHERE range_size_mb > 64  -- Ranges larger than 64MB
ORDER BY range_size_mb DESC;
```
### 3. Performance Alerting System
#### Go-based Monitoring Service
```go
package main
import (
    "context"
    "database/sql"
    "fmt"
    "log"
    "time"
    _ "github.com/lib/pq"
)
type MetricThreshold struct {
    Name      string
    Query     string
    Threshold float64
    Unit      string
}
type AlertManager struct {
    db         *sql.DB
    thresholds []MetricThreshold
    alertChan  chan Alert
}
type Alert struct {
    Metric    string
    Value     float64
    Threshold float64
    Timestamp time.Time
    Severity  string
}
func NewAlertManager(db *sql.DB) *AlertManager {
    return &AlertManager{
        db: db,
        thresholds: []MetricThreshold{
            {
                Name:      "slow_query_count",
                Query:     "SELECT COUNT(*) FROM crdb_internal.statement_statistics WHERE mean_lat > 1.0",
                Threshold: 10,
                Unit:      "queries",
            },
            {
                Name:      "high_retry_rate",
                Query:     "SELECT MAX(max_retries::FLOAT / GREATEST(count, 1)::FLOAT) FROM crdb_internal.statement_statistics",
                Threshold: 0.1,
                Unit:      "ratio",
            },
            {
                Name:      "storage_usage_percent",
                Query:     "SELECT MAX(used_bytes::FLOAT / capacity_bytes::FLOAT * 100) FROM crdb_internal.kv_store_status",
                Threshold: 80,
                Unit:      "percent",
            },
            {
                Name:      "connection_count",
                Query:     "SELECT COUNT(*) FROM crdb_internal.cluster_sessions WHERE status = 'ACTIVE'",
                Threshold: 500,
                Unit:      "connections",
            },
        },
        alertChan: make(chan Alert, 100),
    }
}
func (am *AlertManager) StartMonitoring(ctx context.Context) {
    ticker := time.NewTicker(30 * time.Second)
    defer ticker.Stop()
    go am.handleAlerts(ctx)
    for {
        select {
        case <-ctx.Done():
            return
        case <-ticker.C:
            am.checkMetrics()
        }
    }
}
func (am *AlertManager) checkMetrics() {
    for _, threshold := range am.thresholds {
        var value float64
        err := am.db.QueryRow(threshold.Query).Scan(&value)
        if err != nil {
            log.Printf("Error checking metric %s: %v", threshold.Name, err)
            continue
        }
        if value > threshold.Threshold {
            severity := "WARNING"
            if value > threshold.Threshold*1.5 {
                severity = "CRITICAL"
            }
            alert := Alert{
                Metric:    threshold.Name,
                Value:     value,
                Threshold: threshold.Threshold,
                Timestamp: time.Now(),
                Severity:  severity,
            }
            select {
            case am.alertChan <- alert:
            default:
                log.Printf("Alert channel full, dropping alert for %s", threshold.Name)
            }
        }
    }
}
func (am *AlertManager) handleAlerts(ctx context.Context) {
    for {
        select {
        case <-ctx.Done():
            return
        case alert := <-am.alertChan:
            am.processAlert(alert)
        }
    }
}
func (am *AlertManager) processAlert(alert Alert) {
    log.Printf("[%s] %s: %.2f (threshold: %.2f)",
        alert.Severity, alert.Metric, alert.Value, alert.Threshold)
    
    // Send to external alerting system
    // sendToSlack(alert)
    // sendToEmail(alert)
    // sendToPagerDuty(alert)
}
// Performance metrics collection
func (am *AlertManager) CollectPerformanceMetrics() map[string]interface{} {
    metrics := make(map[string]interface{})
    // Query latency metrics
    var avgLatency, p99Latency float64
    am.db.QueryRow(`
        SELECT AVG(mean_lat), PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY mean_lat)
        FROM crdb_internal.statement_statistics
        WHERE count > 0
    `).Scan(&avgLatency, &p99Latency)
    metrics["avg_query_latency"] = avgLatency
    metrics["p99_query_latency"] = p99Latency
    // Connection metrics
    var activeConnections int
    am.db.QueryRow(`
        SELECT COUNT(*) FROM crdb_internal.cluster_sessions WHERE status = 'ACTIVE'
    `).Scan(&activeConnections)
    metrics["active_connections"] = activeConnections
    // Storage metrics
    var totalStorageUsed, totalStorageAvailable int64
    am.db.QueryRow(`
        SELECT SUM(used_bytes), SUM(available_bytes)
        FROM crdb_internal.kv_store_status
    `).Scan(&totalStorageUsed, &totalStorageAvailable)
    metrics["storage_used_bytes"] = totalStorageUsed
    metrics["storage_available_bytes"] = totalStorageAvailable
    return metrics
}
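// Minimal sketch of wiring the alert manager above into a runnable program.
// The connection string is an assumption for illustration; adjust the host,
// credentials, and sslmode to match your cluster.
func main() {
    db, err := sql.Open("postgres",
        "postgresql://monitoring:password@localhost:26257/myapp?sslmode=disable")
    if err != nil {
        log.Fatalf("failed to open connection: %v", err)
    }
    defer db.Close()

    am := NewAlertManager(db)

    // Periodically print a metrics snapshot alongside the threshold checks.
    go func() {
        for range time.Tick(time.Minute) {
            fmt.Printf("metrics snapshot: %+v\n", am.CollectPerformanceMetrics())
        }
    }()

    // Blocks until the context is cancelled.
    am.StartMonitoring(context.Background())
}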
```
### 4. External Monitoring Integration
#### Prometheus Metrics Export
```yaml
# prometheus.yml configuration
global:
  scrape_interval: 15s
scrape_configs:
  - job_name: 'cockroachdb'
    static_configs:
      - targets: ['localhost:8080']
    metrics_path: '/_status/vars'
    scrape_interval: 10s
rule_files:
  - "cockroachdb_rules.yml"
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['localhost:9093']
```
#### Grafana Dashboard Configuration
```json
{
  "dashboard": {
    "title": "CockroachDB Performance Dashboard",
    "panels": [
      {
        "title": "Query Latency",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.99, rate(sql_exec_latency_bucket[5m]))",
            "legendFormat": "P99"
          },
          {
            "expr": "histogram_quantile(0.95, rate(sql_exec_latency_bucket[5m]))",
            "legendFormat": "P95"
          }
        ]
      },
      {
        "title": "Throughput",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(sql_query_count[5m])",
            "legendFormat": "Queries/sec"
          }
        ]
      },
      {
        "title": "Connection Count",
        "type": "singlestat",
        "targets": [
          {
            "expr": "sql_conns",
            "legendFormat": "Active Connections"
          }
        ]
      }
    ]
  }
}
```
### 5. Custom Monitoring Scripts
#### Health Check Script
```bash
#!/bin/bash
# CockroachDB Health Check Script
set -e
DB_HOST="localhost:26257"
DB_NAME="myapp"
DB_USER="monitoring"
ALERT_EMAIL="admin@company.com"
# Function to execute SQL and get result
# NOTE: add --certs-dir=<path> or --insecure to match your cluster's security settings
execute_sql() {
    cockroach sql --host="$DB_HOST" --database="$DB_NAME" --user="$DB_USER" --execute="$1" --format=csv
}
# Check cluster health
check_cluster_health() {
    echo "Checking cluster health..."
    
    # Check for down nodes
    down_nodes=$(execute_sql "SELECT COUNT(*) FROM crdb_internal.gossip_nodes WHERE is_live = false" | tail -n 1)
    if [ "$down_nodes" -gt 0 ]; then
        echo "ALERT: $down_nodes nodes are down!"
        echo "Down nodes detected in CockroachDB cluster" | mail -s "CockroachDB Alert" $ALERT_EMAIL
    fi
    
    # Check for high storage usage
    high_storage=$(execute_sql "SELECT COUNT(*) FROM crdb_internal.kv_store_status WHERE (used_bytes::FLOAT / capacity_bytes::FLOAT * 100) > 80" | tail -n 1)
    if [ "$high_storage" -gt 0 ]; then
        echo "ALERT: $high_storage nodes have high storage usage!"
        echo "High storage usage detected" | mail -s "CockroachDB Storage Alert" $ALERT_EMAIL
    fi
}
# Check query performance
check_query_performance() {
    echo "Checking query performance..."
    
    # Check for slow queries
    slow_queries=$(execute_sql "SELECT COUNT(*) FROM crdb_internal.statement_statistics WHERE mean_lat > 1.0" | tail -n 1)
    if [ "$slow_queries" -gt 10 ]; then
        echo "ALERT: $slow_queries slow queries detected!"
        echo "High number of slow queries detected" | mail -s "CockroachDB Performance Alert" $ALERT_EMAIL
    fi
    
    # Check retry rate
    high_retry=$(execute_sql "SELECT COUNT(*) FROM crdb_internal.statement_statistics WHERE max_retries > count * 0.1" | tail -n 1)
    if [ "$high_retry" -gt 0 ]; then
        echo "ALERT: High retry rate detected!"
        echo "High transaction retry rate detected" | mail -s "CockroachDB Retry Alert" $ALERT_EMAIL
    fi
}
# Check replication health
check_replication() {
    echo "Checking replication health..."
    
    # Check for under-replicated ranges
    under_replicated=$(execute_sql "SELECT COUNT(*) FROM crdb_internal.ranges WHERE array_length(replicas, 1) < 3" | tail -n 1)
    if [ "$under_replicated" -gt 0 ]; then
        echo "ALERT: $under_replicated under-replicated ranges!"
        echo "Under-replicated ranges detected" | mail -s "CockroachDB Replication Alert" $ALERT_EMAIL
    fi
}
# Main execution
main() {
    echo "Starting CockroachDB health check at $(date)"
    
    check_cluster_health
    check_query_performance
    check_replication
    
    echo "Health check completed at $(date)"
}
# Run if called directly
if [ "${BASH_SOURCE[0]}" == "${0}" ]; then
    main "$@"
fi
```
#### Performance Report Generator
```python
import psycopg2
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
class CockroachDBPerformanceReport:
    def __init__(self, connection_string):
        self.conn = psycopg2.connect(connection_string)
        
    def generate_daily_report(self):
        """Generate comprehensive daily performance report"""
        
        # Query performance metrics
        query_stats = self.get_query_performance_stats()
        
        # Resource utilization
        resource_stats = self.get_resource_utilization()
        
        # Cluster health
        cluster_health = self.get_cluster_health()
        
        # Generate visualizations
        self.create_performance_charts(query_stats, resource_stats)
        
        # Generate HTML report
        html_report = self.generate_html_report(query_stats, resource_stats, cluster_health)
        
        return html_report
    
    def get_query_performance_stats(self):
        """Get query performance statistics"""
        query = """
        SELECT 
            query,
            application_name,
            count,
            mean_lat,
            p99_lat,
            max_mem_usage,
            max_retries,
            rows_read,
            rows_written
        FROM crdb_internal.statement_statistics
        WHERE count > 0
        ORDER BY total_lat DESC
        LIMIT 50
        """
        
        return pd.read_sql(query, self.conn)
    
    def get_resource_utilization(self):
        """Get resource utilization metrics"""
        query = """
        SELECT 
            node_id,
            store_id,
            capacity_bytes,
            available_bytes,
            used_bytes,
            (used_bytes::FLOAT / capacity_bytes::FLOAT * 100) as usage_percent
        FROM crdb_internal.kv_store_status
        ORDER BY usage_percent DESC
        """
        
        return pd.read_sql(query, self.conn)
    
    def get_cluster_health(self):
        """Get cluster health metrics"""
        query = """
        SELECT 
            node_id,
            address,
            is_available,
            is_live,
            locality
        FROM crdb_internal.gossip_nodes
        ORDER BY node_id
        """
        
        return pd.read_sql(query, self.conn)
    
    def create_performance_charts(self, query_stats, resource_stats):
        """Create performance visualization charts"""
        
        # Create figure with subplots
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        
        # Query latency distribution
        axes[0, 0].hist(query_stats['mean_lat'], bins=30, alpha=0.7)
        axes[0, 0].set_title('Query Latency Distribution')
        axes[0, 0].set_xlabel('Mean Latency (seconds)')
        axes[0, 0].set_ylabel('Frequency')
        
        # Top queries by execution count
        top_queries = query_stats.nlargest(10, 'count')
        axes[0, 1].barh(range(len(top_queries)), top_queries['count'])
        axes[0, 1].set_title('Top Queries by Execution Count')
        axes[0, 1].set_xlabel('Execution Count')
        
        # Storage usage by node
        axes[1, 0].bar(resource_stats['node_id'], resource_stats['usage_percent'])
        axes[1, 0].set_title('Storage Usage by Node')
        axes[1, 0].set_xlabel('Node ID')
        axes[1, 0].set_ylabel('Usage Percent')
        
        # Memory usage distribution
        axes[1, 1].scatter(query_stats['count'], query_stats['max_mem_usage'])
        axes[1, 1].set_title('Memory Usage vs Query Count')
        axes[1, 1].set_xlabel('Query Count')
        axes[1, 1].set_ylabel('Max Memory Usage (bytes)')
        
        plt.tight_layout()
        plt.savefig('performance_report.png', dpi=300, bbox_inches='tight')
        plt.close()
    
    def generate_html_report(self, query_stats, resource_stats, cluster_health):
        """Generate HTML performance report"""
        
        html_template = """
        <!DOCTYPE html>
        <html>
        <head>
            <title>CockroachDB Performance Report</title>
            <style>
                body {{ font-family: Arial, sans-serif; margin: 40px; }}
                .header {{ background-color: #f0f0f0; padding: 20px; border-radius: 5px; }}
                .metric {{ margin: 10px 0; padding: 10px; border-left: 4px solid #007cba; }}
                .alert {{ border-left-color: #ff6b6b; background-color: #fff5f5; }}
                .warning {{ border-left-color: #ffa500; background-color: #fff8e1; }}
                .good {{ border-left-color: #51cf66; background-color: #f3fff3; }}
                table {{ width: 100%; border-collapse: collapse; margin: 20px 0; }}
                th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
                th {{ background-color: #f2f2f2; }}
            </style>
        </head>
        <body>
            <div class="header">
                <h1>CockroachDB Performance Report</h1>
                <p>Generated: {timestamp}</p>
            </div>
            
            <h2>Cluster Health Summary</h2>
            <div class="metric {health_class}">
                <strong>Cluster Status:</strong> {cluster_status}<br>
                <strong>Active Nodes:</strong> {active_nodes}<br>
                <strong>Total Nodes:</strong> {total_nodes}
            </div>
            
            <h2>Query Performance</h2>
            <div class="metric">
                <strong>Average Query Latency:</strong> {avg_latency:.3f}s<br>
                <strong>P99 Query Latency:</strong> {p99_latency:.3f}s<br>
                <strong>Total Queries:</strong> {total_queries:,}
            </div>
            
            <h2>Resource Utilization</h2>
            <div class="metric {storage_class}">
                <strong>Average Storage Usage:</strong> {avg_storage:.1f}%<br>
                <strong>Max Storage Usage:</strong> {max_storage:.1f}%
            </div>
            
            <h2>Top Slow Queries</h2>
            <table>
                <thead>
                    <tr>
                        <th>Query</th>
                        <th>Count</th>
                        <th>Mean Latency</th>
                        <th>P99 Latency</th>
                        <th>Max Memory</th>
                    </tr>
                </thead>
                <tbody>
                    {slow_queries_table}
                </tbody>
            </table>
            
            <h2>Performance Chart</h2>
            <img src="performance_report.png" alt="Performance Charts" style="max-width: 100%;"/>
            
        </body>
        </html>
        """
        
        # Calculate metrics
        active_nodes = cluster_health['is_live'].sum()
        total_nodes = len(cluster_health)
        avg_latency = query_stats['mean_lat'].mean()
        p99_latency = query_stats['p99_lat'].quantile(0.99)
        total_queries = query_stats['count'].sum()
        avg_storage = resource_stats['usage_percent'].mean()
        max_storage = resource_stats['usage_percent'].max()
        
        # Determine health status
        health_class = "good" if active_nodes == total_nodes else "alert"
        cluster_status = "Healthy" if active_nodes == total_nodes else "Degraded"
        storage_class = "good" if max_storage < 80 else "warning" if max_storage < 90 else "alert"
        
        # Generate slow queries table
        slow_queries = query_stats.nlargest(10, 'mean_lat')
        slow_queries_table = ""
        for _, row in slow_queries.iterrows():
            slow_queries_table += f"""
                <tr>
                    <td>{row['query'][:100]}...</td>
                    <td>{row['count']}</td>
                    <td>{row['mean_lat']:.3f}s</td>
                    <td>{row['p99_lat']:.3f}s</td>
                    <td>{row['max_mem_usage']:,} bytes</td>
                </tr>
            """
        
        return html_template.format(
            timestamp=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            health_class=health_class,
            cluster_status=cluster_status,
            active_nodes=active_nodes,
            total_nodes=total_nodes,
            avg_latency=avg_latency,
            p99_latency=p99_latency,
            total_queries=total_queries,
            storage_class=storage_class,
            avg_storage=avg_storage,
            max_storage=max_storage,
            slow_queries_table=slow_queries_table
        )
# Usage example
if __name__ == "__main__":
    connection_string = "postgresql://monitoring:password@localhost:26257/myapp"
    reporter = CockroachDBPerformanceReport(connection_string)
    
    # Generate daily report
    report = reporter.generate_daily_report()
    
    # Save to file
    with open('daily_performance_report.html', 'w') as f:
        f.write(report)
    
    print("Performance report generated successfully!")
```
## Best Practices
### 1. Monitoring Strategy
- **Establish baselines**: Collect performance metrics during normal operations
- **Set appropriate thresholds**: Use percentiles rather than averages for alerting (see the sketch after this list)
- **Monitor trends**: Track performance degradation over time
- **Correlate metrics**: Combine multiple metrics for better insights
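A minimal sketch of the percentile guidance above, in the same Go style as the monitoring service in section 3: it derives a slow-query threshold from a p99 baseline rather than an average. The 1.5x headroom factor and the connection string are assumptions, and the query reuses the `crdb_internal.statement_statistics` view referenced throughout this rule.
```go
package main

import (
    "database/sql"
    "fmt"
    "log"

    _ "github.com/lib/pq"
)

// baselineThreshold derives an alert threshold from the observed p99 of
// per-statement mean latencies instead of a cluster-wide average.
// headroom (e.g. 1.5) controls how far above the baseline an alert fires.
func baselineThreshold(db *sql.DB, headroom float64) (float64, error) {
    var p99 float64
    err := db.QueryRow(`
        SELECT PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY mean_lat)
        FROM crdb_internal.statement_statistics
        WHERE count > 0
    `).Scan(&p99)
    if err != nil {
        return 0, err
    }
    return p99 * headroom, nil
}

func main() {
    // Placeholder DSN; adjust credentials and sslmode for your cluster.
    db, err := sql.Open("postgres",
        "postgresql://monitoring:password@localhost:26257/myapp?sslmode=disable")
    if err != nil {
        log.Fatal(err)
    }
    defer db.Close()

    threshold, err := baselineThreshold(db, 1.5)
    if err != nil {
        log.Fatalf("failed to compute baseline: %v", err)
    }
    fmt.Printf("slow-query alert threshold: %.3fs (p99 baseline x1.5)\n", threshold)
}
```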
### 2. Alert Configuration
- **Avoid alert fatigue**: Set thresholds that indicate real problems
- **Use escalation**: Implement tiered alerting for different severity levels (see the sketch after this list)
- **Include context**: Provide actionable information in alerts
- **Test alerting**: Regularly test alert delivery mechanisms
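A small sketch of the escalation and context points above: severities map to different destinations, and every message carries enough detail to act on. The destinations are log statements standing in for real Slack, email, or PagerDuty integrations, and the `Alert` shape mirrors the one used by the monitoring service in section 3.
```go
package main

import (
    "fmt"
    "log"
    "time"
)

// Alert mirrors the shape used by the monitoring service in section 3.
type Alert struct {
    Metric    string
    Value     float64
    Threshold float64
    Timestamp time.Time
    Severity  string
}

// routeAlert applies tiered escalation: CRITICAL alerts page someone,
// WARNING alerts go to a chat channel, everything else is only logged.
// The destinations are placeholders; wire in real integrations as needed.
func routeAlert(a Alert) {
    // Include enough context to act on the alert without opening a dashboard.
    msg := fmt.Sprintf("[%s] %s = %.2f (threshold %.2f) at %s",
        a.Severity, a.Metric, a.Value, a.Threshold,
        a.Timestamp.Format(time.RFC3339))

    switch a.Severity {
    case "CRITICAL":
        log.Printf("PAGE on-call: %s", msg) // e.g. PagerDuty
    case "WARNING":
        log.Printf("NOTIFY channel: %s", msg) // e.g. Slack
    default:
        log.Printf("LOG only: %s", msg)
    }
}

func main() {
    routeAlert(Alert{
        Metric:    "storage_usage_percent",
        Value:     91.2,
        Threshold: 80,
        Timestamp: time.Now(),
        Severity:  "CRITICAL",
    })
}
```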
### 3. Performance Baselines
- **Regular benchmarking**: Establish performance baselines for key operations
- **Seasonal adjustments**: Account for expected traffic patterns
- **Capacity planning**: Use historical data for future capacity needs
- **SLA tracking**: Monitor against defined service level agreements (see the sketch after this list)
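A sketch of the SLA-tracking idea above, assuming a 500ms latency target and using per-statement mean latencies from `crdb_internal.statement_statistics` as a rough compliance proxy; the SLA value and connection string are placeholders.
```go
package main

import (
    "database/sql"
    "fmt"
    "log"

    _ "github.com/lib/pq"
)

// slaCompliance reports the share of statement fingerprints whose mean
// latency stays under the SLA target, using the same statistics view as
// the rest of this rule. slaSeconds (e.g. 0.5 for 500ms) is an assumption.
func slaCompliance(db *sql.DB, slaSeconds float64) (float64, error) {
    var ratio float64
    err := db.QueryRow(`
        SELECT coalesce(
            (count(*) FILTER (WHERE mean_lat <= $1))::FLOAT / NULLIF(count(*), 0),
            1.0)
        FROM crdb_internal.statement_statistics
        WHERE count > 0
    `, slaSeconds).Scan(&ratio)
    return ratio, err
}

func main() {
    // Placeholder DSN; adjust for your cluster.
    db, err := sql.Open("postgres",
        "postgresql://monitoring:password@localhost:26257/myapp?sslmode=disable")
    if err != nil {
        log.Fatal(err)
    }
    defer db.Close()

    ratio, err := slaCompliance(db, 0.5)
    if err != nil {
        log.Fatalf("SLA check failed: %v", err)
    }
    fmt.Printf("%.1f%% of statement fingerprints meet the 500ms SLA\n", ratio*100)
}
```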
### 4. Data Retention
- **Archive old metrics**: Implement data retention policies for monitoring data
- **Aggregate historical data**: Use rollup strategies for long-term storage (see the rollup sketch after this list)
- **Backup monitoring configs**: Version control monitoring configurations
- **Document thresholds**: Maintain documentation for alert thresholds
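One way to apply the rollup guidance above, sketched against hypothetical `monitoring.metrics_raw(name, value, collected_at)` and `monitoring.metrics_hourly` tables: aggregate hourly summaries, then prune raw rows older than the retention window. The table names and the 7-day window are illustrative.
```go
package main

import (
    "database/sql"
    "log"

    _ "github.com/lib/pq"
)

// rollupAndPrune aggregates raw metric samples into hourly rollups and
// deletes raw rows past the retention window. The monitoring.metrics_raw
// and monitoring.metrics_hourly tables are hypothetical examples.
func rollupAndPrune(db *sql.DB) error {
    stmts := []string{
        `CREATE TABLE IF NOT EXISTS monitoring.metrics_hourly (
            name STRING,
            hour TIMESTAMPTZ,
            avg_value FLOAT,
            max_value FLOAT,
            sample_count INT,
            PRIMARY KEY (name, hour)
        )`,
        // Upsert hourly aggregates from the raw samples.
        `UPSERT INTO monitoring.metrics_hourly
         SELECT name,
                date_trunc('hour', collected_at) AS hour,
                avg(value), max(value), count(*)
         FROM monitoring.metrics_raw
         GROUP BY name, date_trunc('hour', collected_at)`,
        // Prune raw samples older than the retention window (7 days here).
        `DELETE FROM monitoring.metrics_raw
         WHERE collected_at < now() - INTERVAL '7 days'`,
    }
    for _, stmt := range stmts {
        if _, err := db.Exec(stmt); err != nil {
            return err
        }
    }
    return nil
}

func main() {
    // Placeholder DSN; adjust for your cluster.
    db, err := sql.Open("postgres",
        "postgresql://monitoring:password@localhost:26257/myapp?sslmode=disable")
    if err != nil {
        log.Fatal(err)
    }
    defer db.Close()

    if err := rollupAndPrune(db); err != nil {
        log.Fatalf("rollup failed: %v", err)
    }
    log.Println("rollup and prune completed")
}
```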
## Common Issues
### 1. High Query Latency
**Problem**: Queries taking longer than expected
**Solution**: 
```sql
-- Identify slow queries
SELECT 
    query,
    count,
    mean_lat,
    p99_lat,
    max_mem_usage,
    rows_read
FROM crdb_internal.statement_statistics
WHERE mean_lat > 1.0
ORDER BY mean_lat DESC;
-- Check for missing indexes
SELECT 
    query,
    plan_gist,
    count,
    mean_lat
FROM crdb_internal.statement_statistics
WHERE plan_gist LIKE '%Scan%'
  AND mean_lat > 0.5
ORDER BY mean_lat DESC;
```
### 2. Memory Pressure
**Problem**: Nodes experiencing high memory usage
**Solution**: 
```sql
-- Monitor memory usage by query
SELECT 
    query,
    application_name,
    count,
    max_mem_usage,
    (max_mem_usage / (1024*1024))::INT as max_mem_mb
FROM crdb_internal.statement_statistics
WHERE max_mem_usage > 100*1024*1024  -- > 100MB
ORDER BY max_mem_usage DESC;
-- Check for memory leaks
SELECT 
    node_id,
    store_id,
    capacity_bytes,
    available_bytes,
    used_bytes
FROM crdb_internal.kv_store_status
WHERE (used_bytes::FLOAT / capacity_bytes::FLOAT) > 0.8;
```
### 3. Connection Pool Issues
**Problem**: Connection pool exhaustion or inefficiency
**Solution**: 
```sql
-- Monitor connection usage
SELECT 
    application_name,
    user_name,
    client_address,
    COUNT(*) as connection_count,
    COUNT(DISTINCT session_id) as unique_sessions
FROM crdb_internal.cluster_sessions
GROUP BY application_name, user_name, client_address
ORDER BY connection_count DESC;
-- Check for idle connections
SELECT 
    session_id,
    application_name,
    last_active_query,
    session_start,
    NOW() - last_active_query as idle_time
FROM crdb_internal.cluster_sessions
WHERE NOW() - last_active_query > INTERVAL '10 minutes'
ORDER BY idle_time DESC;
```
### 4. Storage Hotspots
**Problem**: Uneven storage distribution across nodes
**Solution**: 
```sql
-- Check storage distribution
SELECT 
    node_id,
    store_id,
    range_count,
    lease_count,
    (used_bytes / (1024*1024*1024))::INT as used_gb,
    (available_bytes / (1024*1024*1024))::INT as available_gb
FROM crdb_internal.kv_store_status
ORDER BY used_bytes DESC;
-- Identify hot ranges
SELECT 
    range_id,
    table_name,
    start_key,
    end_key,
    lease_holder,
    range_size_mb
FROM crdb_internal.ranges
WHERE range_size_mb > 64
ORDER BY range_size_mb DESC;
```
### 5. Replication Lag
**Problem**: Replication falling behind
**Solution**: 
```sql
-- Check replication status
SELECT 
    range_id,
    lease_holder,
    replicas,
    replica_localities,
    split_enforced_until
FROM crdb_internal.ranges
WHERE array_length(replicas, 1) < 3
ORDER BY range_id;
-- Monitor raft log size
SELECT 
    range_id,
    raft_leader,
    raft_state,
    approximate_disk_bytes
FROM crdb_internal.ranges
WHERE approximate_disk_bytes > 64*1024*1024  -- > 64MB
ORDER BY approximate_disk_bytes DESC;
```
## Automated Monitoring Setup
### Cron Job for Regular Checks
```bash
# Add to crontab: crontab -e
# Run health check every 5 minutes
*/5 * * * * /path/to/cockroach_health_check.sh >> /var/log/cockroach_monitoring.log 2>&1
# Generate daily performance report
0 6 * * * /path/to/generate_performance_report.py >> /var/log/cockroach_reports.log 2>&1
# Weekly storage cleanup
0 2 * * 0 /path/to/cleanup_old_metrics.sh >> /var/log/cockroach_cleanup.log 2>&1
```
### Systemd Service for Continuous Monitoring
```ini
[Unit]
Description=CockroachDB Performance Monitor
After=network.target
[Service]
Type=simple
# Example values; substitute the service account, binary path, and restart policy for your environment
User=cockroach-monitor
ExecStart=/usr/local/bin/cockroach-monitor
Restart=on-failure
RestartSec=10

[Install]
WantedBy=multi-user.target
```