# frozen_string_literal: true

require_relative 'table'

module GitlabQuality
  module TestTooling
    module CodeCoverage
      module ClickHouse
        class TestFileMappingsTable < GitlabQuality::TestTooling::CodeCoverage::ClickHouse::Table
          TABLE_NAME = "test_file_mappings"

          # Exports test-file mappings to ClickHouse, inserting only the rows that
          # filter_duplicates reports as new or changed. This override exists to avoid
          # accumulating 36M duplicate rows per day in ReplacingMergeTree.
          #
          # Any StandardError raised during the export is logged and then re-raised.
          #
          # @param data [Array<Hash>] Code coverage related data to be pushed to ClickHouse
          # @return [Object, nil] the result of the final logger call, or nil when every
          #   mapping already exists with identical values
          # @raise [StandardError] whatever error interrupted the export
          def push(data) # rubocop:disable Metrics/AbcSize
            return logger.warn("#{LOG_PREFIX} No data found, skipping ClickHouse export!") if data.empty?

            logger.debug("#{LOG_PREFIX} Starting data export to ClickHouse")

            valid_rows = sanitize(data)
            return logger.warn("#{LOG_PREFIX} No valid data found after sanitization, skipping ClickHouse export!") if valid_rows.empty?

            # Only rows that are missing from ClickHouse, or stored with different values, get inserted
            pending_rows = filter_duplicates(valid_rows)

            if pending_rows.empty?
              logger.info("#{LOG_PREFIX} All #{valid_rows.size} mappings already exist with same values, skipping insert to #{full_table_name}")
              return
            end

            client.insert_json_data(table_name, pending_rows)

            duplicate_count = valid_rows.size - pending_rows.size
            logger.info(
              "#{LOG_PREFIX} Successfully pushed #{pending_rows.size} new/changed records " \
              "to #{full_table_name} (skipped #{duplicate_count} duplicates)"
            )
          rescue StandardError => e
            # Log for visibility, then let the caller decide how to handle the failure
            logger.error("#{LOG_PREFIX} Error occurred while pushing data to #{full_table_name}: #{e.message}")
            raise
          end

          private

          # Checks that a record carries both file paths needed for a mapping.
          # The source_file check is not run when the test_file check already failed.
          #
          # @param record [Hash] Raw mapping record
          # @return [Boolean] True if the record is valid, false otherwise
          def valid_record?(record)
            return false unless valid_test_file?(record)

            valid_source_file?(record)
          end

          # Verifies the record names a test file, logging a warning when it does not.
          #
          # @param record [Hash] Raw mapping record, read via the :test_file key
          # @return [Boolean] True if the test_file field is present
          def valid_test_file?(record)
            if record[:test_file].blank?
              logger.warn("#{LOG_PREFIX} Skipping record with nil/empty test_file: #{record}")
              return false
            end

            true
          end

          # Verifies the record names a source file, logging a warning when it does not.
          #
          # @param record [Hash] Raw mapping record, read via the :source_file key
          # @return [Boolean] True if the source_file field is present
          def valid_source_file?(record)
            if record[:source_file].blank?
              logger.warn("#{LOG_PREFIX} Skipping record with nil/empty source_file: #{record}")
              return false
            end

            true
          end

          # Builds the row to insert for one mapping record: the two file paths plus the
          # export timestamp, the CI project path and the ownership columns.
          #
          # @param record [Hash] Raw mapping record (symbol keys)
          # @return [Hash] Transformed mapping data including timestamp and CI metadata
          def sanitized_data_record(record)
            row = {
              timestamp: time,
              test_file: record[:test_file],
              source_file: record[:source_file],
              # CI_PROJECT_PATH is set by the GitLab CI runner and considered trusted infrastructure
              # input. Outside CI it may be user-controlled; the value is passed through
              # sanitize_for_clickhouse before it is embedded in the duplicate-lookup query.
              ci_project_path: ENV.fetch('CI_PROJECT_PATH', nil)
            }

            # Ownership columns fall back to an empty string when the record has no value for them
            %i[category group stage section].each_with_object(row) do |column, result|
              result[column] = record[column] || ''
            end
          end

          # Drops mappings that ClickHouse already holds with identical ownership values.
          # A record is kept when it is either:
          # - a new mapping (no stored entry for its test_file + source_file pair), or
          # - a changed mapping (category/group/stage/section differs from the stored entry)
          #
          # This avoids inserting 36M duplicate rows per day while still letting ownership
          # changes through. The input is returned untouched when it is empty, when the first
          # record has no ci_project_path, or when the check itself raises.
          #
          # @param records [Array<Hash>] Sanitized records to insert
          # @return [Array<Hash>] Records that are new or have changed values
          def filter_duplicates(records) # rubocop:disable Metrics/AbcSize
            return records if records.empty?

            # The whole batch is assumed to come from one project (a single pipeline run),
            # so the first record's project path is used for the lookup
            project_path = records.first[:ci_project_path]
            return records if project_path.blank? # Can't query without project path

            logger.debug("#{LOG_PREFIX} Checking for duplicate mappings in #{full_table_name}")

            # Lookup keyed by "test_file|source_file" with the stored ownership values
            stored_mappings = fetch_existing_mappings(project_path, records)
            ownership_columns = %i[category group stage section]

            kept = records.reject do |record|
              stored = stored_mappings["#{record[:test_file]}|#{record[:source_file]}"]

              # Unknown pair: this is a new mapping, keep it
              next false if stored.nil?

              # Known pair: discard only when every ownership column is unchanged
              ownership_columns.all? { |column| stored[column] == record[column] }
            end

            logger.debug("#{LOG_PREFIX} Found #{kept.size} new/changed mappings out of #{records.size} total")
            kept
          rescue StandardError => e
            # Best effort: a failed duplicate check must not block the export, so insert everything
            logger.warn("#{LOG_PREFIX} Failed to check for duplicates: #{e.message}. Inserting all records.")
            records
          end

          # Fetch the latest stored mappings from ClickHouse for the given project.
          # Returns a hash mapping "test_file|source_file" => {category, group, stage, section}.
          #
          # The query always loads every latest mapping of the project. `records` does not
          # narrow the query; it only trims the returned hash to the test files in the batch,
          # and only when the batch holds at most 10_000 records.
          #
          # Errors are logged and swallowed: an empty hash is returned, which makes the caller
          # treat every record as new.
          #
          # @param ci_project_path [String] The CI project path to filter by
          # @param records [Array<Hash>] Records being inserted (used to trim the lookup hash)
          # @return [Hash{String => Hash}] Existing mappings with their enriched column values
          def fetch_existing_mappings(ci_project_path, records) # rubocop:disable Metrics/AbcSize
            # Query for ALL latest mappings for this project
            # We filter in Ruby rather than building dynamic SQL to avoid injection risks
            sql = <<~SQL
              SELECT
                test_file,
                source_file,
                category,
                `group`,
                stage,
                section
              FROM #{full_table_name}
              WHERE ci_project_path = {project_path:String}
                AND (ci_project_path, test_file, source_file, timestamp) IN (
                  SELECT
                    ci_project_path,
                    test_file,
                    source_file,
                    MAX(timestamp) AS timestamp
                  FROM #{full_table_name}
                  WHERE ci_project_path = {project_path:String}
                  GROUP BY ci_project_path, test_file, source_file
                )
            SQL

            # Substitute the project_path value using manual escaping (per the original note,
            # the client doesn't support parameterized queries).
            quoted_project_path = "'#{sanitize_for_clickhouse(ci_project_path)}'"

            # The block form of gsub is required here. A replacement *string* has its backslash
            # sequences interpreted by gsub, which would collapse the escaped `\\` back into `\`
            # and undo the escaping, letting a crafted `\'` close the SQL string literal early.
            # A block's return value is inserted verbatim.
            sql_with_params = sql.strip.gsub('{project_path:String}') { quoted_project_path }
            results = client.query(sql_with_params)

            # Build lookup hash, filtering to only the test_files we're checking if dataset is small
            test_files_set = records.to_set { |r| r[:test_file] } if records.size <= 10_000

            results.each_with_object({}) do |row, hash|
              # Skip if we're doing selective filtering and this test_file isn't in our set
              next if test_files_set && !test_files_set.include?(row['test_file'])

              key = "#{row['test_file']}|#{row['source_file']}"
              hash[key] = {
                category: row['category'],
                group: row['group'],
                stage: row['stage'],
                section: row['section']
              }
            end
          rescue StandardError => e
            logger.warn("#{LOG_PREFIX} Failed to fetch existing mappings: #{e.message}")
            {} # Return empty hash on error (will treat all records as new)
          end

          # Escapes a value for use inside a single-quoted ClickHouse string literal by
          # doubling every backslash and every single quote. This guards against SQL
          # injection where string interpolation cannot be avoided.
          #
          # @param str [#to_s] Value to sanitize (converted with to_s first)
          # @return [String] Sanitized string
          def sanitize_for_clickhouse(str)
            # Hash-form gsub inserts the mapped value verbatim, handling both characters in one pass
            escapes = { '\\' => '\\\\', "'" => "''" }
            str.to_s.gsub(/[\\']/, escapes)
          end
        end
      end
    end
  end
end
