Efficient and reliable sampling methodologies for pass@k and pass^k evaluation
import math
from collections import defaultdict

import numpy as np
from scipy.stats import beta


# ==============================================================================
# 1. Metric Estimation Functions
# ==============================================================================
def pass_at_k_unbiased_estimator(n, c, k):
    """
    Computes the unbiased estimator for pass@k:

        pass@k = 1 - C(n - c, k) / C(n, k)

    Args:
        n (int): Total number of solutions generated.
        c (int): Number of correct solutions among n.
        k (int): The k in pass@k.

    Returns:
        float: The pass@k estimate.
    """
    if n < k:
        raise ValueError("n must be greater than or equal to k")
    if n - c < k:
        # Trivial case: there are fewer than k failures, so every size-k
        # subset contains at least one correct solution.
        return 1.0
    # The estimator is 1 - P(all k sampled solutions fail), where the
    # probability of all k failing is C(n - c, k) / C(n, k). The ratio is
    # computed in log space to avoid overflow for large n.
    log_comb_n_k = math.log(math.comb(n, k))
    log_comb_n_minus_c_k = math.log(math.comb(n - c, k))
    return 1.0 - math.exp(log_comb_n_minus_c_k - log_comb_n_k)
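
# Hand-checkable sanity check (our addition, not part of the original gist):
# with n = 10 samples, c = 3 correct, and k = 5, the all-fail probability is
# C(7, 5) / C(10, 5) = 21 / 252, so pass@5 = 1 - 21/252 ≈ 0.9167.
assert abs(pass_at_k_unbiased_estimator(10, 3, 5) - (1.0 - 21.0 / 252.0)) < 1e-12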

def pass_power_k_estimator(n, c, k):
    """
    Computes the pass^k estimator: the probability that k independent
    attempts all succeed, estimated with the plug-in value (c / n) ** k.

    Args:
        n (int): Total number of solutions generated.
        c (int): Number of correct solutions among n.
        k (int): The k in pass^k.

    Returns:
        float: The pass^k estimate.
    """
    if n == 0:
        return 0.0
    # The estimated probability of a single success is c / n; for k
    # independent trials, the probability of all succeeding is (c / n) ** k.
    # Note that this plug-in estimate is biased upward for small n (Jensen's
    # inequality, since x ** k is convex).
    p = c / n
    return p ** k
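
# A sketch of an unbiased alternative (our addition, not used by the pipeline
# below): since c ~ Binomial(n, p) implies E[C(c, k)] = C(n, k) * p ** k, the
# ratio C(c, k) / C(n, k) is an unbiased estimator of p ** k.
def pass_power_k_unbiased_estimator(n, c, k):
    """Unbiased estimator of pass^k; returns 0.0 when c < k."""
    if n < k:
        raise ValueError("n must be greater than or equal to k")
    if c < k:
        return 0.0
    return math.comb(c, k) / math.comb(n, k)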

# ==============================================================================
# 2. Beta-Binomial Dynamic Sampling
# ==============================================================================

def get_beta_binomial_params(n, c):
    """
    Computes the parameters (alpha, beta) of the posterior Beta distribution
    over the problem's success probability, given n trials and c successes.
    We use a uniform Beta(1, 1) prior.

    Args:
        n (int): Number of trials.
        c (int): Number of successes.

    Returns:
        tuple: A tuple (alpha, beta) for the posterior Beta distribution.
    """
    return c + 1, n - c + 1
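
# A small helper sketch (our addition) that puts the scipy.stats.beta import
# to use: an equal-tailed credible interval for the success probability under
# the same Beta(1, 1) prior. The function name and 95% default are assumptions.
def posterior_credible_interval(n, c, level=0.95):
    """Returns the (lower, upper) equal-tailed credible interval for p."""
    alpha, beta_param = get_beta_binomial_params(n, c)
    tail = (1.0 - level) / 2.0
    return beta.ppf(tail, alpha, beta_param), beta.ppf(1.0 - tail, alpha, beta_param)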

def dynamic_sampling(problem, generator_func, max_n=500, tolerance=0.01):
    """
    Dynamically samples solutions for a single problem using a beta-binomial
    model. The process stops once the posterior standard deviation of the
    underlying success probability falls below `tolerance`, or once max_n
    samples have been drawn.

    This follows the unbiased-estimation approach of Kulal et al. (2019)
    combined with a Bayesian stopping rule.

    Args:
        problem (Problem): The problem to sample solutions for.
        generator_func (function): A function that takes a problem and returns
            a generated solution.
        max_n (int): The maximum number of samples to generate.
        tolerance (float): The target standard deviation of the posterior.
            A smaller value means more samples and higher confidence.

    Returns:
        tuple: A tuple (n, c), where n is the total number of samples and
            c is the number of correct ones.
    """
    c = 0
    # First, run a small pilot study to get an initial estimate.
    pilot_n = min(10, max_n)
    solutions = [generator_func(problem) for _ in range(pilot_n)]
    c += sum(1 for sol in solutions if is_correct(sol, problem))
    n = pilot_n
    while n < max_n:
        alpha, beta_param = get_beta_binomial_params(n, c)
        # Standard deviation of the posterior Beta(alpha, beta) distribution.
        posterior_std = math.sqrt(
            (alpha * beta_param) / ((alpha + beta_param) ** 2 * (alpha + beta_param + 1))
        )
        if posterior_std < tolerance:
            break
        # Sample one more solution.
        n += 1
        solution = generator_func(problem)
        if is_correct(solution, problem):
            c += 1
    return n, c
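
# A rough budget check (our addition): the posterior std is approximately
# sqrt(p * (1 - p) / (n + 3)), so reaching a tolerance t needs roughly
# n ≈ p * (1 - p) / t ** 2 samples. For p near 0.5 and tolerance = 0.01 that
# is about 2,500 samples, well above max_n = 500, so the loop will usually
# exhaust max_n unless the problem is very easy or very hard.
def samples_needed(p_hat, tolerance):
    """Approximate n required for the posterior std to fall below `tolerance`."""
    return math.ceil(p_hat * (1.0 - p_hat) / tolerance ** 2)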

# ==============================================================================
# 3. Main Evaluation Function and Helper Classes
# ==============================================================================

class Problem:
    def __init__(self, problem_id, prompt, tests):
        self.problem_id = problem_id
        self.prompt = prompt
        self.tests = tests


def is_correct(solution, problem):
    """
    A placeholder for running tests against a generated solution. In a
    real-world scenario, this would involve a sandboxed execution environment.

    Args:
        solution (str): The generated code solution.
        problem (Problem): The problem object containing test cases.

    Returns:
        bool: True if the solution passes all tests, False otherwise.
    """
    # Mock implementation: treat a solution as correct if it is reasonably
    # long and contains a return statement. A real check would execute the
    # problem's tests in a sandbox.
    try:
        return len(solution) > 50 and 'return' in solution
    except Exception:
        return False
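
# A minimal sketch of a concrete check (our addition; NOT a secure sandbox):
# run the solution together with its tests in a subprocess with a timeout.
# It assumes problem.tests is a list of executable assert statements, which
# the mock problems below do not provide.
import subprocess
import sys


def is_correct_subprocess(solution, problem, timeout_s=5.0):
    """Returns True iff the solution plus its tests exits cleanly."""
    program = solution + "\n" + "\n".join(problem.tests)
    try:
        result = subprocess.run(
            [sys.executable, "-c", program],
            capture_output=True,
            timeout=timeout_s,
        )
        return result.returncode == 0
    except subprocess.TimeoutExpired:
        return False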

def evaluate_experiment(problems, generation_func, k_values, dynamic_sampling_enabled=True):
    """
    Executes the evaluation experiment and computes pass@k and pass^k.

    Args:
        problems (list): A list of Problem objects.
        generation_func (function): A function to generate solutions.
        k_values (list): The k values to compute metrics for.
        dynamic_sampling_enabled (bool): Whether to use dynamic sampling or a
            fixed n.

    Returns:
        dict: A dictionary containing the computed pass@k and pass^k results.
    """
    results_per_problem = defaultdict(dict)
    for problem in problems:
        if dynamic_sampling_enabled:
            n, c = dynamic_sampling(problem, generation_func)
        else:
            # With dynamic sampling disabled, use a fixed n of at least 200,
            # and at least max(k_values) so every requested k can be estimated.
            max_k = max(k_values)
            fixed_n = max(max_k, 200)
            solutions = [generation_func(problem) for _ in range(fixed_n)]
            c = sum(1 for sol in solutions if is_correct(sol, problem))
            n = fixed_n
        for k in k_values:
            if k > n:
                print(f"Warning: k={k} > n={n} for problem {problem.problem_id}. Skipping pass@{k}.")
                continue
            pass_at_k = pass_at_k_unbiased_estimator(n, c, k)
            pass_power_k = pass_power_k_estimator(n, c, k)
            results_per_problem[problem.problem_id][f'pass@{k}'] = pass_at_k
            results_per_problem[problem.problem_id][f'pass^{k}'] = pass_power_k
        results_per_problem[problem.problem_id]['n'] = n
        results_per_problem[problem.problem_id]['c'] = c
    # Aggregate: average each metric over the problems that reported it.
    final_metrics = defaultdict(list)
    for problem_id, metrics in results_per_problem.items():
        for metric, value in metrics.items():
            if metric.startswith('pass'):
                final_metrics[metric].append(value)
    average_metrics = {metric: np.mean(values) for metric, values in final_metrics.items()}
    return {
        'problem_level_results': results_per_problem,
        'average_results': average_metrics,
    }
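
# A hedged follow-up (our addition): averages over a handful of problems can
# be noisy, so a percentile bootstrap over the per-problem values gives a
# confidence interval for any averaged metric. Name and defaults are ours.
def bootstrap_confidence_interval(values, n_resamples=10_000, level=0.95, seed=0):
    """Percentile-bootstrap confidence interval for the mean of `values`."""
    rng = np.random.default_rng(seed)
    values = np.asarray(values, dtype=float)
    means = np.array([
        rng.choice(values, size=len(values), replace=True).mean()
        for _ in range(n_resamples)
    ])
    tail = (1.0 - level) / 2.0 * 100.0
    return np.percentile(means, tail), np.percentile(means, 100.0 - tail)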

# ==============================================================================
# 4. Example Usage
# ==============================================================================

if __name__ == '__main__':
    # Define a set of mock problems.
    mock_problems = [
        Problem(problem_id='prob_001', prompt='Write a function that adds two numbers.', tests=[]),
        Problem(problem_id='prob_002', prompt='Write a function that reverses a string.', tests=[]),
        # Add more problems as needed.
    ]

    # Define a mock generation function simulating a model that is correct
    # 60% of the time. The "correct" branch is padded so it passes the mock
    # is_correct length check; the "wrong" branch has no return statement,
    # so it always fails.
    def mock_generator(problem):
        if np.random.rand() < 0.6:
            name = problem.prompt.split(' ')[-1].replace('.', '')
            return f"def {name}(...):  # {problem.prompt}\n    return ..."
        else:
            return "def some_wrong_solution(...):\n    pass"

    # Define the k values to evaluate.
    k_values_to_test = [1, 10, 50, 100]

    # Run the experiment with dynamic sampling.
    print("Running experiment with dynamic beta-binomial sampling...")
    dynamic_results = evaluate_experiment(
        problems=mock_problems,
        generation_func=mock_generator,
        k_values=k_values_to_test,
        dynamic_sampling_enabled=True,
    )
    print("\n--- Dynamic Sampling Results ---")
    print("Average Metrics:", dynamic_results['average_results'])

    # Optional: print detailed results per problem.
    # for p_id, res in dynamic_results['problem_level_results'].items():
    #     print(f"Problem {p_id}: n={res['n']}, c={res['c']}, Results: {res}")

    # Run the experiment with a fixed n.
    print("\nRunning experiment with fixed n=200...")
    fixed_results = evaluate_experiment(
        problems=mock_problems,
        generation_func=mock_generator,
        k_values=k_values_to_test,
        dynamic_sampling_enabled=False,
    )
    print("\n--- Fixed Sampling Results (n=200) ---")
    print("Average Metrics:", fixed_results['average_results'])

    # Optional: print detailed results per problem.
    # for p_id, res in fixed_results['problem_level_results'].items():
    #     print(f"Problem {p_id}: n={res['n']}, c={res['c']}, Results: {res}")