Efficient and reliable sampling methodologies for pass@k and pass^k evaluation
import math
from collections import defaultdict

import numpy as np
from scipy.stats import beta


# ==============================================================================
# 1. Metric Estimation Functions
# ==============================================================================
def pass_at_k_unbiased_estimator(n, c, k):
    """
    Computes the unbiased estimator for pass@k:

        pass@k = 1 - C(n - c, k) / C(n, k)

    Args:
        n (int): Total number of solutions generated.
        c (int): Number of correct solutions among n.
        k (int): The k in pass@k.

    Returns:
        float: The pass@k estimate.
    """
    if n < k:
        raise ValueError("n must be greater than or equal to k")
    if n - c < k:
        # Trivial case: there are fewer than k failures, so every size-k
        # subset contains at least one correct solution.
        return 1.0
    # The estimator is 1 - P(all k sampled solutions fail), where the
    # probability of all k failing is C(n - c, k) / C(n, k). The ratio is
    # computed in log space to avoid overflow for large n.
    log_comb_n_k = math.log(math.comb(n, k))
    log_comb_n_minus_c_k = math.log(math.comb(n - c, k))
    return 1.0 - math.exp(log_comb_n_minus_c_k - log_comb_n_k)
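
# Hand-checkable sanity check (our addition, not part of the original gist):
# with n = 10 samples, c = 3 correct, and k = 5, the all-fail probability is
# C(7, 5) / C(10, 5) = 21 / 252, so pass@5 = 1 - 21/252 ≈ 0.9167.
assert abs(pass_at_k_unbiased_estimator(10, 3, 5) - (1.0 - 21.0 / 252.0)) < 1e-12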

def pass_power_k_estimator(n, c, k):
    """
    Computes the pass^k estimator: the probability that k independent
    attempts all succeed, estimated with the plug-in value (c / n) ** k.

    Args:
        n (int): Total number of solutions generated.
        c (int): Number of correct solutions among n.
        k (int): The k in pass^k.

    Returns:
        float: The pass^k estimate.
    """
    if n == 0:
        return 0.0
    # The estimated probability of a single success is c / n; for k
    # independent trials, the probability of all succeeding is (c / n) ** k.
    # Note that this plug-in estimate is biased upward for small n (Jensen's
    # inequality, since x ** k is convex).
    p = c / n
    return p ** k
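
# A sketch of an unbiased alternative (our addition, not used by the pipeline
# below): since c ~ Binomial(n, p) implies E[C(c, k)] = C(n, k) * p ** k, the
# ratio C(c, k) / C(n, k) is an unbiased estimator of p ** k.
def pass_power_k_unbiased_estimator(n, c, k):
    """Unbiased estimator of pass^k; returns 0.0 when c < k."""
    if n < k:
        raise ValueError("n must be greater than or equal to k")
    if c < k:
        return 0.0
    return math.comb(c, k) / math.comb(n, k)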

# ==============================================================================
# 2. Beta-Binomial Dynamic Sampling
# ==============================================================================

def get_beta_binomial_params(n, c):
    """
    Computes the parameters (alpha, beta) of the posterior Beta distribution
    over the problem's success probability, given n trials and c successes.
    We use a uniform Beta(1, 1) prior.

    Args:
        n (int): Number of trials.
        c (int): Number of successes.

    Returns:
        tuple: A tuple (alpha, beta) for the posterior Beta distribution.
    """
    return c + 1, n - c + 1
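
# A small helper sketch (our addition) that puts the scipy.stats.beta import
# to use: an equal-tailed credible interval for the success probability under
# the same Beta(1, 1) prior. The function name and 95% default are assumptions.
def posterior_credible_interval(n, c, level=0.95):
    """Returns the (lower, upper) equal-tailed credible interval for p."""
    alpha, beta_param = get_beta_binomial_params(n, c)
    tail = (1.0 - level) / 2.0
    return beta.ppf(tail, alpha, beta_param), beta.ppf(1.0 - tail, alpha, beta_param)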

def dynamic_sampling(problem, generator_func, max_n=500, tolerance=0.01):
    """
    Dynamically samples solutions for a single problem using a beta-binomial
    model. The process stops once the posterior standard deviation of the
    underlying success probability falls below `tolerance`, or once max_n
    samples have been drawn.

    This follows the unbiased-estimation approach of Kulal et al. (2019)
    combined with a Bayesian stopping rule.

    Args:
        problem (Problem): The problem to sample solutions for.
        generator_func (function): A function that takes a problem and returns
            a generated solution.
        max_n (int): The maximum number of samples to generate.
        tolerance (float): The target standard deviation of the posterior.
            A smaller value means more samples and higher confidence.

    Returns:
        tuple: A tuple (n, c), where n is the total number of samples and
            c is the number of correct ones.
    """
    c = 0
    # First, run a small pilot study to get an initial estimate.
    pilot_n = min(10, max_n)
    solutions = [generator_func(problem) for _ in range(pilot_n)]
    c += sum(1 for sol in solutions if is_correct(sol, problem))
    n = pilot_n
    while n < max_n:
        alpha, beta_param = get_beta_binomial_params(n, c)
        # Standard deviation of the posterior Beta(alpha, beta) distribution.
        posterior_std = math.sqrt(
            (alpha * beta_param) / ((alpha + beta_param) ** 2 * (alpha + beta_param + 1))
        )
        if posterior_std < tolerance:
            break
        # Sample one more solution.
        n += 1
        solution = generator_func(problem)
        if is_correct(solution, problem):
            c += 1
    return n, c
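
# A rough budget check (our addition): the posterior std is approximately
# sqrt(p * (1 - p) / (n + 3)), so reaching a tolerance t needs roughly
# n ≈ p * (1 - p) / t ** 2 samples. For p near 0.5 and tolerance = 0.01 that
# is about 2,500 samples, well above max_n = 500, so the loop will usually
# exhaust max_n unless the problem is very easy or very hard.
def samples_needed(p_hat, tolerance):
    """Approximate n required for the posterior std to fall below `tolerance`."""
    return math.ceil(p_hat * (1.0 - p_hat) / tolerance ** 2)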

# ==============================================================================
# 3. Main Evaluation Function and Helper Classes
# ==============================================================================

class Problem:
    def __init__(self, problem_id, prompt, tests):
        self.problem_id = problem_id
        self.prompt = prompt
        self.tests = tests


def is_correct(solution, problem):
    """
    A placeholder for running tests against a generated solution. In a
    real-world scenario, this would involve a sandboxed execution environment.

    Args:
        solution (str): The generated code solution.
        problem (Problem): The problem object containing test cases.

    Returns:
        bool: True if the solution passes all tests, False otherwise.
    """
    # Mock implementation: treat a solution as correct if it is reasonably
    # long and contains a return statement. A real check would execute the
    # problem's tests in a sandbox.
    try:
        return len(solution) > 50 and 'return' in solution
    except Exception:
        return False
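
# A minimal sketch of a concrete check (our addition; NOT a secure sandbox):
# run the solution together with its tests in a subprocess with a timeout.
# It assumes problem.tests is a list of executable assert statements, which
# the mock problems below do not provide.
import subprocess
import sys


def is_correct_subprocess(solution, problem, timeout_s=5.0):
    """Returns True iff the solution plus its tests exits cleanly."""
    program = solution + "\n" + "\n".join(problem.tests)
    try:
        result = subprocess.run(
            [sys.executable, "-c", program],
            capture_output=True,
            timeout=timeout_s,
        )
        return result.returncode == 0
    except subprocess.TimeoutExpired:
        return False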

def evaluate_experiment(problems, generation_func, k_values, dynamic_sampling_enabled=True):
    """
    Executes the evaluation experiment and computes pass@k and pass^k.

    Args:
        problems (list): A list of Problem objects.
        generation_func (function): A function to generate solutions.
        k_values (list): The k values to compute metrics for.
        dynamic_sampling_enabled (bool): Whether to use dynamic sampling or a
            fixed n.

    Returns:
        dict: A dictionary containing the computed pass@k and pass^k results.
    """
    results_per_problem = defaultdict(dict)
    for problem in problems:
        if dynamic_sampling_enabled:
            n, c = dynamic_sampling(problem, generation_func)
        else:
            # With dynamic sampling disabled, use a fixed n of at least 200,
            # and at least max(k_values) so every requested k can be estimated.
            max_k = max(k_values)
            fixed_n = max(max_k, 200)
            solutions = [generation_func(problem) for _ in range(fixed_n)]
            c = sum(1 for sol in solutions if is_correct(sol, problem))
            n = fixed_n
        for k in k_values:
            if k > n:
                print(f"Warning: k={k} > n={n} for problem {problem.problem_id}. Skipping pass@{k}.")
                continue
            pass_at_k = pass_at_k_unbiased_estimator(n, c, k)
            pass_power_k = pass_power_k_estimator(n, c, k)
            results_per_problem[problem.problem_id][f'pass@{k}'] = pass_at_k
            results_per_problem[problem.problem_id][f'pass^{k}'] = pass_power_k
        results_per_problem[problem.problem_id]['n'] = n
        results_per_problem[problem.problem_id]['c'] = c
    # Aggregate: average each metric over the problems that reported it.
    final_metrics = defaultdict(list)
    for problem_id, metrics in results_per_problem.items():
        for metric, value in metrics.items():
            if metric.startswith('pass'):
                final_metrics[metric].append(value)
    average_metrics = {metric: np.mean(values) for metric, values in final_metrics.items()}
    return {
        'problem_level_results': results_per_problem,
        'average_results': average_metrics,
    }
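
# A hedged follow-up (our addition): averages over a handful of problems can
# be noisy, so a percentile bootstrap over the per-problem values gives a
# confidence interval for any averaged metric. Name and defaults are ours.
def bootstrap_confidence_interval(values, n_resamples=10_000, level=0.95, seed=0):
    """Percentile-bootstrap confidence interval for the mean of `values`."""
    rng = np.random.default_rng(seed)
    values = np.asarray(values, dtype=float)
    means = np.array([
        rng.choice(values, size=len(values), replace=True).mean()
        for _ in range(n_resamples)
    ])
    tail = (1.0 - level) / 2.0 * 100.0
    return np.percentile(means, tail), np.percentile(means, 100.0 - tail)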

# ==============================================================================
# 4. Example Usage
# ==============================================================================

if __name__ == '__main__':
    # Define a set of mock problems.
    mock_problems = [
        Problem(problem_id='prob_001', prompt='Write a function that adds two numbers.', tests=[]),
        Problem(problem_id='prob_002', prompt='Write a function that reverses a string.', tests=[]),
        # Add more problems as needed.
    ]

    # Define a mock generation function simulating a model that is correct
    # 60% of the time. The "correct" branch is padded so it passes the mock
    # is_correct length check; the "wrong" branch has no return statement,
    # so it always fails.
    def mock_generator(problem):
        if np.random.rand() < 0.6:
            name = problem.prompt.split(' ')[-1].replace('.', '')
            return f"def {name}(...):  # {problem.prompt}\n    return ..."
        else:
            return "def some_wrong_solution(...):\n    pass"

    # Define the k values to evaluate.
    k_values_to_test = [1, 10, 50, 100]

    # Run the experiment with dynamic sampling.
    print("Running experiment with dynamic beta-binomial sampling...")
    dynamic_results = evaluate_experiment(
        problems=mock_problems,
        generation_func=mock_generator,
        k_values=k_values_to_test,
        dynamic_sampling_enabled=True,
    )
    print("\n--- Dynamic Sampling Results ---")
    print("Average Metrics:", dynamic_results['average_results'])

    # Optional: print detailed results per problem.
    # for p_id, res in dynamic_results['problem_level_results'].items():
    #     print(f"Problem {p_id}: n={res['n']}, c={res['c']}, Results: {res}")

    # Run the experiment with a fixed n.
    print("\nRunning experiment with fixed n=200...")
    fixed_results = evaluate_experiment(
        problems=mock_problems,
        generation_func=mock_generator,
        k_values=k_values_to_test,
        dynamic_sampling_enabled=False,
    )
    print("\n--- Fixed Sampling Results (n=200) ---")
    print("Average Metrics:", fixed_results['average_results'])

    # Optional: print detailed results per problem.
    # for p_id, res in fixed_results['problem_level_results'].items():
    #     print(f"Problem {p_id}: n={res['n']}, c={res['c']}, Results: {res}")