diff --git a/ragchecker/cli.py b/ragchecker/cli.py
index 6a1e5f6..dd4f08e 100644
--- a/ragchecker/cli.py
+++ b/ragchecker/cli.py
@@ -17,6 +17,15 @@ def get_args():
         "--output_path", type=str, required=True,
         help="Output path to the result json file."
     )
+    parser.add_argument(
+        "--ensure_ascii", type=bool, default=True,
+        help="Whether to ensure ascii characters in output."
+        " (Set this to False if you are processing non-English content)"
+    )
+    parser.add_argument(
+        "--indent", type=int, default=2,
+        help="Set JSON output indent."
+    )
     parser.add_argument(
         '--extractor_name', type=str, default="bedrock/meta.llama3-70b-instruct-v1:0",
         help="Model used for extracting claims. Default: bedrock/meta.llama3-70b-instruct-v1:0"
@@ -45,7 +54,6 @@ def get_args():
         "--batch_size_checker", type=int, default=32,
         help="Batch size for checker."
     )
-
     # checking options
     parser.add_argument(
         '--metrics', type=str, nargs='+', default=[all_metrics],
@@ -83,10 +91,10 @@ def main():
     )
     with open(args.input_path, "r") as f:
         rag_results = RAGResults.from_json(f.read())
-    evaluator.evaluate(rag_results, metrics=args.metrics, save_path=args.output_path)
-    print(json.dumps(rag_results.metrics, indent=2))
+    evaluator.evaluate(rag_results, metrics=args.metrics, save_path=args.output_path, ensure_ascii=args.ensure_ascii)
+    print(json.dumps(rag_results.metrics, indent=args.indent, ensure_ascii=args.ensure_ascii))
     with open(args.output_path, "w") as f:
-        f.write(rag_results.to_json(indent=2))
+        f.write(rag_results.to_json(indent=args.indent, ensure_ascii=args.ensure_ascii))
 
 
 if __name__ == "__main__":
diff --git a/ragchecker/evaluator.py b/ragchecker/evaluator.py
index 53a3e9f..7ce46c3 100644
--- a/ragchecker/evaluator.py
+++ b/ragchecker/evaluator.py
@@ -63,15 +63,15 @@ def __init__(
         self.joint_check = joint_check
         self.joint_check_num = joint_check_num
         self.kwargs = kwargs
-        
+
         self.sagemaker_client = sagemaker_client
         self.sagemaker_params = sagemaker_params
         self.sagemaker_get_response_func = sagemaker_get_response_func
-        
+
         self.custom_llm_api_func = custom_llm_api_func
-        
+
         self.extractor = LLMExtractor(
-            model=extractor_name, 
+            model=extractor_name,
             batch_size=batch_size_extractor,
             api_base=extractor_api_base
         )
@@ -81,11 +81,11 @@ def __init__(
             self.checker = AlignScoreChecker(batch_size=batch_size_checker)
         else:
             self.checker = LLMChecker(
-                model=checker_name, 
+                model=checker_name,
                 batch_size=batch_size_checker,
                 api_base=checker_api_base
             )
-    
+
     def extract_claims(self, results: List[RAGResult], extract_type="gt_answer"):
         """
         Extract claims from the response and ground truth answer.
@@ -99,7 +99,7 @@ def extract_claims(self, results: List[RAGResult], extract_type="gt_answer"):
         """
         assert extract_type in ["gt_answer", "response"], \
             "extract_type should be either 'gt_answer' or 'response'."
-        
+
         if extract_type == "gt_answer":
             results = [ret for ret in results if ret.gt_answer_claims is None]
             texts = [result.gt_answer for result in results]
@@ -109,7 +109,7 @@ def extract_claims(self, results: List[RAGResult], extract_type="gt_answer"):
         if not results:
             return
         questions = [result.query for result in results]
-        
+
         logger.info(f"Extracting claims for {extract_type} of {len(results)} RAG results.")
         extraction_results = self.extractor.extract(
             batch_responses=texts,
@@ -194,8 +194,8 @@ def check_claims(self, results: RAGResults, check_type="answer2response"):
                 result.retrieved2answer = checking_results[i]
             else:
                 result.retrieved2response = checking_results[i]
-    
-    def evaluate(self, results: RAGResults, metrics=all_metrics, save_path=None):
+
+    def evaluate(self, results: RAGResults, metrics=all_metrics, save_path=None, indent=2, ensure_ascii=True):
         """
         Evaluate the RAG results.
 
@@ -207,7 +207,7 @@ def evaluate(self, results: RAGResults, metrics=all_metrics, save_path=None):
             List of metrics to compute. Default: 'all'.
         save_path : str, optional
             Path to save the results. Default: None. Will perform progress checkpointing if provided.
-        """ 
+        """
         # identify the metrics and required intermediate results
         if isinstance(metrics, str):
             metrics = [metrics]
@@ -222,19 +222,19 @@ def evaluate(self, results: RAGResults, metrics=all_metrics, save_path=None):
                 ret_metrics.add(metric)
         for metric in ret_metrics:
             requirements.update(METRIC_REQUIREMENTS[metric])
-        
+
         # compute the required intermediate results
         for requirement in requirements:
             self.check_claims(results, check_type=requirement)
             if save_path is not None:
                 with open(save_path, "w") as f:
-                    f.write(results.to_json(indent=2))
+                    f.write(results.to_json(indent=indent, ensure_ascii=ensure_ascii))
 
         # compute the metrics
         for metric in ret_metrics:
             for result in results.results:
                 METRIC_FUNC_MAP[metric](result)
-        
+
         # aggregate the metrics
         for group, group_metrics in METRIC_GROUP_MAP.items():
             if group == all_metrics:
@@ -244,10 +244,10 @@ def evaluate(self, results: RAGResults, metrics=all_metrics, save_path=None):
                 results.metrics[group][metric] = round(np.mean(
                     [result.metrics[metric] for result in results.results]
                 ) * 100, 1)
-        
-        # save the results 
+
+        # save the results
         if save_path is not None:
             with open(save_path, "w") as f:
-                f.write(results.to_json(indent=2))
+                f.write(results.to_json(indent=indent, ensure_ascii=ensure_ascii))
 
         return results.metrics
\ No newline at end of file
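Note on the new --ensure_ascii option: argparse applies the callable given as type to the raw command-line string, and bool("False") evaluates to True (only the empty string is falsy), so --ensure_ascii False would still parse as True. Below is a minimal sketch of a string-to-bool converter that makes the flag behave the way the help text describes; the helper name str2bool is illustrative and is not part of the patch above.

    import argparse

    def str2bool(value: str) -> bool:
        # Illustrative helper, not part of the patch: map common textual
        # truth values to bool instead of relying on type=bool.
        if value.lower() in ("true", "yes", "1"):
            return True
        if value.lower() in ("false", "no", "0"):
            return False
        raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--ensure_ascii", type=str2bool, default=True,
        help="Whether to ensure ascii characters in output."
        " (Set this to False if you are processing non-English content)"
    )

    # "--ensure_ascii False" now parses to False as the help text intends.
    print(parser.parse_args(["--ensure_ascii", "False"]).ensure_ascii)  # False

An alternative is a dedicated switch with action="store_false" (for example a --no-ensure-ascii flag), which avoids boolean string parsing entirely.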