[
  {
    "title": "RAG-Star: Enhancing Deliberative Reasoning with Retrieval Augmented Verification and Refinement",
    "authors": [
      "Jinhao Jiang",
      "Jiayi Chen",
      "Junyi Li",
      "Ruiyang Ren",
      "Shijie Wang",
      "Wayne Xin Zhao",
      "Yang Song",
      "Tao Zhang"
    ],
    "year": "2024",
    "journal": "arXiv:2412.12881v1",
    "doi": "",
    "abstract": "Existing large language models (LLMs) show exceptional problem-solving capabilities but might struggle with complex reasoning tasks. Despite the successes of chain-of-thought and tree-based search methods, they mainly depend on the internal knowledge of LLMs to search over intermediate reasoning steps, limited to dealing with simple tasks involving fewer reasoning steps. In this paper, we propose RAG-Star, a novel RAG approach that integrates the retrieved information to guide the tree-based deliberative reasoning process that relies on the inherent knowledge of LLMs. By leveraging Monte Carlo Tree Search, RAG-Star iteratively plans intermediate sub-queries and answers for reasoning based on the LLM itself. To consolidate internal and external knowledge, we propose a retrieval-augmented verification that utilizes query- and answer-aware reward modeling to provide feedback for the inherent reasoning of LLMs. Our experiments involving Llama-3.1-8B-Instruct and GPT-4o demonstrate that RAG-Star significantly outperforms previous RAG and reasoning methods.",
    "source_database": "arxiv",
    "arxiv_id": "2412.12881v1"
  },
  {
    "title": "T-RAG: Lessons from the LLM Trenches",
    "authors": [
      "Masoomali Fatehkia",
      "Ji Kim Lucas",
      "Sanjay Chawla"
    ],
    "year": "2024",
    "journal": "arXiv:2402.07483v2",
    "doi": "",
    "abstract": "Large Language Models (LLM) have shown remarkable language capabilities fueling attempts to integrate them into applications across a wide range of domains. An important application area is question answering over private enterprise documents where the main considerations are data security, which necessitates applications that can be deployed on-prem, limited computational resources and the need for a robust application that correctly responds to queries. Retrieval-Augmented Generation (RAG) has emerged as the most prominent framework for building LLM-based applications. While building a RAG is relatively straightforward, making it robust and a reliable application requires extensive customization and relatively deep knowledge of the application domain. We share our experiences building and deploying an LLM application for question answering over private organizational documents. Our application combines the use of RAG with a finetuned open-source LLM. Additionally, our system, which we call Tree-RAG (T-RAG), uses a tree structure to represent entity hierarchies within the organization. This is used to generate a textual description to augment the context when responding to user queries pertaining to entities within the organization's hierarchy. Our evaluations, including a Needle in a Haystack test, show that this combination performs better than a simple RAG or finetuning implementation. Finally, we share some lessons learned based on our experiences building an LLM application for real-world use.",
    "source_database": "arxiv",
    "arxiv_id": "2402.07483v2"
  },
  {
    "title": "Vendi-RAG: Adaptively Trading-Off Diversity And Quality Significantly Improves Retrieval Augmented Generation With LLMs",
    "authors": [
      "Mohammad Reza Rezaei",
      "Adji Bousso Dieng"
    ],
    "year": "2025",
    "journal": "arXiv:2502.11228v2",
    "doi": "",
    "abstract": "Retrieval-augmented generation (RAG) enhances large language models (LLMs) for domain-specific question-answering (QA) tasks by leveraging external knowledge sources. However, traditional RAG systems primarily focus on relevance-based retrieval and often struggle with redundancy, especially when reasoning requires connecting information from multiple sources. This paper introduces Vendi-RAG, a framework based on an iterative process that jointly optimizes retrieval diversity and answer quality. This joint optimization leads to significantly higher accuracy for multi-hop QA tasks. Vendi-RAG leverages the Vendi Score (VS), a flexible similarity-based diversity metric, to promote semantic diversity in document retrieval. It then uses an LLM judge that evaluates candidate answers, generated after a reasoning step, and outputs a score that the retriever uses to balance relevance and diversity among the retrieved documents during each iteration. Experiments on three challenging datasets -- HotpotQA, MuSiQue, and 2WikiMultiHopQA -- demonstrate Vendi-RAG's effectiveness in multi-hop reasoning tasks. The framework achieves significant accuracy improvements over traditional single-step and multi-step RAG approaches, with accuracy increases reaching up to +4.2% on HotpotQA, +4.1% on 2WikiMultiHopQA, and +1.3% on MuSiQue compared to Adaptive-RAG, the current best baseline. The benefits of Vendi-RAG are even more pronounced as the number of retrieved documents increases. Finally, we evaluated Vendi-RAG across different LLM backbones, including GPT-3.5, GPT-4, and GPT-4o-mini, and observed consistent improvements, demonstrating that the framework's advantages are model-agnostic.",
    "source_database": "arxiv",
    "arxiv_id": "2502.11228v2"
  },
  {
    "title": "MultiHop-RAG: Benchmarking Retrieval-Augmented Generation for Multi-Hop Queries",
    "authors": [
      "Yixuan Tang",
      "Yi Yang"
    ],
    "year": "2024",
    "journal": "arXiv:2401.15391v1",
    "doi": "",
    "abstract": "Retrieval-augmented generation (RAG) augments large language models (LLM) by retrieving relevant knowledge, showing promising potential in mitigating LLM hallucinations and enhancing response quality, thereby facilitating the great adoption of LLMs in practice. However, we find that existing RAG systems are inadequate in answering multi-hop queries, which require retrieving and reasoning over multiple pieces of supporting evidence. Furthermore, to our knowledge, no existing RAG benchmarking dataset focuses on multi-hop queries. In this paper, we develop a novel dataset, MultiHop-RAG, which consists of a knowledge base, a large collection of multi-hop queries, their ground-truth answers, and the associated supporting evidence. We detail the procedure of building the dataset, utilizing an English news article dataset as the underlying RAG knowledge base. We demonstrate the benchmarking utility of MultiHop-RAG in two experiments. The first experiment compares different embedding models for retrieving evidence for multi-hop queries. In the second experiment, we examine the capabilities of various state-of-the-art LLMs, including GPT-4, PaLM, and Llama2-70B, in reasoning and answering multi-hop queries given the evidence. Both experiments reveal that existing RAG methods perform unsatisfactorily in retrieving and answering multi-hop queries. We hope MultiHop-RAG will be a valuable resource for the community in developing effective RAG systems, thereby facilitating greater adoption of LLMs in practice. The MultiHop-RAG and implemented RAG system is publicly available at https://github.com/yixuantt/MultiHop-RAG/.",
    "source_database": "arxiv",
    "arxiv_id": "2401.15391v1"
  },
  {
    "title": "Localising In-Domain Adaptation of Transformer-Based Biomedical Language Models",
    "authors": [
      "Tommaso Mario Buonocore",
      "Claudio Crema",
      "Alberto Redolfi",
      "Riccardo Bellazzi",
      "Enea Parimbelli"
    ],
    "year": "2022",
    "journal": "arXiv:2212.10422v3",
    "doi": "https://doi.org/10.1016/j.jbi.2023.104431",
    "abstract": "In the era of digital healthcare, the huge volumes of textual information generated every day in hospitals constitute an essential but underused asset that could be exploited with task-specific, fine-tuned biomedical language representation models, improving patient care and management. For such specialized domains, previous research has shown that fine-tuning models stemming from broad-coverage checkpoints can largely benefit additional training rounds over large-scale in-domain resources. However, these resources are often unreachable for less-resourced languages like Italian, preventing local medical institutions to employ in-domain adaptation. In order to reduce this gap, our work investigates two accessible approaches to derive biomedical language models in languages other than English, taking Italian as a concrete use-case: one based on neural machine translation of English resources, favoring quantity over quality; the other based on a high-grade, narrow-scoped corpus natively written in Italian, thus preferring quality over quantity. Our study shows that data quantity is a harder constraint than data quality for biomedical adaptation, but the concatenation of high-quality data can improve model performance even when dealing with relatively size-limited corpora. The models published from our investigations have the potential to unlock important research opportunities for Italian hospitals and academia. Finally, the set of lessons learned from the study constitutes valuable insights towards a solution to build biomedical language models that are generalizable to other less-resourced languages and different domain settings.",
    "source_database": "arxiv",
    "arxiv_id": "2212.10422v3"
  },
  {
    "title": "A scoping review on multimodal deep learning in biomedical images and texts",
    "authors": [
      "Zhaoyi Sun",
      "Mingquan Lin",
      "Qingqing Zhu",
      "Qianqian Xie",
      "Fei Wang",
      "Zhiyong Lu",
      "Yifan Peng"
    ],
    "year": "2023",
    "journal": "arXiv:2307.07362v3",
    "doi": "https://doi.org/10.1016/j.jbi.2023.104482",
    "abstract": "Computer-assisted diagnostic and prognostic systems of the future should be capable of simultaneously processing multimodal data. Multimodal deep learning (MDL), which involves the integration of multiple sources of data, such as images and text, has the potential to revolutionize the analysis and interpretation of biomedical data. However, it only caught researchers' attention recently. To this end, there is a critical need to conduct a systematic review on this topic, identify the limitations of current work, and explore future directions. In this scoping review, we aim to provide a comprehensive overview of the current state of the field and identify key concepts, types of studies, and research gaps with a focus on biomedical images and texts joint learning, mainly because these two were the most commonly available data types in MDL research. This study reviewed the current uses of multimodal deep learning on five tasks: (1) Report generation, (2) Visual question answering, (3) Cross-modal retrieval, (4) Computer-aided diagnosis, and (5) Semantic segmentation. Our results highlight the diverse applications and potential of MDL and suggest directions for future research in the field. We hope our review will facilitate the collaboration of natural language processing (NLP) and medical imaging communities and support the next generation of decision-making and computer-assisted diagnostic system development.",
    "source_database": "arxiv",
    "arxiv_id": "2307.07362v3"
  },
  {
    "title": "Is Conformal Factuality for RAG-based LLMs Robust? Novel Metrics and Systematic Insights",
    "authors": [
      "Yi Chen",
      "Daiwei Chen",
      "Sukrut Madhav Chikodikar",
      "Caitlyn Heqi Yin",
      "Ramya Korlakai Vinayak"
    ],
    "year": "2026",
    "journal": "arXiv:2603.16817v1",
    "doi": "",
    "abstract": "Large language models (LLMs) frequently hallucinate, limiting their reliability in knowledge-intensive applications. Retrieval-augmented generation (RAG) and conformal factuality have emerged as potential ways to address this limitation. While RAG aims to ground responses in retrieved evidence, it provides no statistical guarantee that the final output is correct. Conformal factuality filtering offers distribution-free statistical reliability by scoring and filtering atomic claims using a threshold calibrated on held-out data, however, the informativeness of the final output is not guaranteed. We systematically analyze the reliability and usefulness of conformal factuality for RAG-based LLMs across generation, scoring, calibration, robustness, and efficiency. We propose novel informativeness-aware metrics that better reflect task utility under conformal filtering. Across three benchmarks and multiple model families, we find that (i) conformal filtering suffers from low usefulness at high factuality levels due to vacuous outputs, (ii) conformal factuality guarantee is not robust to distribution shifts and distractors, highlighting the limitation that requires calibration data to closely match deployment conditions, and (iii) lightweight entailment-based verifiers match or outperform LLM-based model confidence scorers while requiring over $100\\times$ fewer FLOPs. Overall, our results expose factuality-informativeness trade-offs and fragility of conformal filtering framework under distribution shifts and distractors, highlighting the need for new approaches for reliability with robustness and usefulness as key metrics, and provide actionable guidance for building RAG pipelines that are both reliable and computationally efficient.",
    "source_database": "arxiv",
    "arxiv_id": "2603.16817v1"
  },
  {
    "title": "RAG Makes Guardrails Unsafe? Investigating Robustness of Guardrails under RAG-style Contexts",
    "authors": [
      "Yining She",
      "Daniel W. Peterson",
      "Marianne Menglin Liu",
      "Vikas Upadhyay",
      "Mohammad Hossein Chaghazardi",
      "Eunsuk Kang",
      "Dan Roth"
    ],
    "year": "2025",
    "journal": "arXiv:2510.05310v1",
    "doi": "",
    "abstract": "With the increasing adoption of large language models (LLMs), ensuring the safety of LLM systems has become a pressing concern. External LLM-based guardrail models have emerged as a popular solution to screen unsafe inputs and outputs, but they are themselves fine-tuned or prompt-engineered LLMs that are vulnerable to data distribution shifts. In this paper, taking Retrieval Augmentation Generation (RAG) as a case study, we investigated how robust LLM-based guardrails are against additional information embedded in the context. Through a systematic evaluation of 3 Llama Guards and 2 GPT-oss models, we confirmed that inserting benign documents into the guardrail context alters the judgments of input and output guardrails in around 11% and 8% of cases, making them unreliable. We separately analyzed the effect of each component in the augmented context: retrieved documents, user query, and LLM-generated response. The two mitigation methods we tested only bring minor improvements. These results expose a context-robustness gap in current guardrails and motivate training and evaluation protocols that are robust to retrieval and query composition.",
    "source_database": "arxiv",
    "arxiv_id": "2510.05310v1"
  },
  {
    "title": "UniBiomed: A Universal Foundation Model for Grounded Biomedical Image Interpretation",
    "authors": [
      "Linshan Wu",
      "Yuxiang Nie",
      "Sunan He",
      "Jiaxin Zhuang",
      "Luyang Luo",
      "Tao Li",
      "Zhuoyao Xie",
      "Dexuan Chen",
      "Yinghua Zhao",
      "Neeraj Mahboobani",
      "Varut Vardhanabhuti",
      "Ronald Cheong Kin Chan",
      "Yifan Peng",
      "Pranav Rajpurkar",
      "Hao Chen"
    ],
    "year": "2025",
    "journal": "arXiv:2504.21336v3",
    "doi": "",
    "abstract": "The integration of AI-assisted biomedical image analysis into clinical practice demands AI-generated findings that are not only accurate but also interpretable to clinicians. However, existing biomedical AI models generally lack the ability to simultaneously generate diagnostic findings and localize corresponding biomedical objects. This limitation makes it challenging for clinicians to correlate AI-generated findings with visual evidence (e.g., tiny lesions) in images and interpret the results of AI models. To address this challenge, we introduce UniBiomed, the first universal foundation model for grounded biomedical image interpretation, which is capable of generating accurate diagnostic findings and simultaneously segmenting the corresponding biomedical targets. UniBiomed is based on a novel integration of Multi-modal Large Language Model and Segment Anything Model, which can effectively unify diverse biomedical tasks in universal training for advancing grounded interpretation. To develop UniBiomed, we curate a large-scale dataset comprising over 27 million triplets of images, region annotations, and text descriptions across ten biomedical imaging modalities. Extensive validation on 70 internal and 14 external datasets demonstrated the state-of-the-art performance of UniBiomed in diverse biomedical tasks, including image segmentation, disease recognition, region-aware diagnosis, vision question answering, and report generation. In summary, UniBiomed is a powerful and versatile biomedical foundation model, unlocking the untapped grounded interpretation capability for optimizing AI-assisted biomedical image analysis.",
    "source_database": "arxiv",
    "arxiv_id": "2504.21336v3"
  },
  {
    "title": "Collab-RAG: Boosting Retrieval-Augmented Generation for Complex Question Answering via White-Box and Black-Box LLM Collaboration",
    "authors": [
      "Ran Xu",
      "Wenqi Shi",
      "Yuchen Zhuang",
      "Yue Yu",
      "Joyce C. Ho",
      "Haoyu Wang",
      "Carl Yang"
    ],
    "year": "2025",
    "journal": "arXiv:2504.04915v1",
    "doi": "",
    "abstract": "Retrieval-Augmented Generation (RAG) systems often struggle to handle multi-hop question-answering tasks accurately due to irrelevant context retrieval and limited complex reasoning capabilities. We introduce Collab-RAG, a collaborative training framework that leverages mutual enhancement between a white-box small language model (SLM) and a blackbox large language model (LLM) for RAG. Specifically, the SLM decomposes complex queries into simpler sub-questions, thus enhancing the accuracy of the retrieval and facilitating more effective reasoning by the black-box LLM. Concurrently, the black-box LLM provides feedback signals to improve the SLM's decomposition capability. We observe that Collab-RAG relies solely on supervision from an affordable black-box LLM without additional distillation from frontier LLMs, yet demonstrates strong generalization across multiple black-box LLMs. Experimental evaluations across five multi-hop QA datasets demonstrate that Collab-RAG substantially outperforms existing black-box-only and SLM fine-tuning baselines by 1.8%-14.2% on average. In particular, our fine-tuned 3B SLM surpasses a frozen 32B LLM in question decomposition, highlighting the efficiency of Collab-RAG in improving reasoning and retrieval for complex questions. The code of Collab-RAG is available on https://github.com/ritaranx/Collab-RAG/.",
    "source_database": "arxiv",
    "arxiv_id": "2504.04915v1"
  },
  {
    "title": "PlainQAFact: Retrieval-augmented Factual Consistency Evaluation Metric for Biomedical Plain Language Summarization",
    "authors": [
      "Zhiwen You",
      "Yue Guo"
    ],
    "year": "2025",
    "journal": "arXiv:2503.08890v4",
    "doi": "https://doi.org/10.1016/j.jbi.2026.105019",
    "abstract": "Hallucinated outputs from large language models (LLMs) pose risks in the medical domain, especially for lay audiences making health-related decisions. Existing automatic factual consistency evaluation methods, such as entailment- and question-answering (QA) -based, struggle with plain language summarization (PLS) due to elaborative explanation phenomenon, which introduces external content (e.g., definitions, background, examples) absent from the scientific abstract to enhance comprehension. To address this, we introduce PlainQAFact, an automatic factual consistency evaluation metric trained on a fine-grained, human-annotated dataset PlainFact, for evaluating factual consistency of both source-simplified and elaborately explained sentences. PlainQAFact first classifies sentence type, then applies a retrieval-augmented QA scoring method. Empirical results show that existing evaluation metrics fail to evaluate the factual consistency in PLS, especially for elaborative explanations, whereas PlainQAFact consistently outperforms them across all evaluation settings. We further analyze PlainQAFact's effectiveness across external knowledge sources, answer extraction strategies, answer overlap measures, and document granularity levels, refining its overall factual consistency assessment. Taken together, our work presents a sentence-aware, retrieval-augmented metric targeted at elaborative explanations in biomedical PLS tasks, providing the community with both a new benchmark and a practical evaluation tool to advance reliable and safe plain language communication in the medical domain. PlainQAFact and PlainFact are available at: https://github.com/zhiwenyou103/PlainQAFact",
    "source_database": "arxiv",
    "arxiv_id": "2503.08890v4"
  },
  {
    "title": "RAG-Gym: Systematic Optimization of Language Agents for Retrieval-Augmented Generation",
    "authors": [
      "Guangzhi Xiong",
      "Qiao Jin",
      "Xiao Wang",
      "Yin Fang",
      "Haolin Liu",
      "Yifan Yang",
      "Fangyuan Chen",
      "Zhixing Song",
      "Dengyu Wang",
      "Minjia Zhang",
      "Zhiyong Lu",
      "Aidong Zhang"
    ],
    "year": "2025",
    "journal": "arXiv:2502.13957v2",
    "doi": "",
    "abstract": "Retrieval-augmented generation (RAG) has shown great promise for knowledge-intensive tasks and recently advanced with agentic RAG, where language agents engage in multi-round interactions with external knowledge sources for adaptive information retrieval. However, existing agentic RAG methods often depend on ad-hoc prompt engineering and lack a unified optimization framework. We introduce RAG-Gym, a comprehensive platform that systematically explores three optimization dimensions: (1) prompt engineering, (2) actor tuning, and (3) critic training. For prompt engineering, we propose Re$^2$Search, a novel agent incorporating reasoning reflection that significantly outperforms standard prompts. In actor tuning, we evaluate three popular post-training algorithms with fine-grained process supervision and identify direct preference optimization as the most effective. We further demonstrate that a trained critic can enhance inference by selecting higher-quality intermediate reasoning steps. Together, these findings lead to the optimized Re$^2$Search++ agent, which surpasses most recent methods like Search-R1 by a relative increase of 3.2% to 11.6% in average F1. Finally, we examine the impact of different reward sources and analyze scaling properties in training and inference, offering practical insights for agentic RAG optimization. The project homepage is available at https://rag-gym.github.io.",
    "source_database": "arxiv",
    "arxiv_id": "2502.13957v2"
  },
  {
    "title": "Biolink Model: A Universal Schema for Knowledge Graphs in Clinical, Biomedical, and Translational Science",
    "authors": [
      "Deepak R. Unni",
      "Sierra A. T. Moxon",
      "Michael Bada",
      "Matthew Brush",
      "Richard Bruskiewich",
      "Paul Clemons",
      "Vlado Dancik",
      "Michel Dumontier",
      "Karamarie Fecho",
      "Gustavo Glusman",
      "Jennifer J. Hadlock",
      "Nomi L. Harris",
      "Arpita Joshi",
      "Tim Putman",
      "Guangrong Qin",
      "Stephen A. Ramsey",
      "Kent A. Shefchek",
      "Harold Solbrig",
      "Karthik Soman",
      "Anne T. Thessen",
      "Melissa A. Haendel",
      "Chris Bizon",
      "Christopher J. Mungall",
      "the Biomedical Data Translator Consortium"
    ],
    "year": "2022",
    "journal": "arXiv:2203.13906v1",
    "doi": "https://doi.org/10.1111/cts.13302",
    "abstract": "Within clinical, biomedical, and translational science, an increasing number of projects are adopting graphs for knowledge representation. Graph-based data models elucidate the interconnectedness between core biomedical concepts, enable data structures to be easily updated, and support intuitive queries, visualizations, and inference algorithms. However, knowledge discovery across these \"knowledge graphs\" (KGs) has remained difficult. Data set heterogeneity and complexity; the proliferation of ad hoc data formats; poor compliance with guidelines on findability, accessibility, interoperability, and reusability; and, in particular, the lack of a universally-accepted, open-access model for standardization across biomedical KGs has left the task of reconciling data sources to downstream consumers. Biolink Model is an open source data model that can be used to formalize the relationships between data structures in translational science. It incorporates object-oriented classification and graph-oriented features. The core of the model is a set of hierarchical, interconnected classes (or categories) and relationships between them (or predicates), representing biomedical entities such as gene, disease, chemical, anatomical structure, and phenotype. The model provides class and edge attributes and associations that guide how entities should relate to one another. Here, we highlight the need for a standardized data model for KGs, describe Biolink Model, and compare it with other models. We demonstrate the utility of Biolink Model in various initiatives, including the Biomedical Data Translator Consortium and the Monarch Initiative, and show how it has supported easier integration and interoperability of biomedical KGs, bringing together knowledge from multiple sources and helping to realize the goals of translational science.",
    "source_database": "arxiv",
    "arxiv_id": "2203.13906v1"
  },
  {
    "title": "Why LLM Safety Guardrails Collapse After Fine-tuning: A Similarity Analysis Between Alignment and Fine-tuning Datasets",
    "authors": [
      "Lei Hsiung",
      "Tianyu Pang",
      "Yung-Chen Tang",
      "Linyue Song",
      "Tsung-Yi Ho",
      "Pin-Yu Chen",
      "Yaoqing Yang"
    ],
    "year": "2025",
    "journal": "arXiv:2506.05346v1",
    "doi": "",
    "abstract": "Recent advancements in large language models (LLMs) have underscored their vulnerability to safety alignment jailbreaks, particularly when subjected to downstream fine-tuning. However, existing mitigation strategies primarily focus on reactively addressing jailbreak incidents after safety guardrails have been compromised, removing harmful gradients during fine-tuning, or continuously reinforcing safety alignment throughout fine-tuning. As such, they tend to overlook a critical upstream factor: the role of the original safety-alignment data. This paper therefore investigates the degradation of safety guardrails through the lens of representation similarity between upstream alignment datasets and downstream fine-tuning tasks. Our experiments demonstrate that high similarity between these datasets significantly weakens safety guardrails, making models more susceptible to jailbreaks. Conversely, low similarity between these two types of datasets yields substantially more robust models and thus reduces harmfulness score by up to 10.33%. By highlighting the importance of upstream dataset design in the building of durable safety guardrails and reducing real-world vulnerability to jailbreak attacks, these findings offer actionable insights for fine-tuning service providers.",
    "source_database": "arxiv",
    "arxiv_id": "2506.05346v1"
  },
  {
    "title": "Long-form factuality in large language models",
    "authors": [
      "Jerry Wei",
      "Chengrun Yang",
      "Xinying Song",
      "Yifeng Lu",
      "Nathan Hu",
      "Jie Huang",
      "Dustin Tran",
      "Daiyi Peng",
      "Ruibo Liu",
      "Da Huang",
      "Cosmo Du",
      "Quoc V. Le"
    ],
    "year": "2024",
    "journal": "arXiv:2403.18802v4",
    "doi": "",
    "abstract": "Large language models (LLMs) often generate content that contains factual errors when responding to fact-seeking prompts on open-ended topics. To benchmark a model's long-form factuality in open domains, we first use GPT-4 to generate LongFact, a prompt set comprising thousands of questions spanning 38 topics. We then propose that LLM agents can be used as automated evaluators for long-form factuality through a method which we call Search-Augmented Factuality Evaluator (SAFE). SAFE utilizes an LLM to break down a long-form response into a set of individual facts and to evaluate the accuracy of each fact using a multi-step reasoning process comprising sending search queries to Google Search and determining whether a fact is supported by the search results. Furthermore, we propose extending F1 score as an aggregated metric for long-form factuality. To do so, we balance the percentage of supported facts in a response (precision) with the percentage of provided facts relative to a hyperparameter representing a user's preferred response length (recall).   Empirically, we demonstrate that LLM agents can outperform crowdsourced human annotators - on a set of ~16k individual facts, SAFE agrees with crowdsourced human annotators 72% of the time, and on a random subset of 100 disagreement cases, SAFE wins 76% of the time. At the same time, SAFE is more than 20 times cheaper than human annotators. We also benchmark thirteen language models on LongFact across four model families (Gemini, GPT, Claude, and PaLM-2), finding that larger language models generally achieve better long-form factuality. LongFact, SAFE, and all experimental code are available at https://github.com/google-deepmind/long-form-factuality.",
    "source_database": "arxiv",
    "arxiv_id": "2403.18802v4"
  },
  {
    "title": "GROUNDEDKG-RAG: Grounded Knowledge Graph Index for Long-document Question Answering",
    "authors": [
      "Tianyi Zhang",
      "Andreas Marfurt"
    ],
    "year": "2026",
    "journal": "arXiv:2604.04359v1",
    "doi": "",
    "abstract": "Retrieval-augmented generation (RAG) systems have been widely adopted in contemporary large language models (LLMs) due to their ability to improve generation quality while reducing the required input context length. In this work, we focus on RAG systems for long-document question answering. Current approaches suffer from a heavy reliance on LLM descriptions resulting in high resource consumption and latency, repetitive content across hierarchical levels, and hallucinations due to no or limited grounding in the source text. To improve both efficiency and factual accuracy through grounding, we propose GroundedKG-RAG, a RAG system in which the knowledge graph is explicitly extracted from and grounded in the source document. Specifically, we define nodes in GroundedKG as entities and actions, and edges as temporal or semantic relations, with each node and edge grounded in the original sentences. We construct GroundedKG from semantic role labeling (SRL) and abstract meaning representation (AMR) parses and then embed it for retrieval. During querying, we apply the same transformation to the query and retrieve the most relevant sentences from the grounded source text for question answering. We evaluate GroundedKG-RAG on examples from the NarrativeQA dataset and find that it performs on par with a state-of-the art proprietary long-context model at smaller cost and outperforms a competitive baseline. Additionally, our GroundedKG is interpretable and readable by humans, facilitating auditing of results and error analysis.",
    "source_database": "arxiv",
    "arxiv_id": "2604.04359v1"
  },
  {
    "title": "How Does Response Length Affect Long-Form Factuality",
    "authors": [
      "James Xu Zhao",
      "Jimmy Z. J. Liu",
      "Bryan Hooi",
      "See-Kiong Ng"
    ],
    "year": "2025",
    "journal": "arXiv:2505.23295v1",
    "doi": "",
    "abstract": "Large language models (LLMs) are widely used for long-form text generation. However, factual errors in the responses would undermine their reliability. Despite growing attention to LLM factuality, the effect of response length on factuality remains underexplored. In this work, we systematically investigate this relationship by first introducing an automatic and bi-level long-form factuality evaluation framework, which achieves high agreement with human annotations while being cost-effective. Using this framework, we conduct controlled experiments and find that longer responses exhibit lower factual precision, confirming the presence of length bias. To explain this phenomenon, we empirically examine three hypotheses: error propagation, long context, and facts exhaustion. Our results reveal that facts exhaustion, where the model gradually exhausts more reliable knowledge, is the primary cause of factual degradation, rather than the other two hypotheses.",
    "source_database": "arxiv",
    "arxiv_id": "2505.23295v1"
  },
  {
    "title": "Ingest-And-Ground: Dispelling Hallucinations from Continually-Pretrained LLMs with RAG",
    "authors": [
      "Chenhao Fang",
      "Derek Larson",
      "Shitong Zhu",
      "Sophie Zeng",
      "Wendy Summer",
      "Yanqing Peng",
      "Yuriy Hulovatyy",
      "Rajeev Rao",
      "Gabriel Forgues",
      "Arya Pudota",
      "Alex Goncalves",
      "Hervé Robert"
    ],
    "year": "2024",
    "journal": "arXiv:2410.02825v2",
    "doi": "",
    "abstract": "This paper presents new methods that have the potential to improve privacy process efficiency with LLM and RAG. To reduce hallucination, we continually pre-train the base LLM model with a privacy-specific knowledge base and then augment it with a semantic RAG layer. Our evaluations demonstrate that this approach enhances the model performance (as much as doubled metrics compared to out-of-box LLM) in handling privacy-related queries, by grounding responses with factual information which reduces inaccuracies.",
    "source_database": "arxiv",
    "arxiv_id": "2410.02825v2"
  },
  {
    "title": "RAG over Tables: Hierarchical Memory Index, Multi-Stage Retrieval, and Benchmarking",
    "authors": [
      "Jiaru Zou",
      "Dongqi Fu",
      "Sirui Chen",
      "Xinrui He",
      "Zihao Li",
      "Yada Zhu",
      "Jiawei Han",
      "Jingrui He"
    ],
    "year": "2025",
    "journal": "arXiv:2504.01346v4",
    "doi": "",
    "abstract": "Retrieval-Augmented Generation (RAG) enhances Large Language Models (LLMs) by integrating them with an external knowledge base to improve the answer relevance and accuracy. In real-world scenarios, beyond pure text, a substantial amount of knowledge is stored in tables, and user questions often require retrieving answers that are distributed across multiple tables. Retrieving knowledge from a table corpora (i.e., various individual tables) for a question remains nascent, at least, for (i) how to understand intra- and inter-table knowledge effectively, (ii) how to filter unnecessary tables and how to retrieve the most relevant tables efficiently, (iii) how to prompt LLMs to infer over the retrieval, (iv) how to evaluate the corresponding performance in a realistic setting. Facing the above challenges, in this paper, we first propose a table-corpora-aware RAG framework, named T-RAG, which consists of the hierarchical memory index, multi-stage retrieval, and graph-aware prompting for effective and efficient table knowledge retrieval and inference. Further, we first develop a multi-table question answering benchmark named MultiTableQA, which spans 3 different task types, 57,193 tables, and 23,758 questions in total, and the sources are all from real-world scenarios. Based on MultiTableQA, we did the holistic comparison over table retrieval methods, RAG methods, and table-to-graph representation learning methods, where T-RAG shows the leading accuracy, recall, and running time performance. Also, under T-RAG, we evaluate the inference ability upgrade of different LLMs. Code and Data are available at https://github.com/jiaruzouu/T-RAG",
    "source_database": "arxiv",
    "arxiv_id": "2504.01346v4"
  },
  {
    "title": "CaresAI at BioCreative IX Track 1 -- LLM for Biomedical QA",
    "authors": [
      "Reem Abdel-Salam",
      "Mary Adewunmi",
      "Modinat A. Abayomi"
    ],
    "year": "2025",
    "journal": "arXiv:2509.00806v1",
    "doi": "",
    "abstract": "Large language models (LLMs) are increasingly evident for accurate question answering across various domains. However, rigorous evaluation of their performance on complex question-answering (QA) capabilities is essential before deployment in real-world biomedical and healthcare applications. This paper presents our approach to the MedHopQA track of the BioCreative IX shared task, which focuses on multi-hop biomedical question answering involving diseases, genes, and chemicals. We adopt a supervised fine-tuning strategy leveraging LLaMA 3 8B, enhanced with a curated biomedical question-answer dataset compiled from external sources including BioASQ, MedQuAD, and TREC. Three experimental setups are explored: fine-tuning on combined short and long answers, short answers only, and long answers only. While our models demonstrate strong domain understanding, achieving concept-level accuracy scores of up to 0.8, their Exact Match (EM) scores remain significantly lower, particularly in the test phase. We introduce a two-stage inference pipeline for precise short-answer extraction to mitigate verbosity and improve alignment with evaluation metrics. Despite partial improvements, challenges persist in generating strictly formatted outputs. Our findings highlight the gap between semantic understanding and exact answer evaluation in biomedical LLM applications, motivating further research in output control and post-processing strategies.",
    "source_database": "arxiv",
    "arxiv_id": "2509.00806v1"
  },
  {
    "title": "P-RAG: Prompt-Enhanced Parametric RAG with LoRA and Selective CoT for Biomedical and Multi-Hop QA",
    "authors": [
      "Xingda Lyu",
      "Gongfu Lyu",
      "Zitai Yan",
      "Yuxin Jiang"
    ],
    "year": "2026",
    "journal": "arXiv:2602.15874v1",
    "doi": "https://doi.org/10.54254/2755-2721/2025.AST28253",
    "abstract": "Large Language Models (LLMs) demonstrate remarkable capabilities but remain limited by their reliance on static training data. Retrieval-Augmented Generation (RAG) addresses this constraint by retrieving external knowledge during inference, though it still depends heavily on knowledge base quality. To explore potential improvements, we evaluated three RAG variants-Standard RAG, DA-RAG, and our proposed Prompt-Enhanced Parametric RAG (P-RAG), a hybrid architecture that integrates parametric knowledge within the LLM and retrieved evidence, guided by Chain-of-Thought (CoT) prompting and Low-Rank Adaptation (LoRA) fine-tuning-on both general and biomedical datasets. Using LLaMA-3.2-1B-Instruct fine-tuned via LoRA, we evaluate on PubMedQA and 2WikiMultihopQA. P-RAG outperforms Standard RAG on PubMedQA by 10.47 percentage points in F1 (93.33% vs. 82.86%; 12.64% relative). On 2WikiMultihopQA, P-RAG nearly doubles the overall score vs. Standard RAG (33.44% vs. 17.83%) and achieves 44.03% on the Compare subset (with 42.74% Bridge, 21.84% Inference, 8.60% Compose). CoT prompting substantially improves multi-hop reasoning but yields mixed results for simpler, single-hop queries. These findings underscore P-RAG's potential for accurate, scalable, and contextually adaptive biomedical question answering. Our contributions include: (1) LoRA-based fine-tuning of LLaMA-3.2-1B-Instruct for biomedical QA, (2) introduction of P-RAG with Chain-of-Thought prompting, and (3) state-of-the-art results on PubMedQA and 2WikiMultihopQA.",
    "source_database": "arxiv",
    "arxiv_id": "2602.15874v1"
  },
  {
    "title": "RAG based Question-Answering for Contextual Response Prediction System",
    "authors": [
      "Sriram Veturi",
      "Saurabh Vaichal",
      "Reshma Lal Jagadheesh",
      "Nafis Irtiza Tripto",
      "Nian Yan"
    ],
    "year": "2024",
    "journal": "arXiv:2409.03708v2",
    "doi": "",
    "abstract": "Large Language Models (LLMs) have shown versatility in various Natural Language Processing (NLP) tasks, including their potential as effective question-answering systems. However, to provide precise and relevant information in response to specific customer queries in industry settings, LLMs require access to a comprehensive knowledge base to avoid hallucinations. Retrieval Augmented Generation (RAG) emerges as a promising technique to address this challenge. Yet, developing an accurate question-answering framework for real-world applications using RAG entails several challenges: 1) data availability issues, 2) evaluating the quality of generated content, and 3) the costly nature of human evaluation. In this paper, we introduce an end-to-end framework that employs LLMs with RAG capabilities for industry use cases. Given a customer query, the proposed system retrieves relevant knowledge documents and leverages them, along with previous chat history, to generate response suggestions for customer service agents in the contact centers of a major retail company. Through comprehensive automated and human evaluations, we show that this solution outperforms the current BERT-based algorithms in accuracy and relevance. Our findings suggest that RAG-based LLMs can be an excellent support to human customer service representatives by lightening their workload.",
    "source_database": "arxiv",
    "arxiv_id": "2409.03708v2"
  },
  {
    "title": "RAG System for Supporting Japanese Litigation Procedures: Faithful Response Generation Complying with Legal Norms",
    "authors": [
      "Yuya Ishihara",
      "Atsushi Keyaki",
      "Hiroaki Yamada",
      "Ryutaro Ohara",
      "Mihoko Sumida"
    ],
    "year": "2025",
    "journal": "arXiv:2511.22858v1",
    "doi": "",
    "abstract": "This study discusses the essential components that a Retrieval-Augmented Generation (RAG)-based LLM system should possess in order to support Japanese medical litigation procedures complying with legal norms. In litigation, expert commissioners, such as physicians, architects, accountants, and engineers, provide specialized knowledge to help judges clarify points of dispute. When considering the substitution of these expert roles with a RAG-based LLM system, the constraint of strict adherence to legal norms is imposed. Specifically, three requirements arise: (1) the retrieval module must retrieve appropriate external knowledge relevant to the disputed issues in accordance with the principle prohibiting the use of private knowledge, (2) the responses generated must originate from the context provided by the RAG and remain faithful to that context, and (3) the retrieval module must reference external knowledge with appropriate timestamps corresponding to the issues at hand. This paper discusses the design of a RAG-based LLM system that satisfies these requirements.",
    "source_database": "arxiv",
    "arxiv_id": "2511.22858v1"
  },
  {
    "title": "Ask-EDA: A Design Assistant Empowered by LLM, Hybrid RAG and Abbreviation De-hallucination",
    "authors": [
      "Luyao Shi",
      "Michael Kazda",
      "Bradley Sears",
      "Nick Shropshire",
      "Ruchir Puri"
    ],
    "year": "2024",
    "journal": "arXiv:2406.06575v1",
    "doi": "",
    "abstract": "Electronic design engineers are challenged to find relevant information efficiently for a myriad of tasks within design construction, verification and technology development. Large language models (LLM) have the potential to help improve productivity by serving as conversational agents that effectively function as subject-matter experts. In this paper we demonstrate Ask-EDA, a chat agent designed to serve as a 24x7 expert available to provide guidance to design engineers. Ask-EDA leverages LLM, hybrid retrieval augmented generation (RAG) and abbreviation de-hallucination (ADH) techniques to deliver more relevant and accurate responses. We curated three evaluation datasets, namely q2a-100, cmds-100 and abbr-100. Each dataset is tailored to assess a distinct aspect: general design question answering, design command handling and abbreviation resolution. We demonstrated that hybrid RAG offers over a 40% improvement in Recall on the q2a-100 dataset and over a 60% improvement on the cmds-100 dataset compared to not using RAG, while ADH yields over a 70% enhancement in Recall on the abbr-100 dataset. The evaluation results show that Ask-EDA can effectively respond to design-related inquiries.",
    "source_database": "arxiv",
    "arxiv_id": "2406.06575v1"
  },
  {
    "title": "NeMo Guardrails: A Toolkit for Controllable and Safe LLM Applications with Programmable Rails",
    "authors": [
      "Traian Rebedea",
      "Razvan Dinu",
      "Makesh Sreedhar",
      "Christopher Parisien",
      "Jonathan Cohen"
    ],
    "year": "2023",
    "journal": "arXiv:2310.10501v1",
    "doi": "",
    "abstract": "NeMo Guardrails is an open-source toolkit for easily adding programmable guardrails to LLM-based conversational systems. Guardrails (or rails for short) are a specific way of controlling the output of an LLM, such as not talking about topics considered harmful, following a predefined dialogue path, using a particular language style, and more. There are several mechanisms that allow LLM providers and developers to add guardrails that are embedded into a specific model at training, e.g. using model alignment. Differently, using a runtime inspired from dialogue management, NeMo Guardrails allows developers to add programmable rails to LLM applications - these are user-defined, independent of the underlying LLM, and interpretable. Our initial results show that the proposed approach can be used with several LLM providers to develop controllable and safe LLM applications using programmable rails.",
    "source_database": "arxiv",
    "arxiv_id": "2310.10501v1"
  },
  {
    "title": "Telco-RAG: Navigating the Challenges of Retrieval-Augmented Language Models for Telecommunications",
    "authors": [
      "Andrei-Laurentiu Bornea",
      "Fadhel Ayed",
      "Antonio De Domenico",
      "Nicola Piovesan",
      "Ali Maatouk"
    ],
    "year": "2024",
    "journal": "arXiv:2404.15939v3",
    "doi": "",
    "abstract": "The application of Large Language Models (LLMs) and Retrieval-Augmented Generation (RAG) systems in the telecommunication domain presents unique challenges, primarily due to the complex nature of telecom standard documents and the rapid evolution of the field. The paper introduces Telco-RAG, an open-source RAG framework designed to handle the specific needs of telecommunications standards, particularly 3rd Generation Partnership Project (3GPP) documents. Telco-RAG addresses the critical challenges of implementing a RAG pipeline on highly technical content, paving the way for applying LLMs in telecommunications and offering guidelines for RAG implementation in other technical domains.",
    "source_database": "arxiv",
    "arxiv_id": "2404.15939v3"
  },
  {
    "title": "Predicting Failures of LLMs to Link Biomedical Ontology Terms to Identifiers Evidence Across Models and Ontologies",
    "authors": [
      "Daniel B. Hier",
      "Steven Keith Platt",
      "Tayo Obafemi-Ajayi"
    ],
    "year": "2025",
    "journal": "arXiv:2509.04458v2",
    "doi": "",
    "abstract": "Large language models often perform well on biomedical NLP tasks but may fail to link ontology terms to their correct identifiers. We investigate why these failures occur by analyzing predictions across two major ontologies, Human Phenotype Ontology and Gene Ontology, and two high-performing models, GPT-4o and LLaMa 3.1 405B. We evaluate nine candidate features related to term familiarity, identifier usage, morphology, and ontology structure. Univariate and multivariate analyses show that exposure to ontology identifiers is the strongest predictor of linking success.",
    "source_database": "arxiv",
    "arxiv_id": "2509.04458v2"
  },
  {
    "title": "Exploring the Vulnerability of the Content Moderation Guardrail in Large Language Models via Intent Manipulation",
    "authors": [
      "Jun Zhuang",
      "Haibo Jin",
      "Ye Zhang",
      "Zhengjian Kang",
      "Wenbin Zhang",
      "Gaby G. Dagher",
      "Haohan Wang"
    ],
    "year": "2025",
    "journal": "arXiv:2505.18556v2",
    "doi": "",
    "abstract": "Intent detection, a core component of natural language understanding, has considerably evolved as a crucial mechanism in safeguarding large language models (LLMs). While prior work has applied intent detection to enhance LLMs' moderation guardrails, showing a significant success against content-level jailbreaks, the robustness of these intent-aware guardrails under malicious manipulations remains under-explored. In this work, we investigate the vulnerability of intent-aware guardrails and demonstrate that LLMs exhibit implicit intent detection capabilities. We propose a two-stage intent-based prompt-refinement framework, IntentPrompt, that first transforms harmful inquiries into structured outlines and further reframes them into declarative-style narratives by iteratively optimizing prompts via feedback loops to enhance jailbreak success for red-teaming purposes. Extensive experiments across four public benchmarks and various black-box LLMs indicate that our framework consistently outperforms several cutting-edge jailbreak methods and evades even advanced Intent Analysis (IA) and Chain-of-Thought (CoT)-based defenses. Specifically, our \"FSTR+SPIN\" variant achieves attack success rates ranging from 88.25% to 96.54% against CoT-based defenses on the o1 model, and from 86.75% to 97.12% on the GPT-4o model under IA-based defenses. These findings highlight a critical weakness in LLMs' safety mechanisms and suggest that intent manipulation poses a growing challenge to content moderation guardrails.",
    "source_database": "arxiv",
    "arxiv_id": "2505.18556v2"
  },
  {
    "title": "FIT-RAG: Black-Box RAG with Factual Information and Token Reduction",
    "authors": [
      "Yuren Mao",
      "Xuemei Dong",
      "Wenyi Xu",
      "Yunjun Gao",
      "Bin Wei",
      "Ying Zhang"
    ],
    "year": "2024",
    "journal": "arXiv:2403.14374v1",
    "doi": "",
    "abstract": "Due to the extraordinarily large number of parameters, fine-tuning Large Language Models (LLMs) to update long-tail or out-of-date knowledge is impractical in lots of applications. To avoid fine-tuning, we can alternatively treat a LLM as a black-box (i.e., freeze the parameters of the LLM) and augment it with a Retrieval-Augmented Generation (RAG) system, namely black-box RAG. Recently, black-box RAG has achieved success in knowledge-intensive tasks and has gained much attention. Existing black-box RAG methods typically fine-tune the retriever to cater to LLMs' preferences and concatenate all the retrieved documents as the input, which suffers from two issues: (1) Ignorance of Factual Information. The LLM preferred documents may not contain the factual information for the given question, which can mislead the retriever and hurt the effectiveness of black-box RAG; (2) Waste of Tokens. Simply concatenating all the retrieved documents brings large amounts of unnecessary tokens for LLMs, which degenerates the efficiency of black-box RAG. To address these issues, this paper proposes a novel black-box RAG framework which utilizes the factual information in the retrieval and reduces the number of tokens for augmentation, dubbed FIT-RAG. FIT-RAG utilizes the factual information by constructing a bi-label document scorer. Besides, it reduces the tokens by introducing a self-knowledge recognizer and a sub-document-level token reducer. FIT-RAG achieves both superior effectiveness and efficiency, which is validated by extensive experiments across three open-domain question-answering datasets: TriviaQA, NQ and PopQA. FIT-RAG can improve the answering accuracy of Llama2-13B-Chat by 14.3\\% on TriviaQA, 19.9\\% on NQ and 27.5\\% on PopQA, respectively. Furthermore, it can save approximately half of the tokens on average across the three datasets.",
    "source_database": "arxiv",
    "arxiv_id": "2403.14374v1"
  },
  {
    "title": "SimulRAG: Simulator-based RAG for Grounding LLMs in Long-form Scientific QA",
    "authors": [
      "Haozhou Xu",
      "Dongxia Wu",
      "Matteo Chinazzi",
      "Ruijia Niu",
      "Rose Yu",
      "Yi-An Ma"
    ],
    "year": "2025",
    "journal": "arXiv:2509.25459v1",
    "doi": "",
    "abstract": "Large language models (LLMs) show promise in solving scientific problems. They can help generate long-form answers for scientific questions, which are crucial for comprehensive understanding of complex phenomena that require detailed explanations spanning multiple interconnected concepts and evidence. However, LLMs often suffer from hallucination, especially in the challenging task of long-form scientific question answering. Retrieval-Augmented Generation (RAG) approaches can ground LLMs by incorporating external knowledge sources to improve trustworthiness. In this context, scientific simulators, which play a vital role in validating hypotheses, offer a particularly promising retrieval source to mitigate hallucination and enhance answer factuality. However, existing RAG approaches cannot be directly applied for scientific simulation-based retrieval due to two fundamental challenges: how to retrieve from scientific simulators, and how to efficiently verify and update long-form answers. To overcome these challenges, we propose the simulator-based RAG framework (SimulRAG) and provide a long-form scientific QA benchmark covering climate science and epidemiology with ground truth verified by both simulations and human annotators. In this framework, we propose a generalized simulator retrieval interface to transform between textual and numerical modalities. We further design a claim-level generation method that utilizes uncertainty estimation scores and simulator boundary assessment (UE+SBA) to efficiently verify and update claims. Extensive experiments demonstrate SimulRAG outperforms traditional RAG baselines by 30.4% in informativeness and 16.3% in factuality. UE+SBA further improves efficiency and quality for claim-level generation.",
    "source_database": "arxiv",
    "arxiv_id": "2509.25459v1"
  },
  {
    "title": "Benchmarking LLM Guardrails in Handling Multilingual Toxicity",
    "authors": [
      "Yahan Yang",
      "Soham Dan",
      "Dan Roth",
      "Insup Lee"
    ],
    "year": "2024",
    "journal": "arXiv:2410.22153v1",
    "doi": "",
    "abstract": "With the ubiquity of Large Language Models (LLMs), guardrails have become crucial to detect and defend against toxic content. However, with the increasing pervasiveness of LLMs in multilingual scenarios, their effectiveness in handling multilingual toxic inputs remains unclear. In this work, we introduce a comprehensive multilingual test suite, spanning seven datasets and over ten languages, to benchmark the performance of state-of-the-art guardrails. We also investigates the resilience of guardrails against recent jailbreaking techniques, and assess the impact of in-context safety policies and language resource availability on guardrails' performance. Our findings show that existing guardrails are still ineffective at handling multilingual toxicity and lack robustness against jailbreaking prompts. This work aims to identify the limitations of guardrails and to build a more reliable and trustworthy LLMs in multilingual scenarios.",
    "source_database": "arxiv",
    "arxiv_id": "2410.22153v1"
  },
  {
    "title": "Background Knowledge Grounding for Readable, Relevant, and Factual Biomedical Lay Summaries",
    "authors": [
      "Domenic Rosati"
    ],
    "year": "2023",
    "journal": "arXiv:2305.02104v1",
    "doi": "",
    "abstract": "Communication of scientific findings to the public is important for keeping non-experts informed of developments such as life-saving medical treatments. However, generating readable lay summaries from scientific documents is challenging, and currently, these summaries suffer from critical factual errors. One popular intervention for improving factuality is using additional external knowledge to provide factual grounding. However, it is unclear how these grounding sources should be retrieved, selected, or integrated, and how supplementary grounding documents might affect the readability or relevance of the generated summaries. We develop a simple method for selecting grounding sources and integrating them with source documents. We then use the BioLaySum summarization dataset to evaluate the effects of different grounding sources on summary quality. We found that grounding source documents improves the relevance and readability of lay summaries but does not improve factuality of lay summaries. This continues to be true in zero-shot summarization settings where we hypothesized that grounding might be even more important for factual lay summaries.",
    "source_database": "arxiv",
    "arxiv_id": "2305.02104v1"
  },
  {
    "title": "Retrieval Augmented Generation (RAG) for Fintech: Agentic Design and Evaluation",
    "authors": [
      "Thomas Cook",
      "Richard Osuagwu",
      "Liman Tsatiashvili",
      "Vrynsia Vrynsia",
      "Koustav Ghosal",
      "Maraim Masoud",
      "Riccardo Mattivi"
    ],
    "year": "2025",
    "journal": "arXiv:2510.25518v1",
    "doi": "",
    "abstract": "Retrieval-Augmented Generation (RAG) systems often face limitations in specialized domains such as fintech, where domain-specific ontologies, dense terminology, and acronyms complicate effective retrieval and synthesis. This paper introduces an agentic RAG architecture designed to address these challenges through a modular pipeline of specialized agents. The proposed system supports intelligent query reformulation, iterative sub-query decomposition guided by keyphrase extraction, contextual acronym resolution, and cross-encoder-based context re-ranking. We evaluate our approach against a standard RAG baseline using a curated dataset of 85 question--answer--reference triples derived from an enterprise fintech knowledge base. Experimental results demonstrate that the agentic RAG system outperforms the baseline in retrieval precision and relevance, albeit with increased latency. These findings suggest that structured, multi-agent methodologies offer a promising direction for enhancing retrieval robustness in complex, domain-specific settings.",
    "source_database": "arxiv",
    "arxiv_id": "2510.25518v1"
  },
  {
    "title": "GaRAGe: A Benchmark with Grounding Annotations for RAG Evaluation",
    "authors": [
      "Ionut-Teodor Sorodoc",
      "Leonardo F. R. Ribeiro",
      "Rexhina Blloshmi",
      "Christopher Davis",
      "Adrià de Gispert"
    ],
    "year": "2025",
    "journal": "arXiv:2506.07671v1",
    "doi": "",
    "abstract": "We present GaRAGe, a large RAG benchmark with human-curated long-form answers and annotations of each grounding passage, allowing a fine-grained evaluation of whether LLMs can identify relevant grounding when generating RAG answers. Our benchmark contains 2366 questions of diverse complexity, dynamism, and topics, and includes over 35K annotated passages retrieved from both private document sets and the Web, to reflect real-world RAG use cases. This makes it an ideal test bed to evaluate an LLM's ability to identify only the relevant information necessary to compose a response, or provide a deflective response when there is insufficient information. Evaluations of multiple state-of-the-art LLMs on GaRAGe show that the models tend to over-summarise rather than (a) ground their answers strictly on the annotated relevant passages (reaching at most a Relevance-Aware Factuality Score of 60%), or (b) deflect when no relevant grounding is available (reaching at most 31% true positive rate in deflections). The F1 in attribution to relevant sources is at most 58.9%, and we show that performance is particularly reduced when answering time-sensitive questions and when having to draw knowledge from sparser private grounding sources.",
    "source_database": "arxiv",
    "arxiv_id": "2506.07671v1"
  },
  {
    "title": "Probing Factually Grounded Content Transfer with Factual Ablation",
    "authors": [
      "Peter West",
      "Chris Quirk",
      "Michel Galley",
      "Yejin Choi"
    ],
    "year": "2022",
    "journal": "arXiv:2203.10133v2",
    "doi": "",
    "abstract": "Despite recent success, large neural models often generate factually incorrect text. Compounding this is the lack of a standard automatic evaluation for factuality--it cannot be meaningfully improved if it cannot be measured. Grounded generation promises a path to solving both of these problems: models draw on a reliable external document (grounding) for factual information, simplifying the challenge of factuality. Measuring factuality is also simplified--to factual consistency, testing whether the generation agrees with the grounding, rather than all facts. Yet, without a standard automatic metric for factual consistency, factually grounded generation remains an open problem.   We study this problem for content transfer, in which generations extend a prompt, using information from factual grounding. Particularly, this domain allows us to introduce the notion of factual ablation for automatically measuring factual consistency: this captures the intuition that the model should be less likely to produce an output given a less relevant grounding document. In practice, we measure this by presenting a model with two grounding documents, and the model should prefer to use the more factually relevant one. We contribute two evaluation sets to measure this. Applying our new evaluation, we propose multiple novel methods improving over strong baselines.",
    "source_database": "arxiv",
    "arxiv_id": "2203.10133v2"
  },
  {
    "title": "RMIT-ADM+S at the MMU-RAG NeurIPS 2025 Competition",
    "authors": [
      "Kun Ran",
      "Marwah Alaofi",
      "Danula Hettiachchi",
      "Chenglong Ma",
      "Khoi Nguyen Dinh Anh",
      "Khoi Vo Nguyen",
      "Sachin Pathiyan Cherumanal",
      "Lida Rashidi",
      "Falk Scholer",
      "Damiano Spina",
      "Shuoqi Sun",
      "Oleg Zendel"
    ],
    "year": "2026",
    "journal": "arXiv:2602.20735v1",
    "doi": "",
    "abstract": "This paper presents the award-winning RMIT-ADM+S system for the Text-to-Text   track of the NeurIPS~2025 MMU-RAG Competition. We introduce Routing-to-RAG   (R2RAG), a research-focused retrieval-augmented generation (RAG)   architecture composed of lightweight components that dynamically adapt the   retrieval strategy based on inferred query complexity and evidence   sufficiency. The system uses smaller LLMs, enabling operation on a single   consumer-grade GPU while supporting complex research tasks. It builds on the   G-RAG system, winner of the ACM~SIGIR~2025 LiveRAG Challenge, and extends it   with modules informed by qualitative review of outputs. R2RAG won the Best   Dynamic Evaluation award in the Open Source category, demonstrating high   effectiveness with careful design and efficient use of resources.",
    "source_database": "arxiv",
    "arxiv_id": "2602.20735v1"
  },
  {
    "title": "PrismRAG: Boosting RAG Factuality with Distractor Resilience and Strategized Reasoning",
    "authors": [
      "Mohammad Kachuee",
      "Teja Gollapudi",
      "Minseok Kim",
      "Yin Huang",
      "Kai Sun",
      "Xiao Yang",
      "Jiaqi Wang",
      "Nirav Shah",
      "Yue Liu",
      "Aaron Colak",
      "Anuj Kumar",
      "Wen-tau Yih",
      "Xin Luna Dong"
    ],
    "year": "2025",
    "journal": "arXiv:2507.18857v1",
    "doi": "",
    "abstract": "Retrieval-augmented generation (RAG) often falls short when retrieved context includes confusing semi-relevant passages, or when answering questions require deep contextual understanding and reasoning. We propose an efficient fine-tuning framework, called PrismRAG, that (i) trains the model with distractor-aware QA pairs mixing gold evidence with subtle distractor passages, and (ii) instills reasoning-centric habits that make the LLM plan, rationalize, and synthesize without relying on extensive human engineered instructions. Evaluated across 12 open-book RAG QA benchmarks spanning diverse application domains and scenarios, PrismRAG improves average factuality by 5.4%, outperforming state-of-the-art solutions.",
    "source_database": "arxiv",
    "arxiv_id": "2507.18857v1"
  },
  {
    "title": "Enhancing LLM Factual Accuracy with RAG to Counter Hallucinations: A Case Study on Domain-Specific Queries in Private Knowledge-Bases",
    "authors": [
      "Jiarui Li",
      "Ye Yuan",
      "Zehua Zhang"
    ],
    "year": "2024",
    "journal": "arXiv:2403.10446v1",
    "doi": "",
    "abstract": "We proposed an end-to-end system design towards utilizing Retrieval Augmented Generation (RAG) to improve the factual accuracy of Large Language Models (LLMs) for domain-specific and time-sensitive queries related to private knowledge-bases. Our system integrates RAG pipeline with upstream datasets processing and downstream performance evaluation. Addressing the challenge of LLM hallucinations, we finetune models with a curated dataset which originates from CMU's extensive resources and annotated with the teacher model. Our experiments demonstrate the system's effectiveness in generating more accurate answers to domain-specific and time-sensitive inquiries. The results also revealed the limitations of fine-tuning LLMs with small-scale and skewed datasets. This research highlights the potential of RAG systems in augmenting LLMs with external datasets for improved performance in knowledge-intensive tasks. Our code and models are available on Github.",
    "source_database": "arxiv",
    "arxiv_id": "2403.10446v1"
  },
  {
    "title": "LLM-Optic: Unveiling the Capabilities of Large Language Models for Universal Visual Grounding",
    "authors": [
      "Haoyu Zhao",
      "Wenhang Ge",
      "Ying-cong Chen"
    ],
    "year": "2024",
    "journal": "arXiv:2405.17104v2",
    "doi": "",
    "abstract": "Visual grounding is an essential tool that links user-provided text queries with query-specific regions within an image. Despite advancements in visual grounding models, their ability to comprehend complex queries remains limited. To overcome this limitation, we introduce LLM-Optic, an innovative method that utilizes Large Language Models (LLMs) as an optical lens to enhance existing visual grounding models in comprehending complex text queries involving intricate text structures, multiple objects, or object spatial relationships, situations that current models struggle with. LLM-Optic first employs an LLM as a Text Grounder to interpret complex text queries and accurately identify objects the user intends to locate. Then a pre-trained visual grounding model is used to generate candidate bounding boxes given the refined query by the Text Grounder. After that, LLM-Optic annotates the candidate bounding boxes with numerical marks to establish a connection between text and specific image regions, thereby linking two distinct modalities. Finally, it employs a Large Multimodal Model (LMM) as a Visual Grounder to select the marked candidate objects that best correspond to the original text query. Through LLM-Optic, we have achieved universal visual grounding, which allows for the detection of arbitrary objects specified by arbitrary human language input. Importantly, our method achieves this enhancement without requiring additional training or fine-tuning. Extensive experiments across various challenging benchmarks demonstrate that LLM-Optic achieves state-of-the-art zero-shot visual grounding capabilities. Project Page: https://haoyu-zhao.github.io/LLM-Optic.github.io/.",
    "source_database": "arxiv",
    "arxiv_id": "2405.17104v2"
  },
  {
    "title": "fastbmRAG: A Fast Graph-Based RAG Framework for Efficient Processing of Large-Scale Biomedical Literature",
    "authors": [
      "Guofeng Meng",
      "Li Shen",
      "Qiuyan Zhong",
      "Wei Wang",
      "Haizhou Zhang",
      "Xiaozhen Wang"
    ],
    "year": "2025",
    "journal": "arXiv:2511.10014v1",
    "doi": "",
    "abstract": "Large language models (LLMs) are rapidly transforming various domains, including biomedicine and healthcare, and demonstrate remarkable potential from scientific research to new drug discovery. Graph-based retrieval-augmented generation (RAG) systems, as a useful application of LLMs, can improve contextual reasoning through structured entity and relationship identification from long-context knowledge, e.g. biomedical literature. Even though many advantages over naive RAGs, most of graph-based RAGs are computationally intensive, which limits their application to large-scale dataset. To address this issue, we introduce fastbmRAG, an fast graph-based RAG optimized for biomedical literature. Utilizing well organized structure of biomedical papers, fastbmRAG divides the construction of knowledge graph into two stages, first drafting graphs using abstracts; and second, refining them using main texts guided by vector-based entity linking, which minimizes redundancy and computational load. Our evaluations demonstrate that fastbmRAG is over 10x faster than existing graph-RAG tools and achieve superior coverage and accuracy to input knowledge. FastbmRAG provides a fast solution for quickly understanding, summarizing, and answering questions about biomedical literature on a large scale. FastbmRAG is public available in https://github.com/menggf/fastbmRAG.",
    "source_database": "arxiv",
    "arxiv_id": "2511.10014v1"
  },
  {
    "title": "Grounding Beyond Detection: Enhancing Contextual Understanding in Embodied 3D Grounding",
    "authors": [
      "Yani Zhang",
      "Dongming Wu",
      "Hao Shi",
      "Yingfei Liu",
      "Tiancai Wang",
      "Haoqiang Fan",
      "Xingping Dong"
    ],
    "year": "2025",
    "journal": "arXiv:2506.05199v2",
    "doi": "",
    "abstract": "Embodied 3D grounding aims to localize target objects described in human instructions from ego-centric viewpoint. Most methods typically follow a two-stage paradigm where a trained 3D detector's optimized backbone parameters are used to initialize a grounding model. In this study, we explore a fundamental question: Does embodied 3D grounding benefit enough from detection? To answer this question, we assess the grounding performance of detection models using predicted boxes filtered by the target category. Surprisingly, these detection models without any instruction-specific training outperform the grounding models explicitly trained with language instructions. This indicates that even category-level embodied 3D grounding may not be well resolved, let alone more fine-grained context-aware grounding. Motivated by this finding, we propose DEGround, which shares DETR queries as object representation for both DEtection and Grounding and enables the grounding to benefit from basic category classification and box detection. Based on this framework, we further introduce a regional activation grounding module that highlights instruction-related regions and a query-wise modulation module that incorporates sentence-level semantic into the query representation, strengthening the context-aware understanding of language instructions. Remarkably, DEGround outperforms state-of-the-art model BIP3D by 7.52% at overall accuracy on the EmbodiedScan validation set. The source code will be publicly available at https://github.com/zyn213/DEGround.",
    "source_database": "arxiv",
    "arxiv_id": "2506.05199v2"
  },
  {
    "title": "Knowledge-Driven Agentic Scientific Corpus Distillation Framework for Biomedical Large Language Models Training",
    "authors": [
      "Meng Xiao",
      "Xunxin Cai",
      "Qingqing Long",
      "Chengrui Wang",
      "Yuanchun Zhou",
      "Hengshu Zhu"
    ],
    "year": "2025",
    "journal": "arXiv:2504.19565v3",
    "doi": "",
    "abstract": "Corpus distillation for biomedical large language models (LLMs) seeks to address the pressing challenge of insufficient quantity and quality in open-source annotated scientific corpora, which remains a bottleneck for effective LLM training in biomedical research. This paper proposes a knowledge-driven, agentic framework for scientific corpus distillation, tailored explicitly for LLM training in the biomedical domain, addressing the challenge posed by the complex hierarchy of biomedical knowledge. Central to our approach is a collaborative multi-agent architecture, where specialized agents, each guided by the Medical Subject Headings (MeSH) hierarchy, work in concert to autonomously extract, synthesize, and self-evaluate high-quality textual data from vast scientific literature. This agentic framework collectively generates and refines domain-specific question-answer pairs, ensuring comprehensive coverage and consistency with biomedical ontologies while minimizing manual involvement. Extensive experimental results show that language models trained on our multi-agent distilled datasets achieve notable improvements in biomedical question-answering tasks, outperforming both strong life sciences LLM baselines and advanced proprietary models. Notably, our AI-Ready dataset enables Llama3-70B to surpass GPT-4 with MedPrompt and Med-PaLM-2, despite their larger scale. Detailed ablation studies and case analyses further validate the effectiveness and synergy of each agent within the framework, highlighting the potential of multi-agent collaboration in biomedical LLM training.",
    "source_database": "arxiv",
    "arxiv_id": "2504.19565v3"
  },
  {
    "title": "OLAPH: Improving Factuality in Biomedical Long-form Question Answering",
    "authors": [
      "Minbyul Jeong",
      "Hyeon Hwang",
      "Chanwoong Yoon",
      "Taewhoo Lee",
      "Jaewoo Kang"
    ],
    "year": "2024",
    "journal": "arXiv:2405.12701v3",
    "doi": "",
    "abstract": "In the medical domain, numerous scenarios necessitate the long-form generation ability of large language models (LLMs). Specifically, when addressing patients' questions, it is essential that the model's response conveys factual claims, highlighting the need for an automated method to evaluate those claims. Thus, we introduce MedLFQA, a benchmark dataset reconstructed using long-form question-answering datasets related to the biomedical domain. We use MedLFQA to facilitate a cost-effective automatic evaluations of factuality. We also propose OLAPH, a simple and novel framework that utilizes cost-effective and multifaceted automatic evaluation to construct a synthetic preference set and answers questions in our preferred manner. Our framework leads us to train LLMs step-by-step to reduce hallucinations and include crucial medical claims. We highlight that, even on evaluation metrics not used during training, LLMs trained with our OLAPH framework demonstrate significant performance improvement in factuality. Our findings reveal that a 7B LLM trained with our OLAPH framework can provide long answers comparable to the medical experts' answers in terms of factuality. We believe that our work could shed light on gauging the long-text generation ability of LLMs in the medical domain. Our code and datasets are available.",
    "source_database": "arxiv",
    "arxiv_id": "2405.12701v3"
  },
  {
    "title": "MedTrust-RAG: Evidence Verification and Trust Alignment for Biomedical Question Answering",
    "authors": [
      "Yingpeng Ning",
      "Yuanyuan Sun",
      "Ling Luo",
      "Yanhua Wang",
      "Yuchen Pan",
      "Hongfei Lin"
    ],
    "year": "2025",
    "journal": "arXiv:2510.14400v2",
    "doi": "",
    "abstract": "Biomedical question answering (QA) requires accurate interpretation of complex medical knowledge. Large language models (LLMs) have shown promising capabilities in this domain, with retrieval-augmented generation (RAG) systems enhancing performance by incorporating external medical literature. However, RAG-based approaches in biomedical QA suffer from hallucinations due to post-retrieval noise and insufficient verification of retrieved evidence, undermining response reliability. We propose MedTrust-Guided Iterative RAG, a framework designed to enhance factual consistency and mitigate hallucinations in medical QA. Our method introduces three key innovations. First, it enforces citation-aware reasoning by requiring all generated content to be explicitly grounded in retrieved medical documents, with structured Negative Knowledge Assertions used when evidence is insufficient. Second, it employs an iterative retrieval-verification process, where a verification agent assesses evidence adequacy and refines queries through Medical Gap Analysis until reliable information is obtained. Third, it integrates the MedTrust-Align Module (MTAM) that combines verified positive examples with hallucination-aware negative samples, leveraging Direct Preference Optimization to reinforce citation-grounded reasoning while penalizing hallucination-prone response patterns.",
    "source_database": "arxiv",
    "arxiv_id": "2510.14400v2"
  },
  {
    "title": "Current state of LLM Risks and AI Guardrails",
    "authors": [
      "Suriya Ganesh Ayyamperumal",
      "Limin Ge"
    ],
    "year": "2024",
    "journal": "arXiv:2406.12934v1",
    "doi": "",
    "abstract": "Large language models (LLMs) have become increasingly sophisticated, leading to widespread deployment in sensitive applications where safety and reliability are paramount. However, LLMs have inherent risks accompanying them, including bias, potential for unsafe actions, dataset poisoning, lack of explainability, hallucinations, and non-reproducibility. These risks necessitate the development of \"guardrails\" to align LLMs with desired behaviors and mitigate potential harm.   This work explores the risks associated with deploying LLMs and evaluates current approaches to implementing guardrails and model alignment techniques. We examine intrinsic and extrinsic bias evaluation methods and discuss the importance of fairness metrics for responsible AI development. The safety and reliability of agentic LLMs (those capable of real-world actions) are explored, emphasizing the need for testability, fail-safes, and situational awareness.   Technical strategies for securing LLMs are presented, including a layered protection model operating at external, secondary, and internal levels. System prompts, Retrieval-Augmented Generation (RAG) architectures, and techniques to minimize bias and protect privacy are highlighted.   Effective guardrail design requires a deep understanding of the LLM's intended use case, relevant regulations, and ethical considerations. Striking a balance between competing requirements, such as accuracy and privacy, remains an ongoing challenge. This work underscores the importance of continuous research and development to ensure the safe and responsible use of LLMs in real-world applications.",
    "source_database": "arxiv",
    "arxiv_id": "2406.12934v1"
  },
  {
    "title": "ConsistencyAI: A Benchmark to Assess LLMs' Factual Consistency When Responding to Different Demographic Groups",
    "authors": [
      "Peter Banyas",
      "Shristi Sharma",
      "Alistair Simmons",
      "Atharva Vispute"
    ],
    "year": "2025",
    "journal": "arXiv:2510.13852v2",
    "doi": "",
    "abstract": "Is an LLM telling you different facts than it's telling me? This paper introduces ConsistencyAI, an independent benchmark for measuring the factual consistency of large language models (LLMs) for different personas. ConsistencyAI tests whether, when users of different demographics ask identical questions, the model responds with factually inconsistent answers. Designed without involvement from LLM providers, this benchmark offers impartial evaluation and accountability. In our experiment, we queried 19 LLMs with prompts that requested 5 facts for each of 15 topics. We repeated this query 100 times for each LLM, each time adding prompt context from a different persona selected from a subset of personas modeling the general population. We processed the responses into sentence embeddings, computed cross-persona cosine similarity, and computed the weighted average of cross-persona cosine similarity to calculate factual consistency scores. In 100-persona experiments, scores ranged from 0.9065 to 0.7896, and the mean was 0.8656, which we adopt as a benchmark threshold. xAI's Grok-3 is most consistent, while several lightweight models rank lowest. Consistency varies by topic: the job market is least consistent, G7 world leaders most consistent, and issues like vaccines or the Israeli-Palestinian conflict diverge by provider. These results show that both the provider and the topic shape the factual consistency. We release our code and interactive demo to support reproducible evaluation and encourage persona-invariant prompting strategies.",
    "source_database": "arxiv",
    "arxiv_id": "2510.13852v2"
  },
  {
    "title": "RAG-Guardrails Integration for AI Content Control",
    "authors": [
      "R. More"
    ],
    "year": "2025",
    "journal": "Proceedings of the 2025 18th International Conference on Computer Science and Information Technology",
    "doi": "10.1145/3783862.3783896",
    "pmid": "",
    "abstract": "Generative AI, particularly large language models (LLMs), has shown remarkable potential across domains such as healthcare, legal services, and finance. However, their adoption is hindered by two persistent challenges: hallucination-where models generate factually incorrect information-and the risk of producing biased or unsafe content. This paper proposes a hybrid framework that integrates Retrieval-Augmented Generation (RAG) with NVIDIA NeMo Guardrails to address these concerns. RAG mitigates hallucinations by grounding model outputs in externally retrieved, trusted data sources, while NeMo Guardrails enforce domain-specific safety and compliance constraints through predefined behavioral policies. Empirical evaluations demonstrate that this combined approach reduces hallucinated content by 30–45% and improves safety and policy adherence across multiple enterprise use cases. The system exhibits strong potential for deployment in regulated, high-stakes environments. Future work will focus on enhancing real-time responsiveness and expanding multilingual and culturally adaptive capabilities. The proposed framework offers a scalable foundation for building trustworthy, domain-aligned generative AI solutions.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "GaRAGe: A Benchmark with Grounding Annotations for RAG Evaluation",
    "authors": [
      "I. Sorodoc",
      "Leonardo F. R. Ribeiro",
      "Rexhina Blloshmi",
      "Christopher Davis",
      "A. D. Gispert"
    ],
    "year": "2025",
    "journal": "ArXiv",
    "doi": "10.48550/arXiv.2506.07671",
    "pmid": "",
    "abstract": "We present GaRAGe, a large RAG benchmark with human-curated long-form answers and annotations of each grounding passage, allowing a fine-grained evaluation of whether LLMs can identify relevant grounding when generating RAG answers. Our benchmark contains 2366 questions of diverse complexity, dynamism, and topics, and includes over 35K annotated passages retrieved from both private document sets and the Web, to reflect real-world RAG use cases. This makes it an ideal test bed to evaluate an LLM's ability to identify only the relevant information necessary to compose a response, or provide a deflective response when there is insufficient information. Evaluations of multiple state-of-the-art LLMs on GaRAGe show that the models tend to over-summarise rather than (a) ground their answers strictly on the annotated relevant passages (reaching at most a Relevance-Aware Factuality Score of 60%), or (b) deflect when no relevant grounding is available (reaching at most 31% true positive rate in deflections). The F1 in attribution to relevant sources is at most 58.9%, and we show that performance is particularly reduced when answering time-sensitive questions and when having to draw knowledge from sparser private grounding sources.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Less Finetuning, Better Retrieval: Rethinking LLM Adaptation for Biomedical Retrievers via Synthetic Data and Model Merging",
    "authors": [
      "Sameh Khattab",
      "Jean-Philippe Corbeil",
      "Osman Alperen Koras",
      "Amin Dada",
      "Julian Friedrich",
      "François Beaulieu",
      "Paul Vozila",
      "J. Kleesiek"
    ],
    "year": "2026",
    "journal": "ArXiv",
    "doi": "10.48550/arXiv.2602.04731",
    "pmid": "",
    "abstract": "Retrieval-augmented generation (RAG) has become the backbone of grounding Large Language Models (LLMs), improving knowledge updates and reducing hallucinations. Recently, LLM-based retriever models have shown state-of-the-art performance for RAG applications. However, several technical aspects remain underexplored on how to adapt general-purpose LLMs into effective domain-specific retrievers, especially in specialized domains such as biomedicine. We present Synthesize-Train-Merge (STM), a modular framework that enhances decoder-only LLMs with synthetic hard negatives, retrieval prompt optimization, and model merging. Experiments on a subset of 12 medical and general tasks from the MTEB benchmark show STM boosts task-specific experts by up to 23.5\\% (average 7.5\\%) and produces merged models that outperform both single experts and strong baselines without extensive pretraining. Our results demonstrate a scalable, efficient path for turning general LLMs into high-performing, domain-specialized retrievers, preserving general-domain capabilities while excelling on specialized tasks.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "KRAGEN: a knowledge graph-enhanced RAG framework for biomedical problem solving using large language models",
    "authors": [
      "Nicholas Matsumoto",
      "Jay Moran",
      "Hyunjun Choi",
      "Miguel E. Hernandez",
      "Mythreye Venkatesan",
      "Z. Wang",
      "Jason H. Moore"
    ],
    "year": "2024",
    "journal": "Bioinformatics",
    "doi": "10.1093/bioinformatics/btae353",
    "pmid": "38830083",
    "abstract": "Abstract Motivation Answering and solving complex problems using a large language model (LLM) given a certain domain such as biomedicine is a challenging task that requires both factual consistency and logic, and LLMs often suffer from some major limitations, such as hallucinating false or irrelevant information, or being influenced by noisy data. These issues can compromise the trustworthiness, accuracy, and compliance of LLM-generated text and insights. Results Knowledge Retrieval Augmented Generation ENgine (KRAGEN) is a new tool that combines knowledge graphs, Retrieval Augmented Generation (RAG), and advanced prompting techniques to solve complex problems with natural language. KRAGEN converts knowledge graphs into a vector database and uses RAG to retrieve relevant facts from it. KRAGEN uses advanced prompting techniques: namely graph-of-thoughts (GoT), to dynamically break down a complex problem into smaller subproblems, and proceeds to solve each subproblem by using the relevant knowledge through the RAG framework, which limits the hallucinations, and finally, consolidates the subproblems and provides a solution. KRAGEN’s graph visualization allows the user to interact with and evaluate the quality of the solution’s GoT structure and logic. Availability and implementation KRAGEN is deployed by running its custom Docker containers. KRAGEN is available as open-source from GitHub at: https://github.com/EpistasisLab/KRAGEN.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "SimulRAG: Simulator-based RAG for Grounding LLMs in Long-form Scientific QA",
    "authors": [
      "Haozhou Xu",
      "D. Wu",
      "M. Chinazzi",
      "Ruijia Niu",
      "Rose Yu",
      "Yi-An Ma"
    ],
    "year": "2025",
    "journal": "ArXiv",
    "doi": "10.48550/arXiv.2509.25459",
    "pmid": "",
    "abstract": "Large language models (LLMs) show promise in solving scientific problems. They can help generate long-form answers for scientific questions, which are crucial for comprehensive understanding of complex phenomena that require detailed explanations spanning multiple interconnected concepts and evidence. However, LLMs often suffer from hallucination, especially in the challenging task of long-form scientific question answering. Retrieval-Augmented Generation (RAG) approaches can ground LLMs by incorporating external knowledge sources to improve trustworthiness. In this context, scientific simulators, which play a vital role in validating hypotheses, offer a particularly promising retrieval source to mitigate hallucination and enhance answer factuality. However, existing RAG approaches cannot be directly applied for scientific simulation-based retrieval due to two fundamental challenges: how to retrieve from scientific simulators, and how to efficiently verify and update long-form answers. To overcome these challenges, we propose the simulator-based RAG framework (SimulRAG) and provide a long-form scientific QA benchmark covering climate science and epidemiology with ground truth verified by both simulations and human annotators. In this framework, we propose a generalized simulator retrieval interface to transform between textual and numerical modalities. We further design a claim-level generation method that utilizes uncertainty estimation scores and simulator boundary assessment (UE+SBA) to efficiently verify and update claims. Extensive experiments demonstrate SimulRAG outperforms traditional RAG baselines by 30.4% in informativeness and 16.3% in factuality. UE+SBA further improves efficiency and quality for claim-level generation.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Towards a Multi-Agent System Based on LLM and RAG for Automated and Customizable Urban Diagnostics",
    "authors": [
      "Rida Azmi",
      "Ebnou Abdem Seyid Abdellahi",
      "Mariem Bounabi",
      "Jérôme Chenal",
      "Mohammed Hlal",
      "Elbachir Diop"
    ],
    "year": "2025",
    "journal": "2025 International Conference on Intelligent Systems: Theories and Applications (SITA)",
    "doi": "10.1109/SITA67914.2025.11273206",
    "pmid": "",
    "abstract": "The increasing complexity and dynamism of urban environments necessitate advanced tools for comprehensive and timely diagnostics. Traditional methods are often labor-intensive, fragmented, and struggle to synthesize the vast, heterogeneous data streams generated by modern cities. This paper presents a novel theoretical framework for a multi-agent system that synergistically integrates Large Language Models (LLMs) and Retrieval-Augmented Generation (RAG) to deliver automated and customizable urban diagnostics. The proposed system employs a modular, plug-and-play architecture orchestrated by a core LLM, which coordinates a team of specialized agents for tasks including data extraction, analysis, auto-debugging, and report generation. A key innovation is the use of a handbook driven RAG mechanism, where structured technical guides for various data sources and thematic domains serve as a verifiable knowledge base, grounding the system's outputs in factual, domain-specific information. This knowledge-driven approach enables the dynamic generation of code, the handling of diverse data formats, and the assembly of complex diagnostic reports tailored to user specifications provided in natural language. By outlining the system's architecture, workflow, knowledge management strategy, and core theoretical principles, this paper establishes a foundational contribution towards developing more intelligent, adaptive, and reliable systems for urban planning and governance.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Conceptual Design of an LLM-Based Tech Product Recommendation System Using LangChain, LangGraph, Firecrawl, and n8n with RAG, Fine-Tuning, Prompt Engineering, and KNN with Cosine Similarity",
    "authors": [
      "Mrs. Abha Pathak",
      "Mrs. Tejaswini Mali",
      "Mr. Sanket Rathod",
      "Mr. Niraj Rane",
      "Ms. Aditi Rakh",
      "Ms. Swapnali Pimpare"
    ],
    "year": "2025",
    "journal": "International Journal of Advanced Research in Science, Communication and Technology",
    "doi": "10.48175/ijarsct-29973",
    "pmid": "",
    "abstract": "Choosing the right technology product has become increasingly difficult for consumers due to limited technical knowledge, rapidly evolving specifications, and the overwhelming number of available options. Traditional recommendation systems rely on static filters or keyword-based searches, often producing incomplete or context-insensitive results. This paper proposes a conceptual design for an AI-driven recommendation framework that leverages Large Language Models (LLMs) to deliver accurate, explainable, and personalized product suggestions. The system integrates LangChain and LangGraph to manage reasoning, tool orchestration, and multi-step control flow, while product similarity is computed using K-Nearest Neighbors (KNN) with cosine similarity. To ensure factual grounding and reduce hallucination, the design incorporates Retrieval-Augmented Generation (RAG), complemented by fine-tuning and prompt engineering for domain-specific alignment. A continuously updated product knowledge base, maintained through automated web scraping using Firecrawl and workflow synchronization via n8n, supports real-time data accuracy. The proposed framework enables natural-language interaction and aims to provide reliable recommendations for devices such as smartphones, laptops, and wearables, offering a scalable and modular foundation for next-generation tech product advisory systems.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "LLMForum-RAG: A Multilingual, Multi-domain Framework for Factual Reasoning via Weighted Retrieval and LLM Collaboration",
    "authors": [
      "Soham Chaudhuri",
      "Dipanjan Saha",
      "Dipankar Das"
    ],
    "year": "2025",
    "journal": "",
    "doi": "10.18653/v1/2025.findings-ijcnlp.88",
    "pmid": "",
    "abstract": "LLMs have emerged as a transformative technology, enabling a wide range of tasks such as text generation, summarization, question answering, and more. The use of RAG with LLM is on the rise to provide deeper knowledge bases of various domains. In the present study, we propose a RAG framework that employs weighted Rocchio mechanism for retrieval and LLM collaborative forum with supervision for generation. Our framework is evaluated in two downstream tasks: a biomedical question answering (BioASQ-QA) and a multilingual claim verification (e.g. in English, Hindi, and Bengali) to showcase its adaptability across various domains and languages. The proposed retriever is capable to achieve substantial improvement over BM25 of +8% (BioASQ-QA), +15% (English), +5% (Hindi), and +20% (Bengali) for Recall@5. In veracity classification, our framework achieves an average answer correctness of 0.78 on BioASQ-QA while achieving F1-score of 0.59, 0.56, and 0.41 for English, Hindi and Bengali languages, respectively. These results demonstrate the effectiveness and robustness of our framework for retrieval and generation in multilingual and multi-domain settings.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "HYPER-RAG: Evaluating Hyperparameter Trade-Offs in Biomedical Retrieval-Augmented Generation",
    "authors": [
      "Ankush Sil Sarma",
      "Pawan Kumar Singh"
    ],
    "year": "2025",
    "journal": "2025 IEEE Pune Section International Conference (PuneCon)",
    "doi": "10.1109/PuneCon67554.2025.11377827",
    "pmid": "",
    "abstract": "Retrieval-Augmented Generation (RAG) improves the factual accuracy of large language models by combining document retrieval with text generation. In biomedical question answering, where correctness is critical, the effect of key hyperparameters has not been studied in a systematic way. This paper presents an evaluation of RAG on the COVID-QA dataset with a focus on three retrievers (dense, BM25, hybrid), two retrieval depths (top-$\\mathrm{k}=1,3$), and optional reranking with a cross encoder. We use a single biomedical prompt and measure exact match (EM), F1 score, semantic similarity, groundedness, and latency. We also report a composite score that balances lexical accuracy, semantic similarity, and efficiency. Our results on a 100-question subset show that reranking improves grounding at the cost of extra latency, and that increasing top-k improves recall but gives smaller gains after a point. The study highlights that multiple metrics are needed to judge biomedical RAG systems reliably and that careful tuning of retrieval and reranking settings can yield practical improvements under compute constraints.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Navi: RAG-Powered LLM Chatbot for Academic Institutions",
    "authors": [
      "Amiya P Bovas"
    ],
    "year": "2025",
    "journal": "International Journal for Research in Applied Science and Engineering Technology",
    "doi": "10.22214/ijraset.2025.75655",
    "pmid": "",
    "abstract": "With the growing integration of artificial intelligence (AI) across educational ecosystems, there is an increasing\ndemand for intelligent conversational agents that can efficiently deliver reliable, domain-specific information to students,\nfaculty, and visitors. This research introduces Navi, an academic virtual assistant designed using a Large Language Model\n(LLM) combined with a Retrieval-Augmented Generation (RAG) framework to generate accurate and contextually grounded\nresponses [1], [5]. The chatbot incorporates the Mistral-7B-Instruct model [3] for response generation and leverages a FAISS-based vector database [4], where embeddings are produced using the all-MiniLM-L6-v2 sentence transformer model. When a\nuser submits a query, relevant document segments are retrieved from institutional data sources and integrated into the LLM’s\nprompt, enabling precise, factual, and contextually aligned output.\nNavi offers a range of advanced capabilities, including natural language understanding, multi-turn contextual dialogues,\nmultilingual query handling, sentiment adaptation, speech-enabled interaction, and user-personalized responses. Performance\nevaluation through simulated academic queries indicates improved response accuracy, coherence, and informativeness,\nachieving an average relevance score between 0.7–0.85. The experimental results confirm that combining RAG with an LLM\nsubstantially reduces hallucinations, enhances factual grounding, and improves user satisfaction. Overall, Navi demonstrates a\nscalable and dependable framework for deploying AI-driven information assistants within educational institutions.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "MedTrust-RAG: Evidence Verification and Trust Alignment for Biomedical Question Answering",
    "authors": [
      "Yingpeng Ning",
      "Yuanyuan Sun",
      "Ling Luo",
      "Yanhua Wang",
      "Yuchen Pan",
      "Hongfei Lin"
    ],
    "year": "2025",
    "journal": "2025 IEEE International Conference on Bioinformatics and Biomedicine (BIBM)",
    "doi": "10.1109/BIBM66473.2025.11356290",
    "pmid": "",
    "abstract": "Biomedical question answering (QA) requires precise interpretation of complex medical knowledge. Large language models (LLMs) and retrieval-augmented generation (RAG) leverage external medical literature but often produce hallucinations due to noisy retrieval and insufficient verification. We propose MedTrust-Guided Iterative RAG, a framework that improves factual consistency and reduces hallucinations in medical QA. It introduces three innovations. First, citation-aware reasoning grounds generation in retrieved documents and uses Negative Knowledge Assertions when evidence is missing. Second, an iterative retrieval-verification process refines queries through Medical Gap Analysis. Third, the MedTrust-Align Module (MTAM) applies Direct Preference Optimization to align generation with verified evidence and suppress hallucination-prone patterns.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Reshaping Biomedical Scientific Literature in a RAG Pipeline for Question Answering",
    "authors": [
      "Maël Lesavourey",
      "Gilles Hubert"
    ],
    "year": "2025",
    "journal": "",
    "doi": "",
    "pmid": "",
    "abstract": "",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Finetune-RAG: Fine-Tuning Language Models to Resist Hallucination in Retrieval-Augmented Generation",
    "authors": [
      "Zhan Peng Lee",
      "A. Lin",
      "Calvin Tan"
    ],
    "year": "2025",
    "journal": "ArXiv",
    "doi": "10.48550/arXiv.2505.10792",
    "pmid": "",
    "abstract": "Retrieval-Augmented Generation (RAG) has emerged as a powerful framework to improve factuality in large language models (LLMs) by grounding their outputs in retrieved documents. However, ensuring perfect retrieval of relevant information remains challenging, and when irrelevant content is passed downstream to an LLM, it can lead to hallucinations. In this work, we propose Finetune-RAG, a simple and effective fine-tuning approach that features the first-of-its-kind RAG training dataset constructed to mimic real-world imperfections. Experimental results show that Finetune-RAG improves factual accuracy by 21.2% over the base model. We also propose Bench-RAG, an LLM-as-a-judge evaluation pipeline that stress tests models under realistic imperfect retrieval scenarios. Our codebase and dataset are fully open sourced for community use.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "An Agentic Hybrid LLM–RAG Framework for Explainable Clinical Decision Support",
    "authors": [
      "Mohammed Kapadia",
      "Mohammed Memon",
      "P. Mishra",
      "S. Okuhara"
    ],
    "year": "2026",
    "journal": "Proceedings of the 18th International Conference on Agents and Artificial Intelligence",
    "doi": "10.5220/0014459100004052",
    "pmid": "",
    "abstract": "The fast evolution of Large Language Models (LLMs) has provided new opportunities to intelligent Clinical Decision Support Systems (CDSS), but such issues as hallucination, absent interpretability, and poor factual foundation still exist. This paper proposes a Hybrid LLM-Retrieval Augmented Generation (RAG) model of evidence-based clinical reasoning, which would combine transformer-based contextual understanding and retrieval-based factual verification. This system is an agentic design and consists of four collaborative components, namely Retriever, Transformer Encoder, Generator, and Evaluator Agents that together guarantee accuracy, interpretability, and transparency. The suggested framework transforms the diagnostic reasoning into a probabilistic optimisation problem, and the recommendations are conditionalized by multimodal patient data and top-k evidence obtained in the biomedical literature. A composite loss is a loss that optimises diagnostic accuracy, semantic consistency and factual faithfulness. Experimental validation on benchmark datasets, such as MIMIC-III, PubMedQA and ADReSSo 2021, performs better than current models, including BioBERT, GPT-3.5 and Med-PaLM 2, with 93.7% accuracy, 0.926 AUROC, and 68% reduction in rate of hallucination. The findings prove that the Hybrid LLM-RAG model is feasible in the context of aligning linguistic fluency and clinical reliability, developing a reliable AI-based decision support in healthcare-related applications. Although the findings are promising, they are achieved in controlled experimental conditions and do not demonstrate competitive performance assertion.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "ModernBERT + ColBERT: Enhancing biomedical RAG through an advanced re-ranking retriever",
    "authors": [
      "Eduardo Martínez Rivera",
      "F. Menolascina"
    ],
    "year": "2025",
    "journal": "ArXiv",
    "doi": "10.48550/arXiv.2510.04757",
    "pmid": "",
    "abstract": "Retrieval-Augmented Generation (RAG) is a powerful technique for enriching Large Language Models (LLMs) with external knowledge, allowing for factually grounded responses, a critical requirement in high-stakes domains such as healthcare. However, the efficacy of RAG systems is fundamentally restricted by the performance of their retrieval module, since irrelevant or semantically misaligned documents directly compromise the accuracy of the final generated response. General-purpose dense retrievers can struggle with the nuanced language of specialised domains, while the high accuracy of in-domain models is often achieved at prohibitive computational costs. In this work, we aim to address this trade-off by developing and evaluating a two-stage retrieval architecture that combines a lightweight ModernBERT bidirectional encoder for efficient initial candidate retrieval with a ColBERTv2 late-interaction model for fine-grained re-ranking. We conduct comprehensive evaluations of our retriever module performance and RAG system performance in the biomedical context, fine-tuning the IR module using 10k question-passage pairs from PubMedQA. Our analysis of the retriever module confirmed the positive impact of the ColBERT re-ranker, which improved Recall@3 by up to 4.2 percentage points compared to its retrieve-only counterpart. When integrated into the biomedical RAG, our IR module leads to a state-of-the-art average accuracy of 0.4448 on the five tasks of the MIRAGE question-answering benchmark, outperforming strong baselines such as MedCPT (0.4436). Our ablation studies reveal that this performance is critically dependent on a joint fine-tuning process that aligns the retriever and re-ranker; otherwise, the re-ranker might degrade the performance.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "DeepSeek-Med-8B: Medical LLM for Chinese Diagnosis and Referral",
    "authors": [
      "Chenxing Li",
      "J. Mao",
      "Bin Liu",
      "Weiwei Luo"
    ],
    "year": "2025",
    "journal": "2025 8th International Conference on Computer Information Science and Application Technology (CISAT)",
    "doi": "10.1109/CISAT66811.2025.11181817",
    "pmid": "",
    "abstract": "The uneven distribution of medical resources in China poses significant challenges, especially in rural areas. While large language models (LLMs) offer potential for clinical support, existing systems like GPT-4 and Med-PaLM suffer from hallucinations, English-centric biases, and lack real-time physician integration. We present DeepSeek-Med-8B, a Chinese medical conversational agent based on the DeepSeek-R1-DistillLlama-8B architecture. DeepSeek-Med-8B is trained through: (i) Supervised Fine-Tuning (SFT) on curated Chinese medical corpora; (ii) Reinforcement Learning with AI and Doctor Feedback (RLAIF) for factuality, empathy, and referral quality; and (iii) Retrieval-Augmented Generation (RAG) for real-time grounding in physician databases. Across eight clinical tasks, DeepSeek-Med-8B achieves a top1 mean score of 66.9 on GPT-4o-based benchmarks and a 74% top-3 doctor match rate, outperforming rule-based baselines. The model runs efficiently on a single RTX 4090 GPU via INT8 quantization.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Enterprise GenAI: LLM Deployment on AWS",
    "authors": [
      "Sufiyan Shaikh"
    ],
    "year": "2026",
    "journal": "International Journal for Research in Applied Science and Engineering Technology",
    "doi": "10.22214/ijraset.2026.77762",
    "pmid": "",
    "abstract": "Generative AI and Large Language Models (LLMs) have transitioned from experimental prototypes to critical\nenterprise assets, requiring robust, scalable, and secure deployment frameworks. This paper presents a comprehensive survey of\nLLM deployment strategies on Amazon Web Services (AWS), focusing on the shift from consumer-grade to enterprise-ready\narchitectures. We analyze the AWS Generative AI stack, specifically comparing managed serverless approaches via Amazon\nBedrock with customizable infrastructure through Amazon SageMaker. The survey highlights key architectural patterns,\nincluding Retrieval-Augmented Generation (RAG) for grounding models in proprietary data and multi-agent systems for\ncomplex task orchestration. Furthermore, we examine the critical role of LLMOps in managing the model lifecycle, ensuring\nsecurity through Guardrails, and optimizing costs via quantization and provisioned throughput. By synthesizing real-world case\nstudies and performance metrics, this paper provides a scalable roadmap for organizations to implement production-grade\nGenerative AI solutions that maintain data sovereignty and operational excellence.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "“A Midsummer Night’s Dream” quest for truth: From ChatGPT “hallucinations” to RAG reasoning and ACURAI precision — a scoping review on detection, minimizing, and (almost) complete error elimination and enhancing Large Language Models' reliability",
    "authors": [
      "A. Anghelescu",
      "Constantin Munteanu",
      "Lucia Ana Maria Anghelescu",
      "G. Onose"
    ],
    "year": "2025",
    "journal": "Balneo and PRM Research Journal",
    "doi": "10.12680/balneo.2025.847",
    "pmid": "",
    "abstract": "Like A Midsummer Night’s Dream, large language models (LLMs) exhibit vast imagination, drawing on massive training datasets. However, they may fabricate or mix information, lacking mechanisms to verify real-world sources. Most commercial LLMs, including those used in medicine, remain prone to hallucinations—plausible but false content. Retrieval-Augmented Generation (RAG) aims to address this by grounding LLM outputs in real-time access to verified sources like scientific databases. A 2023–2025 PubMed search identified 91 papers on RAG and LLM applications across biomedical domains; 78 were useful for our paper, addressing medical domains. RAG techniques significantly reduce hallucinations by ensuring that only validated information informs model outputs. ACURAI, an advanced system based on “phrase dominance and discrete functional units (DFUs),” further enhances LLM accuracy. Tested on a novel “RAG-Truth Dataset Caveats,” ACURAI eliminated 91–100% of junk outputs in GPT-3.5 and GPT-4. While LLMs can resemble Puck (creative yet unreliable), ACURAI, aided by RAG, acts more like Theseus, grounding answers in verified data. This framework strengthens the possible role of LLMs in clinical diagnosis, academic writing, and patient education, offering a practical path toward safer and more accurate medical AI. Ultimately, human oversight remains key to interpreting and validating AI-generated outputs.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "From LLM Reasoning to Autonomous AI Agents: A Comprehensive Review",
    "authors": [
      "M. Ferrag",
      "N. Tihanyi",
      "M. Debbah"
    ],
    "year": "2025",
    "journal": "ArXiv",
    "doi": "10.48550/arXiv.2504.19678",
    "pmid": "",
    "abstract": "Large language models and autonomous AI agents have evolved rapidly, resulting in a diverse array of evaluation benchmarks, frameworks, and collaboration protocols. Driven by the growing need for standardized evaluation and integration, we systematically consolidate these fragmented efforts into a unified framework. However, the landscape remains fragmented and lacks a unified taxonomy or comprehensive survey. Therefore, we present a side-by-side comparison of benchmarks developed between 2019 and 2025 that evaluate these models and agents across multiple domains. In addition, we propose a taxonomy of approximately 60 benchmarks that cover general and academic knowledge reasoning, mathematical problem-solving, code generation and software engineering, factual grounding and retrieval, domain-specific evaluations, multimodal and embodied tasks, task orchestration, and interactive assessments. Furthermore, we review AI-agent frameworks introduced between 2023 and 2025 that integrate large language models with modular toolkits to enable autonomous decision-making and multi-step reasoning. Moreover, we present real-world applications of autonomous AI agents in materials science, biomedical research, academic ideation, software engineering, synthetic data generation, chemical reasoning, mathematical problem-solving, geographic information systems, multimedia, healthcare, and finance. We then survey key agent-to-agent collaboration protocols, namely the Agent Communication Protocol (ACP), the Model Context Protocol (MCP), and the Agent-to-Agent Protocol (A2A). Finally, we discuss recommendations for future research, focusing on advanced reasoning strategies, failure modes in multi-agent LLM systems, automated scientific discovery, dynamic tool integration via reinforcement learning, integrated search capabilities, and security vulnerabilities in agent protocols.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Auto-GDA: Automatic Domain Adaptation for Efficient Grounding Verification in Retrieval Augmented Generation",
    "authors": [
      "Tobias Leemann",
      "Periklis Petridis",
      "Giuseppe Vietri",
      "Dionysis Manousakas",
      "Aaron Roth",
      "Sergül Aydöre"
    ],
    "year": "2024",
    "journal": "ArXiv",
    "doi": "10.48550/arXiv.2410.03461",
    "pmid": "",
    "abstract": "While retrieval-augmented generation (RAG) has been shown to enhance factuality of large language model (LLM) outputs, LLMs still suffer from hallucination, generating incorrect or irrelevant information. A common detection strategy involves prompting the LLM again to assess whether its response is grounded in the retrieved evidence, but this approach is costly. Alternatively, lightweight natural language inference (NLI) models for efficient grounding verification can be used at inference time. While existing pre-trained NLI models offer potential solutions, their performance remains subpar compared to larger models on realistic RAG inputs. RAG inputs are more complex than most datasets used for training NLI models and have characteristics specific to the underlying knowledge base, requiring adaptation of the NLI models to a specific target domain. Additionally, the lack of labeled instances in the target domain makes supervised domain adaptation, e.g., through fine-tuning, infeasible. To address these challenges, we introduce Automatic Generative Domain Adaptation (Auto-GDA). Our framework enables unsupervised domain adaptation through synthetic data generation. Unlike previous methods that rely on handcrafted filtering and augmentation strategies, Auto-GDA employs an iterative process to continuously improve the quality of generated samples using weak labels from less efficient teacher models and discrete optimization to select the most promising augmented samples. Experimental results demonstrate the effectiveness of our approach, with models fine-tuned on synthetic data using Auto-GDA often surpassing the performance of the teacher model and reaching the performance level of LLMs at 10% of their computational cost.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "MEGA-RAG: a retrieval-augmented generation framework with multi-evidence guided answer refinement for mitigating hallucinations of LLMs in public health",
    "authors": [
      "Shan Xu",
      "Zhaokun Yan",
      "Chengxiao Dai",
      "Fan Wu"
    ],
    "year": "2025",
    "journal": "Frontiers in Public Health",
    "doi": "10.3389/fpubh.2025.1635381",
    "pmid": "41132171",
    "abstract": "Introduction The increasing adoption of large language models (LLMs) in public health has raised significant concerns about hallucinations-factually inaccurate or misleading outputs that can compromise clinical communication and policy decisions. Methods We propose a retrieval-augmented generation framework with multi-evidence guided answer refinement (MEGA-RAG), specifically designed to mitigate hallucinations in public health applications. The framework integrates multi-source evidence retrieval (dense retrieval via FAISS, keyword-based retrieval via BM25, and biomedical knowledge graphs), employs a cross-encoder reranker to ensure semantic relevance, and incorporates a discrepancy-aware refinement module to further enhance factual accuracy. Results Experimental evaluation demonstrates that MEGA-RAG outperforms four baseline models [PubMedBERT, PubMedGPT, standalone LLM, and LLM with standard retrieval-augmented generation (RAG)], achieving a reduction in hallucination rates by over 40%. It also achieves the highest accuracy (0.7913), precision (0.7541), recall (0.8304), and F1 score (0.7904). Discussion These findings confirm that MEGA-RAG is highly effective in generating factually reliable and medically accurate responses, thereby enhancing the credibility of AI-generated health information for applications in health education, clinical communication, and evidence-based policy development.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "UniTor at BioASQ 2025: Modular Biomedical QA with Synthetic Snippets and Multiple Task Answer Generation",
    "authors": [
      "Federico Borazio",
      "Andriy Shcherbakov",
      "D. Croce",
      "Roberto Basili"
    ],
    "year": "2025",
    "journal": "",
    "doi": "",
    "pmid": "",
    "abstract": "",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Diagnosing and Addressing Pitfalls in KG-RAG Datasets: Toward More Reliable Benchmarking",
    "authors": [
      "Liangliang Zhang",
      "Zhuorui Jiang",
      "H. Chi",
      "Haoyang Chen",
      "Mohammed Elkoumy",
      "Fali Wang",
      "Qiong Wu",
      "Zhengyi Zhou",
      "Shirui Pan",
      "Suhang Wang",
      "Yao Ma"
    ],
    "year": "2025",
    "journal": "ArXiv",
    "doi": "10.48550/arXiv.2505.23495",
    "pmid": "",
    "abstract": "Knowledge Graph Question Answering (KGQA) systems rely on high-quality benchmarks to evaluate complex multi-hop reasoning. However, despite their widespread use, popular datasets such as WebQSP and CWQ suffer from critical quality issues, including inaccurate or incomplete ground-truth annotations, poorly constructed questions that are ambiguous, trivial, or unanswerable, and outdated or inconsistent knowledge. Through a manual audit of 16 popular KGQA datasets, including WebQSP and CWQ, we find that the average factual correctness rate is only 57 %. To address these issues, we introduce KGQAGen, an LLM-in-the-loop framework that systematically resolves these pitfalls. KGQAGen combines structured knowledge grounding, LLM-guided generation, and symbolic verification to produce challenging and verifiable QA instances. Using KGQAGen, we construct KGQAGen-10k, a ten-thousand scale benchmark grounded in Wikidata, and evaluate a diverse set of KG-RAG models. Experimental results demonstrate that even state-of-the-art systems struggle on this benchmark, highlighting its ability to expose limitations of existing models. Our findings advocate for more rigorous benchmark construction and position KGQAGen as a scalable framework for advancing KGQA evaluation.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Stream RAG: Instant and Accurate Spoken Dialogue Systems with Streaming Tool Usage",
    "authors": [
      "Siddhant Arora",
      "Haidar Khan",
      "Kai Sun",
      "Xin Dong",
      "Sajal Choudhary",
      "Seungwhan Moon",
      "Xinyuan Zhang",
      "Adithya Sagar",
      "S. Appini",
      "Kaushik Patnaik",
      "Sanat Sharma",
      "Shinji Watanabe",
      "Anuj Kumar",
      "Ahmed A Aly",
      "Yue Liu",
      "Florian Metze",
      "Zhaojiang Lin"
    ],
    "year": "2025",
    "journal": "ArXiv",
    "doi": "10.48550/arXiv.2510.02044",
    "pmid": "",
    "abstract": "End-to-end speech-in speech-out dialogue systems are emerging as a powerful alternative to traditional ASR-LLM-TTS pipelines, generating more natural, expressive responses with significantly lower latency. However, these systems remain prone to hallucinations due to limited factual grounding. While text-based dialogue systems address this challenge by integrating tools such as web search and knowledge graph APIs, we introduce the first approach to extend tool use directly into speech-in speech-out systems. A key challenge is that tool integration substantially increases response latency, disrupting conversational flow. To mitigate this, we propose Streaming Retrieval-Augmented Generation (Streaming RAG), a novel framework that reduces user-perceived latency by predicting tool queries in parallel with user speech, even before the user finishes speaking. Specifically, we develop a post-training pipeline that teaches the model when to issue tool calls during ongoing speech and how to generate spoken summaries that fuse audio queries with retrieved text results, thereby improving both accuracy and responsiveness. To evaluate our approach, we construct AudioCRAG, a benchmark created by converting queries from the publicly available CRAG dataset into speech form. Experimental results demonstrate that our streaming RAG approach increases QA accuracy by up to 200% relative (from 11.1% to 34.2% absolute) and further enhances user experience by reducing tool use latency by 20%. Importantly, our streaming RAG approach is modality-agnostic and can be applied equally to typed input, paving the way for more agentic, real-time AI assistants.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "LLM-Driven Learner Modeling and Personalized Learning Pathways: A Closed-Loop Framework and Engineering Design for Virtual Laboratories",
    "authors": [
      "Ruijie Wang",
      "Guangtao Xu"
    ],
    "year": "2025",
    "journal": "2025 International Conference on Educational Technology Management (ICETM)",
    "doi": "10.1109/ICETM67477.2025.11413398",
    "pmid": "",
    "abstract": "Focusing on virtual experiment teaching, this paper proposes a personalized learning closed-loop with LLM as the core. A simulation engine provides a verifiable factual baseline, while the LLM undertakes semantic interpretation, two-phase pathway generation (skeleton-verification-refinement), fact-grounded judgement and feedback, and explanatory summarization. To enhance robustness and compliance, the framework employs retrieval-augmented generation (RAG), structured outputs, and a second-pass verifier as guardrails. At the learner-modeling layer, we fuse LLM semantic increments with BKT/IRT steady estimates to obtain a fine-grained yet stable representation that drives adaptive replanning. The engineering design covers windowed reporting and fact checks, an orchestration service with template interfaces, result caching and tiered inference (small model first), minimal-necessary data collection with anonymization, and classroom-oriented batching and rate limiting. Although large scale evaluation remains for future work, the framework connects the key chain “interpretation—modeling—path—judgement—explanation,” demonstrating interpretability, controllability, and deployment feasibility.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Principled Context Engineering for RAG: Statistical Guarantees via Conformal Prediction",
    "authors": [
      "Debashish Chakraborty",
      "Eugene Yang",
      "Daniel Khashabi",
      "Dawn J. Lawrie",
      "Kevin Duh"
    ],
    "year": "2025",
    "journal": "ArXiv",
    "doi": "10.1007/978-3-032-21300-6_45",
    "pmid": "",
    "abstract": "Retrieval-Augmented Generation (RAG) enhances factual grounding in large language models (LLMs) by incorporating retrieved evidence, but LLM accuracy declines when long or noisy contexts exceed the model's effective attention span. Existing pre-generation filters rely on heuristics or uncalibrated LLM confidence scores, offering no statistical control over retained evidence. We evaluate and demonstrate context engineering through conformal prediction, a coverage-controlled filtering framework that removes irrelevant content while preserving recall of supporting evidence. Using both embedding- and LLM-based scoring functions, we test this approach on the NeuCLIR and RAGTIME collections. Conformal filtering consistently meets its target coverage, ensuring that a specified fraction of relevant snippets are retained, and reduces retained context by 2-3x relative to unfiltered retrieval. On NeuCLIR, downstream factual accuracy measured by ARGUE F1 improves under strict filtering and remains stable at moderate coverage, indicating that most discarded material is redundant or irrelevant. These results demonstrate that conformal prediction enables reliable, coverage-controlled context reduction in RAG, offering a model-agnostic and principled approach to context engineering.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "RAGalyst: Automated Human-Aligned Agentic Evaluation for Domain-Specific RAG",
    "authors": [
      "Joshua Gao",
      "Quoc Huy Pham",
      "Subin Varghese",
      "Silwal Saurav",
      "Vedhus Hoskere"
    ],
    "year": "2025",
    "journal": "ArXiv",
    "doi": "10.48550/arXiv.2511.04502",
    "pmid": "",
    "abstract": "Retrieval-Augmented Generation (RAG) is a critical technique for grounding Large Language Models (LLMs) in factual evidence, yet evaluating RAG systems in specialized, safety-critical domains remains a significant challenge. Existing evaluation frameworks often rely on heuristic-based metrics that fail to capture domain-specific nuances and other works utilize LLM-as-a-Judge approaches that lack validated alignment with human judgment. This paper introduces RAGalyst, an automated, human-aligned agentic framework designed for the rigorous evaluation of domain-specific RAG systems. RAGalyst features an agentic pipeline that generates high-quality, synthetic question-answering (QA) datasets from source documents, incorporating an agentic filtering step to ensure data fidelity. The framework refines two key LLM-as-a-Judge metrics-Answer Correctness and Answerability-using prompt optimization to achieve a strong correlation with human annotations. Applying this framework to evaluate various RAG components across three distinct domains (military operations, cybersecurity, and bridge engineering), we find that performance is highly context-dependent. No single embedding model, LLM, or hyperparameter configuration proves universally optimal. Additionally, we provide an analysis on the most common low Answer Correctness reasons in RAG. These findings highlight the necessity of a systematic evaluation framework like RAGalyst, which empowers practitioners to uncover domain-specific trade-offs and make informed design choices for building reliable and effective RAG systems. RAGalyst is available on our Github.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Retrieval-Augmented Multi-LLM Ensemble for Industrial Part Specification Extraction",
    "authors": [
      "Muzakkiruddin Ahmed Mohammed",
      "John R. Talburt",
      "Leon Claassens",
      "Adriaan Marais"
    ],
    "year": "2025",
    "journal": "2025 17th International Conference on Knowledge and System Engineering (KSE)",
    "doi": "10.1109/KSE68178.2025.11309590",
    "pmid": "",
    "abstract": "Industrial part specification extraction from unstructured text remains a persistent challenge in manufacturing, procurement, and maintenance, where manual processing is both time-consuming and error-prone. This paper introduces RAGsemble, a retrieval-augmented multi-LLM ensemble framework that orchestrates nine state-of-the-art Large Language Models (LLMs) within a structured three-phase pipeline. RAGsemble addresses key limitations of single-model systems by combining the complementary strengths of model families including Gemini (2.0, 2.5, 1.5), OpenAI (GPT-4o, o4-mini), Mistral Large, and Gemma (1B, 4B, 3n-e4b), while grounding outputs in factual data using FAISS-based semantic retrieval. The system architecture consists of three stages: (1) parallel extraction by diverse LLMs, (2) targeted research augmentation leveraging high-performing models, and (3) intelligent synthesis with conflict resolution and confidence-aware scoring. RAG integration provides real-time access to structured part databases, enabling the system to validate, refine, and enrich outputs through similarity-based reference retrieval. Experimental results using real industrial datasets demonstrate significant gains in extraction accuracy, technical completeness, and structured output quality compared to leading single-LLM baselines. Key contributions include a scalable ensemble architecture for industrial domains, seamless RAG integration throughout the pipeline, comprehensive quality assessment mechanisms, and a production-ready solution suitable for deployment in knowledge-intensive manufacturing environments.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Artificial Intelligence for Quantitative Finance: A RAG-Augmented Multi-Agent Framework for Robust Equity Strategy Discovery",
    "authors": [
      "Jianfei Wang",
      "Hualin Li"
    ],
    "year": "2025",
    "journal": "Proceedings of the 2025 9th International Conference on Computer Science and Artificial Intelligence",
    "doi": "10.1145/3788149.3788249",
    "pmid": "",
    "abstract": "This paper introduces an AI-driven multi-agent framework for automated quantitative strategy generation and validation, integrating large language model (LLM) agents with Retrieval-Augmented Generation (RAG) to enhance factual accuracy and research reliability. The system orchestrates specialized agents for market analysis, financial knowledge retrieval, feature engineering, strategy construction, backtesting, and performance interpretation, establishing a comprehensive autonomous investment research pipeline. By grounding each agent's reasoning in a curated financial knowledge base, the framework mitigates hallucination risks, improves decision consistency, and provides transparent explanations of trading logic and risk characteristics. Experimental evaluation on simulated equity market data demonstrates that the RAG-enhanced multi-agent system achieves superior performance, generating a +3.25% return with improved consistency compared to single-agent (+2.15%) and non-RAG multi-agent (+1.82%) variants. The RAG-augmented approach exhibits higher win rates (51.2% vs 48-49%) and more robust risk-adjusted returns, validating its effectiveness in producing interpretable and reliable trading strategies. These results underscore the transformative potential of knowledge-grounded multi-agent AI systems in modern quantitative finance, while highlighting the framework's adaptability across different market environments without dependency on specific asset classes or historical periods.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Mitigating Artificial Intelligence Hallucinations in Education: A Comparative Study of Retrieval-Augmented Generation (RAG) and Large Language Models",
    "authors": [
      "Pei-hua Chen",
      "Yuen-Min Huang",
      "Ting‐Ting Wu",
      "Hsin‐Yu Lee"
    ],
    "year": "2025",
    "journal": "2025 7th International Conference on Modern Educational Technology (ICMET)",
    "doi": "10.1109/ICMET67594.2025.11451842",
    "pmid": "",
    "abstract": "The integration of Large Language Models (LLMs) into educational technologies promises to revolutionize personalized learning. However, their propensity for “hallucination,” the generation of factually incorrect or nonsensical information, poses a significant risk to knowledge integrity and student trust. To address this critical challenge, this paper investigates Retrieval-Augmented Generation (RAG), an architectural approach that mitigates hallucinations by grounding the model's responses in factual data. Before generating an answer, the RAG framework retrieves relevant information from a verified knowledge base, ensuring the output is contextually accurate. We conducted a comparative study between a standard LLM and a RAG-powered system whose knowledge base was populated with specific curriculum materials. The generated answers to domain-specific questions were assessed through two distinct lenses: a quantitative evaluation of factual accuracy by subject matter experts and a qualitative analysis of student perceptions gathered through semi-structured interviews. Our preliminary findings indicate that the RAG model demonstrates a marked improvement in factual accuracy and a significant reduction in hallucinatory content. Furthermore, students perceived the RAG-generated responses as more trustworthy and useful for their learning. This research provides empirical evidence for the necessity of adopting RAG in educational AI, offering a pathway toward developing more reliable and effective digital learning tools.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Enhancing Truth with AI: Evaluating ML, LLMs, and RAG in Combating Misinformation",
    "authors": [
      "Manisha",
      "Manisha Jailia"
    ],
    "year": "2025",
    "journal": "2025 2nd International Conference on Advanced Computing and Emerging Technologies (ACET)",
    "doi": "10.1109/ACET67282.2025.11430189",
    "pmid": "",
    "abstract": "In the digital age, misinformation is a new danger. It impacts how people talk about things, harms democracy, and changes how people think about the government. This study answers the important demand for better ways to find false information by comparing classic ML models with the newest LLMs and then creating a hybrid framework that contains the best parts of both. Unfortunately, classic classifiers like Naive Bayes and the Passive-Aggressive Classifier often miss content that is both hostile and full of context. LLMs like GPT-3.5 and GPT-4, on the other hand, are very accurate and know what words mean, but they need a lot of processing power and can make stuff up in their outputs. Adding Retrieval-Augmented Generation (RAG) to the LLM pipeline fixes these issues. Retrieval-Augmented Generation (RAG) plays a crucial role by grounding LLM outputs in verified external knowledge, reducing hallucination and improving factual accuracy. By combining retrieval with generation, RAG ensures that misinformation detection becomes more reliable, context-aware, and aligned with real-world evidence. Our hybrid technique works better than existing algorithms on benchmark datasets including LIAR, and BuzzFeed. It has an F1 score of 95.9% and an accuracy score of 96.3%. It also keeps inference times in check. Also, SHAP analysis and attention weight visualisation make things easier to comprehend. This study looks at the good and bad sides of the present ways of identifying misinformation and suggests a solution that is easy to comprehend and can be applied in the real world.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "LightRAG-Driven Medical QA: Leveraging Domain-Specific for Efficient LLM Reasoning",
    "authors": [
      "Rishabh Kushwaha",
      "Reshma Swain",
      "Bal Krishna Saraswat"
    ],
    "year": "2025",
    "journal": "2025 5th International Conference on Advancement in Electronics & Communication Engineering (AECE)",
    "doi": "10.1109/AECE67531.2025.11386653",
    "pmid": "",
    "abstract": "Large Language Model (LLM) have demonstrated great capabilities in medical question answering. But fake facts and projections restrict the efficiency of their real world. Past studies have incorporated RAG methods such as Naive RAG and GraphRAG to enhance the aspect of factual grounding. Efficiency retention and recall and precision optimization are challenging to these particular methods. In this paper, we apply the LightRAG framework to the medical domain as a lightweight and domain-specific retrieval procedure that enhances the reasoning skills of LLMs. BERTScore used to evaluate LightRAG against Naive RAG and GraphRAG based on semantic similarity evaluations and retrieval scores. The findings demonstrate that LightRAG works better with the F1 score of 0.83, Recall 0.85 and Precision 0.81. It is more contextually retrieved than Naive RAG and more accurate and stable than Graph-RAG. Providing a superior tradeoff between accuracy and completeness, LightRAG offers correct and comprehensive answers to medical questions. Dataset take from Gale resource and content length 3,53,624. Such results indicate that LightRAG may help professionals, assist students and common people.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "A Hybrid GNN-LLM Framework for Correlating Cybersecurity Incidents",
    "authors": [
      "Lina Baha",
      "Amine Mammasse",
      "Oualid Saci"
    ],
    "year": "2025",
    "journal": "2025 Fourth International Conference on Theoretical and Applicative Aspects of Computer Science (ICTAACS)",
    "doi": "10.1109/ICTAACS69003.2025.11399321",
    "pmid": "",
    "abstract": "High volumes of alerts from Intrusion Detection Systems (IDS) cause significant \"alert fatigue\" among security analysts, hindering the identification of genuine incidents. Existing automated correlation methods often lack the semantic context and explainability needed for effective response. This paper presents a hybrid framework that integrates Graph Neural Networks (GNNs) and Large Language Models (LLMs) to correlate and explain security alerts. The approach constructs an alert graph using hybrid node features that fuse structured data with semantic embeddings. A GraphSAGE model is trained for link prediction to identify correlated alerts. Crucially, the framework implements a feedback loop where the GNN’s predictions serve as factual grounding for a Retrieval-Augmented Generation (RAG) module, producing human-readable justifications and actionable recommendations. Evaluated on the CIC-IDS 2017 dataset, the model achieves an Area Under the Curve (AUC) of 0.9731 and an accuracy of 90.02%. We demonstrate its ability to group alerts into coherent incidents, bridging the gap between automated detection and human-centric incident response.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "A Systematic Review of Key Retrieval-Augmented Generation (RAG) Systems: Progress, Gaps, and Future Directions",
    "authors": [
      "Agada Joseph Oche",
      "Ademola Glory Folashade",
      "Tirthankar Ghosal",
      "Arpan Biswas"
    ],
    "year": "2025",
    "journal": "ArXiv",
    "doi": "10.48550/arXiv.2507.18910",
    "pmid": "",
    "abstract": "Retrieval-Augmented Generation (RAG) represents a major advancement in natural language processing (NLP), combining large language models (LLMs) with information retrieval systems to enhance factual grounding, accuracy, and contextual relevance. This paper presents a comprehensive systematic review of RAG, tracing its evolution from early developments in open domain question answering to recent state-of-the-art implementations across diverse applications. The review begins by outlining the motivations behind RAG, particularly its ability to mitigate hallucinations and outdated knowledge in parametric models. Core technical components-retrieval mechanisms, sequence-to-sequence generation models, and fusion strategies are examined in detail. A year-by-year analysis highlights key milestones and research trends, providing insight into RAG's rapid growth. The paper further explores the deployment of RAG in enterprise systems, addressing practical challenges related to retrieval of proprietary data, security, and scalability. A comparative evaluation of RAG implementations is conducted, benchmarking performance on retrieval accuracy, generation fluency, latency, and computational efficiency. Persistent challenges such as retrieval quality, privacy concerns, and integration overhead are critically assessed. Finally, the review highlights emerging solutions, including hybrid retrieval approaches, privacy-preserving techniques, optimized fusion strategies, and agentic RAG architectures. These innovations point toward a future of more reliable, efficient, and context-aware knowledge-intensive NLP systems.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "GNN-RAG: Graph Neural Retrieval for Large Language Model Reasoning",
    "authors": [
      "Costas Mavromatis",
      "George Karypis"
    ],
    "year": "2024",
    "journal": "ArXiv",
    "doi": "10.48550/arXiv.2405.20139",
    "pmid": "",
    "abstract": "Knowledge Graphs (KGs) represent human-crafted factual knowledge in the form of triplets (head, relation, tail), which collectively form a graph. Question Answering over KGs (KGQA) is the task of answering natural questions grounding the reasoning to the information provided by the KG. Large Language Models (LLMs) are the state-of-the-art models for QA tasks due to their remarkable ability to understand natural language. On the other hand, Graph Neural Networks (GNNs) have been widely used for KGQA as they can handle the complex graph information stored in the KG. In this work, we introduce GNN-RAG, a novel method for combining language understanding abilities of LLMs with the reasoning abilities of GNNs in a retrieval-augmented generation (RAG) style. First, a GNN reasons over a dense KG subgraph to retrieve answer candidates for a given question. Second, the shortest paths in the KG that connect question entities and answer candidates are extracted to represent KG reasoning paths. The extracted paths are verbalized and given as input for LLM reasoning with RAG. In our GNN-RAG framework, the GNN acts as a dense subgraph reasoner to extract useful graph information, while the LLM leverages its natural language processing ability for ultimate KGQA. Furthermore, we develop a retrieval augmentation (RA) technique to further boost KGQA performance with GNN-RAG. Experimental results show that GNN-RAG achieves state-of-the-art performance in two widely used KGQA benchmarks (WebQSP and CWQ), outperforming or matching GPT-4 performance with a 7B tuned LLM. In addition, GNN-RAG excels on multi-hop and multi-entity questions outperforming competing approaches by 8.9--15.5% points at answer F1.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Chunking, Retrieval, and Re-ranking: An Empirical Evaluation of RAG Architectures for Policy Document Question Answering",
    "authors": [
      "A. Maharjan",
      "Umesh Yadav"
    ],
    "year": "2026",
    "journal": "ArXiv",
    "doi": "10.48550/arXiv.2601.15457",
    "pmid": "",
    "abstract": "The integration of Large Language Models (LLMs) into the public health policy sector offers a transformative approach to navigating the vast repositories of regulatory guidance maintained by agencies such as the Centers for Disease Control and Prevention (CDC). However, the propensity for LLMs to generate hallucinations, defined as plausible but factually incorrect assertions, presents a critical barrier to the adoption of these technologies in high-stakes environments where information integrity is non-negotiable. This empirical evaluation explores the effectiveness of Retrieval-Augmented Generation (RAG) architectures in mitigating these risks by grounding generative outputs in authoritative document context. Specifically, this study compares a baseline Vanilla LLM against Basic RAG and Advanced RAG pipelines utilizing cross-encoder re-ranking. The experimental framework employs a Mistral-7B-Instruct-v0.2 model and an all-MiniLM-L6-v2 embedding model to process a corpus of official CDC policy analytical frameworks and guidance documents. The analysis measures the impact of two distinct chunking strategies, recursive character-based and token-based semantic splitting, on system accuracy, measured through faithfulness and relevance scores across a curated set of complex policy scenarios. Quantitative findings indicate that while Basic RAG architectures provide a substantial improvement in faithfulness (0.621) over Vanilla baselines (0.347), the Advanced RAG configuration achieves a superior faithfulness average of 0.797. These results demonstrate that two-stage retrieval mechanisms are essential for achieving the precision required for domain-specific policy question answering, though structural constraints in document segmentation remain a significant bottleneck for multi-step reasoning tasks.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "GRACE-RAG: Graph Retrieval with Adaptive Chunk Extraction for Long-Context Question Answering",
    "authors": [
      "Tianwei Huang",
      "Shuai Lei",
      "Askar Hamdulla",
      "Chunxiao Gao",
      "Huaping Zhang"
    ],
    "year": "2026",
    "journal": "2026 International Conference on Communication Networks and Machine Learning (CNML)",
    "doi": "10.1109/CNML68938.2026.11452294",
    "pmid": "",
    "abstract": "Retrieval-augmented generation (RAG) improves factuality by grounding large language models (LLMs) on external corpora, but it still struggles with multi-hop reasoning and long-context overload. We propose GRACE-RAG, a two-stage framework that (i) builds a fine-grained chunk–sentence–entity graph via dynamic chunking during offline indexing, (ii) performs query-aware entity activation and personalized PageRank for associative retrieval online, and (iii) adaptively compresses retrieved evidence under a token budget before answer generation. Experiments on seven QA benchmarks show consistent gains over strong graph-based RAG baselines.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Design and Implementation of a RAG Chatbot System for Scientific Research Institutes",
    "authors": [
      "Igor Radulović",
      "Jovana Mitrić",
      "Katarina Kovijanić",
      "Mija Ljuka",
      "Nejra Merdović",
      "Madžida Hundur Hiyari",
      "A. Badnjević"
    ],
    "year": "2026",
    "journal": "2026 30th International Conference on Information Technology (IT)",
    "doi": "10.1109/IT67293.2026.11435604",
    "pmid": "",
    "abstract": "This paper presents the design and implementation of a prototype chatbot system based on the Retrieval-Augmented Generation (RAG) architecture, applied in a scientific research institute to improve knowledge access. The system combines semantic search over a vector knowledge base with response generation using large language models, enabling contextually relevant institutional information. A case study was conducted to evaluate the prototype in a real-world environment. Results indicate improved factual grounding compared to an LLM-only baseline within the evaluated dataset, although the evaluation was limited to a small set of queries and a single institutional document collection.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "SRAG: RAG with Structured Data Improves Vector Retrieval",
    "authors": [
      "Shalin Shah",
      "Srikanth Ryali",
      "Ramasubbu Venkatesh"
    ],
    "year": "2026",
    "journal": "",
    "doi": "",
    "pmid": "",
    "abstract": "Retrieval Augmented Generation (RAG) provides the necessary informational grounding to LLMs in the form of chunks retrieved from a vector database or through web search. RAG could also use knowledge graph triples as a means of providing factual information to an LLM. However, the retrieval is only based on representational similarity between a question and the contents. The performance of RAG depends on the numeric vector representations of the query and the chunks. To improve these representations, we propose Structured RAG (SRAG), which adds structured information to a query as well as the chunks in the form of topics, sentiments, query and chunk types (e.g., informational, quantitative), knowledge graph triples and semantic tags. Experiments indicate that this method significantly improves the retrieval process. Using GPT-5 as an LLM-as-a-judge, results show that the method improves the score given to answers in a question answering system by 30% (p-value = 2e-13) (with tighter bounds). The strongest improvement is in comparative, analytical and predictive questions. The results suggest that our method enables broader, more diverse, and episodic-style retrieval. Tail risk analysis shows that SRAG attains very large gains more often, with losses remaining minor in magnitude.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Rational Synthesizers or Heuristic Followers? Analyzing LLMs in RAG-based Question-Answering",
    "authors": [
      "Atharv Naphade"
    ],
    "year": "2026",
    "journal": "ArXiv",
    "doi": "10.48550/arXiv.2601.06189",
    "pmid": "",
    "abstract": "Retrieval-Augmented Generation (RAG) is the prevailing paradigm for grounding Large Language Models (LLMs), yet the mechanisms governing how models integrate groups of conflicting retrieved evidence remain opaque. Does an LLM answer a certain way because the evidence is factually strong, because of a prior belief, or merely because it is repeated frequently? To answer this, we introduce GroupQA, a curated dataset of 1,635 controversial questions paired with 15,058 diversely-sourced evidence documents, annotated for stance and qualitative strength. Through controlled experiments, we characterize group-level evidence aggregation dynamics: Paraphrasing an argument can be more persuasive than providing distinct independent support; Models favor evidence presented first rather than last, and Larger models are increasingly resistant to adapt to presented evidence. Additionally, we find that LLM explanations to group-based answers are unfaithful. Together, we show that LLMs behave consistently as vulnerable heuristic followers, with direct implications for improving RAG system design.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Ingest-And-Ground: Dispelling Hallucinations from Continually-Pretrained LLMs with RAG",
    "authors": [
      "Chenhao Fang",
      "Derek Larson",
      "Shitong Zhu",
      "Sophie Zeng",
      "Wendy Summer",
      "Yanqing Peng",
      "Yuriy Hulovatyy",
      "Rajeev Rao",
      "Gabriel Forgues",
      "Arya Pudota",
      "Alex Goncalves",
      "Hervé Robert"
    ],
    "year": "2024",
    "journal": "ArXiv",
    "doi": "10.48550/arXiv.2410.02825",
    "pmid": "",
    "abstract": "This paper presents new methods that have the potential to improve privacy process efficiency with LLM and RAG. To reduce hallucination, we continually pre-train the base LLM model with a privacy-specific knowledge base and then augment it with a semantic RAG layer. Our evaluations demonstrate that this approach enhances the model performance (as much as doubled metrics compared to out-of-box LLM) in handling privacy-related queries, by grounding responses with factual information which reduces inaccuracies.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Engineering RAG Systems for Real-World Applications: Design, Development, and Evaluation",
    "authors": [
      "Md Toufique Hasan",
      "Muhammad Waseem",
      "Kai-Kristian Kemell",
      "A. Khan",
      "Mika Saari",
      "Pekka Abrahamsson"
    ],
    "year": "2025",
    "journal": "ArXiv",
    "doi": "10.1007/978-3-032-04200-2_10",
    "pmid": "",
    "abstract": "Retrieval-Augmented Generation (RAG) systems are emerging as a key approach for grounding Large Language Models (LLMs) in external knowledge, addressing limitations in factual accuracy and contextual relevance. However, there is a lack of empirical studies that report on the development of RAG-based implementations grounded in real-world use cases, evaluated through general user involvement, and accompanied by systematic documentation of lessons learned. This paper presents five domain-specific RAG applications developed for real-world scenarios across governance, cybersecurity, agriculture, industrial research, and medical diagnostics. Each system incorporates multilingual OCR, semantic retrieval via vector embeddings, and domain-adapted LLMs, deployed through local servers or cloud APIs to meet distinct user needs. A web-based evaluation involving a total of 100 participants assessed the systems across six dimensions: (i) Ease of Use, (ii) Relevance, (iii) Transparency, (iv) Responsiveness, (v) Accuracy, and (vi) Likelihood of Recommendation. Based on user feedback and our development experience, we documented twelve key lessons learned, highlighting technical, operational, and ethical challenges affecting the reliability and usability of RAG systems in practice.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Turk-LettuceDetect: A Hallucination Detection Model for Turkish RAG Applications",
    "authors": [
      "Selva Tas",
      "Mahmut El Huseyni",
      "Özay Ezerceli",
      "Reyhan Bayraktar",
      "Fatma Betül Terzioglu"
    ],
    "year": "2025",
    "journal": "2025 3rd International Conference on Foundation and Large Language Models (FLLM)",
    "doi": "10.1109/FLLM67465.2025.11390913",
    "pmid": "",
    "abstract": "The widespread adoption of Large Language Models (LLMs) has been hindered by their tendency to hallucinate, generating plausible but factually incorrect information. While Retrieval-Augmented Generation (RAG) systems attempt to address this issue by grounding responses in external knowledge, hallucination remains a persistent challenge, particularly for morphologically complex, low-resource languages like Turkish. This paper introduces Turk-LettuceDetect, the first suite of hallucination detection models specifically designed for Turkish RAG applications. Building on the LettuceDetect framework, we formulate hallucination detection as a token-level classification task and fine-tune three distinct encoder architectures: a Turkishspecific ModernBERT, TurkEmbed4STS, and multilingual EuroBERT. These models were trained on a machine-translated version of the RAGTruth benchmark dataset containing 17,790 instances across question answering, data-to-text generation, and summarization tasks. Our experimental results show that the ModernBERT-based model achieves an F1-score of 0.7266 on the complete test set, with particularly strong performance on structured tasks. The models maintain computational efficiency while supporting long contexts up to 8,192 tokens, making them suitable for real-time deployment. Comparative analysis reveals that while state-of-the-art LLMs demonstrate high recall, they suffer from low precision due to over-generation of hallucinated content, underscoring the necessity of specialized detection mechanisms. By releasing our models and translated dataset, this work addresses a critical gap in multilingual NLP and establishes a foundation for developing more reliable and trustworthy AI applications for Turkish and other languages.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Benchmarking Vector, Graph and Hybrid Retrieval Augmented Generation (RAG) Pipelines for Open Radio Access Networks (ORAN)",
    "authors": [
      "Sarat Ahmad",
      "Zeinab Nezami",
      "Maryam Hafeez",
      "S. A. R. Zaidi"
    ],
    "year": "2025",
    "journal": "2025 IEEE 36th International Symposium on Personal, Indoor and Mobile Radio Communications (PIMRC)",
    "doi": "10.1109/PIMRC62392.2025.11274810",
    "pmid": "",
    "abstract": "Generative AI (GenAI) is expected to play a pivotal role in enabling autonomous optimization in future wireless networks. Within the ORAN architecture, Large Language Models (LLMs) can be specialized to generate xApps and rApps by leveraging specifications and API definitions from the RAN Intelligent Controller (RIC) platform. However, fine-tuning base LLMs for telecom-specific tasks remains expensive and resource-intensive. Retrieval-Augmented Generation (RAG) offers a practical alternative through in-context learning, enabling domain adaptation without full retraining. While traditional RAG systems rely on vector-based retrieval, emerging variants such as GraphRAG and Hybrid GraphRAG incorporate knowledge graphs or dual retrieval strategies to support multi-hop reasoning and improve factual grounding. Despite their promise, these methods lack systematic, metric-driven evaluations, particularly in high-stakes domains such as ORAN. In this study, we conduct a comparative evaluation of Vector RAG, GraphRAG, and Hybrid GraphRAG using ORAN specifications. We assess performance across varying question complexities using established generation metrics: faithfulness, answer relevance, context relevance, and factual correctness. Results show that both GraphRAG and Hybrid GraphRAG outperform traditional RAG. Hybrid GraphRAG improves factual correctness by 8%, while GraphRAG improves context relevance by 11%.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Medical Graph-RAG: Bilingual Graph-Based Reasoning for Cardiological Intelligence",
    "authors": [
      "Leen I. A. Shaqalaih",
      "Omar Belal",
      "Fatma Küçük",
      "Yiğit Tuncer",
      "M. Ganiz"
    ],
    "year": "2025",
    "journal": "2025 International Conference on INnovations in Intelligent SysTems and Applications (INISTA)",
    "doi": "10.1109/INISTA68122.2025.11249583",
    "pmid": "",
    "abstract": "Clinical question answering requires factual accuracy, transparency, and evidence that is traceable. Despite recent progress, Large Language Models (LLMs) still hallucinate and struggle with specialized terminology. Retrieval-Augmented Generation (RAG) mitigates this by grounding answers in external sources, but conventional RAG neglects the rich relational structure of medical knowledge. Building on MedGraphRAG, an evidence-focused, graph-based RAG framework, we present the first bilingual (English-Turkish) adaptation. Our study differs from the original in three key ways. First, we substitute GPT-4o-mini for GPT-4 as the generator. Second, instead of the MedC-K repository used in the original work, instead curated a much smaller set of cardiology textbooks and open-access articles. Third, we additionally evaluate a medical specialist model: MedGemma. We further extend the system by translating MIMIC-IV clinical notes and a subset of UMLS concepts into Turkish, enabling Turkish medical graph construction and retrieval. Experiments show that in English, MedGraphRAG retains strong gains over baseline. In Turkish, performance degrades relative to English. We observe that the largest driver of performance may be due to the size/coverage of the upper repository (medical reports). MedGemma-27B underperforms GPT-4o-mini in our setup on text-only MCQs. Overall, results highlight the sensitivity of MedGraphRAG to the breadth of curated medical sources and provide the first systematic assessment for Turkish.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "A Survey of Retrieval-Augmented Generation (RAG) for Large Language Models",
    "authors": [
      "Yusong Ma",
      "Hongxuan Nie",
      "Chao Chen",
      "Jiujie Zhang",
      "Jiali Jiang",
      "Bisheng Wang",
      "Yuqi Xia"
    ],
    "year": "2025",
    "journal": "2025 International Conference on Trustworthy Big Data and Artificial Intelligence (ICTBAI)",
    "doi": "10.1109/ICTBAI68361.2025.00008",
    "pmid": "",
    "abstract": "While Large Language Models (LLMs) are revolutionary, their deployment is constrained by inherent limitations such as factual hallucination and static knowledge. This survey systematically reviews Retrieval-Augmented Generation (RAG), a key paradigm for addressing these challenges by grounding LLMs in external, verifiable knowledge. To overcome the flaws of standalone models, RAG integrates LLMs with updatable knowledge bases, a hybrid approach that significantly enhances output accuracy and trustworthiness. Our primary finding is the technology’s clear evolutionary trajectory, which we structure into three stages: Naive, Advanced, and Modular RAG. This progression demonstrates a shift away from monolithic parametric memory towards intelligent systems that interact with external data. By summarizing the field’s progression, key challenges like retriever-generator alignment, and future directions such as integration with agentic architectures, this work concludes that RAG is a crucial technology for propelling AI to be more evidence-based and capable of complex reasoning.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "LLMs and LVMs for agentic AI: a GPU-accelerated multimodal system architecture for RAG-grounded, explainable, and adaptive intelligence",
    "authors": [
      "Kiarash Ahi",
      "Chih-Hung Hsieh",
      "G. Fenger"
    ],
    "year": "2025",
    "journal": "",
    "doi": "10.1117/12.3078485",
    "pmid": "",
    "abstract": "This paper presents an architecture for an Agentic AI System that autonomously operates and manages complex workflows across enterprise and industrial software ecosystems such as Electronic Design Automation (EDA) tools (e.g., Siemens Calibre), Product Lifecycle Management (PLM) and Digital Twin platforms (e.g., Teamcenter Digital Reality Viewer), as well as knowledge-centric domains including HR analytics, financial modeling, healthcare diagnostics, and creative design platforms. This architecture leverages a multi-agent framework orchestrated by a central planner, integrating large language model (LLM) and large vision model (LVM) reasoning for multimodal understanding, retrieval-augmented generation (RAG) pipelines, and enterprise-grade governance to enable secure, explainable, and adaptive automation across both physical and virtual product lifecycle stages. The architecture is structured as a nine-layer intelligent stack, beginning with a natural language interface and extending through layers responsible for cognitive orchestration, specialized agents, contextual retrieval, reasoning, tool execution, security, access control, and feedback-driven learning. Users issue high-level intents—such as “run DRC and fix critical violations” or “synchronize the latest design update with the digital twin”— which are interpreted by the planner agent and decomposed into sub-tasks. These are executed by specialized agents (e.g., simulation, review, or action agents), each interfacing securely with industrial tools and twin environments through sandboxed runtimes and version-controlled APIs. The planner dynamically adjusts task decomposition and agent routing based on resource constraints, latency budgets, and model confidence, enabling adaptive, performance-aware orchestration. Beyond industrial and engineering use cases, the same agentic architecture generalizes to broader enterprise workflows. \nIn HR and finance, autonomous agents extract insights from structured and unstructured data, improve forecasting accuracy, and ensure regulatory compliance. In healthcare, multimodal reasoning that fuses text, imagery, and sensor data can assist clinicians in diagnosis and treatment planning while maintaining explainability. In creative and design environments, agentic co-pilots interpret user intent, generate assets, and optimize iterative design loops—enhancing both productivity and human creativity. A core RAG layer grounds decisions in proprietary engineering knowledge (e.g., PDK rules, fab specifications, simulation logs, and historical twin data), while a chunk reranker ensures only the most relevant context is injected into LLM prompts. This RAG pipeline supports fast memory access, context pruning, and scalable grounding across high-volume logs and digital twin telemetry. This grounding layer can be extended to any domain where contextual reasoning over proprietary knowledge is critical—ranging from clinical data repositories and enterprise ERPs to document archives and financial transaction graphs. To support this architecture’s adaptive orchestration and multimodal agent execution, performance-optimized inference becomes critical. To meet the latency, throughput, and scalability demands of large-scale multimodal reasoning, the system incorporates GPU-accelerated inference pipelines, including ROI-guided compression and adaptive latent-space clustering to reduce computational overhead while preserving output fidelity. These GPU-accelerated strategies are based on the ROI-LCC framework, which integrates dynamic Region of Interest (ROI) selection, latentspace clustering, and learned GPU feature extraction to minimize redundancy and streamline computation. Outputs are processed through a guardrails and explainability (XAI) layer that filters unsafe content, validates decisions, and generates structured audit trails. \nThe system includes a Human-in-the-Loop (HITL) mechanism to review high-impact or real-world synchronized actions before execution. These optimizations—originally developed and validated on nanometer-resolution SEM imagery exhibiting nanoscale noise, low SNR, and extreme visual detail—enable robust, high-throughput inference in compute-constrained scenarios such as EUV lithography and biomedical diagnostics. This architecture has been integrated into Calibre SEMSuite™, demonstrating readiness for real-world deployment in precision-critical industrial environments. The architecture supports real-time telemetry, bias and drift detection, and a data flywheel that captures feedback and performance metrics to continuously refine agent behavior, prompt strategies, and model accuracy. Designed for hybrid on-prem/cloud deployment and compliant with RBAC/ABAC enterprise security policies, this system ensures scalability, transparency, and governance continuity across industrial, enterprise, and domain-specific ecosystems—from design and manufacturing to financial analytics, healthcare diagnostics, HR operations, and creative content pipelines. Collectively, these capabilities position the architecture as a generalized substrate for enterprise-scale intelligence orchestration. It not only automates workflows but also augments human decision-making, improves analytical accuracy, and accelerates creativity across sectors—bridging cognitive reasoning, multimodal perception, and secure execution. \nBy unifying LLM reasoning and LVM orchestration, GPU-accelerated inference, grounded retrieval, digital twin synchronization, tool integration, and enterprise governance within a modular agentic framework, this system transforms traditional industrial software into an intelligent, auditable, and self-improving co-pilot—accelerating design cycles, enhancing reliability, and bridging the gap between virtual models and physical systems through autonomous, explainable decision orchestration. These optimizations make the architecture suitable for deployment in latency-sensitive, compute-constrained industrial scenarios, including edge-assisted digital twin environments and high-throughput simulation workflows, as well as knowledge-driven enterprise systems that demand adaptive, explainable, and human-aligned intelligence.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Optimizing Medical Question-Answering Systems: A Comparative Study of Fine-Tuned and Zero-Shot Large Language Models with RAG Framework",
    "authors": [
      "Tasnimul Hassan",
      "Md. Karim",
      "Haziq Jeelani",
      "Elham Behnam",
      "R. Green",
      "F. Syed"
    ],
    "year": "2025",
    "journal": "ArXiv",
    "doi": "10.48550/arXiv.2512.05863",
    "pmid": "",
    "abstract": "Medical question-answering (QA) systems can benefit from advances in large language models (LLMs), but directly applying LLMs to the clinical domain poses challenges such as maintaining factual accuracy and avoiding hallucinations. In this paper, we present a retrieval-augmented generation (RAG) based medical QA system that combines domain-specific knowledge retrieval with open-source LLMs to answer medical questions. We fine-tune two state-of-the-art open LLMs (LLaMA~2 and Falcon) using Low-Rank Adaptation (LoRA) for efficient domain specialization. The system retrieves relevant medical literature to ground the LLM's answers, thereby improving factual correctness and reducing hallucinations. We evaluate the approach on benchmark datasets (PubMedQA and MedMCQA) and show that retrieval augmentation yields measurable improvements in answer accuracy compared to using LLMs alone. Our fine-tuned LLaMA~2 model achieves 71.8% accuracy on PubMedQA, substantially improving over the 55.4% zero-shot baseline, while maintaining transparency by providing source references. We also detail the system design and fine-tuning methodology, demonstrating that grounding answers in retrieved evidence reduces unsupported content by approximately 60%. These results highlight the potential of RAG-augmented open-source LLMs for reliable biomedical QA, pointing toward practical clinical informatics applications.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Generate but Verify: Answering with Faithfulness in RAG-based Question Answering",
    "authors": [
      "Simone Filice",
      "Elad Haramaty",
      "Guy Horowitz",
      "Zohar S. Karnin",
      "L. Lewin-Eytan",
      "Alex Shtoff"
    ],
    "year": "2025",
    "journal": "",
    "doi": "10.18653/v1/2025.ijcnlp-long.56",
    "pmid": "",
    "abstract": "Retrieval-Augmented Generation (RAG) enhances LLMs by grounding answers in retrieved passages, which is key in factual Question Answering. However, generated answers may still be unfaithful to the passages, either due to retrieval or generation errors. Many RAG downstream applications rely on assessing answer faithfulness for applying fallback strategies, yet address it implicitly, without a consistent evaluation methodology. We introduce the task of Answering with Faithfulness (AwF), which brings faithfulness prediction to the forefront, explicitly coupling it with answer generation. We define variants of the precision and recall metrics tailored to this task, facilitating direct evaluation and comparison of different AwF methods. We then demonstrate, both theoretically and empirically, that for RAG applications using AwF as a sub-procedure, an improvement to the AwF metrics translates to an improvement to the downstream performance. This results in improved performance for recently published results.",
    "source_database": "semantic_scholar"
  }
]