[
  {
    "title": "The Development and Evaluation of a Retrieval-Augmented Generation Large Language Model Virtual Assistant for Postoperative Instructions.",
    "authors": [
      "Haider SA",
      "Prabha S",
      "Gomez Cabello CA",
      "Genovese A",
      "Collaco B",
      "Wood N",
      "London J",
      "Bagaria S",
      "Tao C",
      "Forte AJ"
    ],
    "year": "2025",
    "journal": "Bioengineering (Basel, Switzerland)",
    "doi": "10.3390/bioengineering12111219",
    "pmid": "41301175",
    "abstract": "During postoperative recovery, patients and their caregivers often lack crucial information, leading to numerous repetitive inquiries that burden healthcare providers. Traditional discharge materials, including paper handouts and patient portals, are often static, overwhelming, or underutilized, leading to patient overwhelm and contributing to unnecessary ER visits and overall healthcare overutilization. Conversational chatbots offer a solution, but Natural Language Processing (NLP) systems are often inflexible and limited in understanding, while powerful Large Language Models (LLMs) are prone to generating \"hallucinations\".",
    "source_database": "pubmed"
  },
  {
    "title": "AR-RAG: Autoregressive Retrieval Augmentation for Image Generation",
    "authors": [
      "Jingyuan Qi",
      "Zhiyang Xu",
      "Qifan Wang",
      "Lifu Huang"
    ],
    "year": "2025",
    "journal": "arXiv:2506.06962v3",
    "doi": "",
    "abstract": "We introduce Autoregressive Retrieval Augmentation (AR-RAG), a novel paradigm that enhances image generation by autoregressively incorporating knearest neighbor retrievals at the patch level. Unlike prior methods that perform a single, static retrieval before generation and condition the entire generation on fixed reference images, AR-RAG performs context-aware retrievals at each generation step, using prior-generated patches as queries to retrieve and incorporate the most relevant patch-level visual references, enabling the model to respond to evolving generation needs while avoiding limitations (e.g., over-copying, stylistic bias, etc.) prevalent in existing methods. To realize AR-RAG, we propose two parallel frameworks: (1) Distribution-Augmentation in Decoding (DAiD), a training-free plug-and-use decoding strategy that directly merges the distribution of model-predicted patches with the distribution of retrieved patches, and (2) Feature-Augmentation in Decoding (FAiD), a parameter-efficient fine-tuning method that progressively smooths the features of retrieved patches via multi-scale convolution operations and leverages them to augment the image generation process. We validate the effectiveness of AR-RAG on widely adopted benchmarks, including Midjourney-30K, GenEval and DPG-Bench, demonstrating significant performance gains over state-of-the-art image generation models.",
    "source_database": "arxiv",
    "arxiv_id": "2506.06962v3"
  },
  {
    "title": "FAIR-RAG: Faithful Adaptive Iterative Refinement for Retrieval-Augmented Generation",
    "authors": [
      "Mohammad Aghajani Asl",
      "Majid Asgari-Bidhendi",
      "Behrooz Minaei-Bidgoli"
    ],
    "year": "2025",
    "journal": "arXiv:2510.22344v1",
    "doi": "",
    "abstract": "While Retrieval-Augmented Generation (RAG) mitigates hallucination and knowledge staleness in Large Language Models (LLMs), existing frameworks often falter on complex, multi-hop queries that require synthesizing information from disparate sources. Current advanced RAG methods, employing iterative or adaptive strategies, lack a robust mechanism to systematically identify and fill evidence gaps, often propagating noise or failing to gather a comprehensive context. We introduce FAIR-RAG, a novel agentic framework that transforms the standard RAG pipeline into a dynamic, evidence-driven reasoning process. At its core is an Iterative Refinement Cycle governed by a module we term Structured Evidence Assessment (SEA). The SEA acts as an analytical gating mechanism: it deconstructs the initial query into a checklist of required findings and audits the aggregated evidence to identify confirmed facts and, critically, explicit informational gaps. These gaps provide a precise signal to an Adaptive Query Refinement agent, which generates new, targeted sub-queries to retrieve missing information. This cycle repeats until the evidence is verified as sufficient, ensuring a comprehensive context for a final, strictly faithful generation. We conducted experiments on challenging multi-hop QA benchmarks, including HotpotQA, 2WikiMultiHopQA, and MusiQue. In a unified experimental setup, FAIR-RAG significantly outperforms strong baselines. On HotpotQA, it achieves an F1-score of 0.453 -- an absolute improvement of 8.3 points over the strongest iterative baseline -- establishing a new state-of-the-art for this class of methods on these benchmarks. Our work demonstrates that a structured, evidence-driven refinement process with explicit gap analysis is crucial for unlocking reliable and accurate reasoning in advanced RAG systems for complex, knowledge-intensive tasks.",
    "source_database": "arxiv",
    "arxiv_id": "2510.22344v1"
  },
  {
    "title": "Automated Literature Review Using NLP Techniques and LLM-Based Retrieval-Augmented Generation",
    "authors": [
      "Nurshat Fateh Ali",
      "Md. Mahdi Mohtasim",
      "Shakil Mosharrof",
      "T. Gopi Krishna"
    ],
    "year": "2024",
    "journal": "arXiv:2411.18583v1",
    "doi": "",
    "abstract": "This research presents and compares multiple approaches to automate the generation of literature reviews using several Natural Language Processing (NLP) techniques and retrieval-augmented generation (RAG) with a Large Language Model (LLM). The ever-increasing number of research articles provides a huge challenge for manual literature review. It has resulted in an increased demand for automation. Developing a system capable of automatically generating the literature reviews from only the PDF files as input is the primary objective of this research work. The effectiveness of several Natural Language Processing (NLP) strategies, such as the frequency-based method (spaCy), the transformer model (Simple T5), and retrieval-augmented generation (RAG) with Large Language Model (GPT-3.5-turbo), is evaluated to meet the primary objective. The SciTLDR dataset is chosen for this research experiment and three distinct techniques are utilized to implement three different systems for auto-generating the literature reviews. The ROUGE scores are used for the evaluation of all three systems. Based on the evaluation, the Large Language Model GPT-3.5-turbo achieved the highest ROUGE-1 score, 0.364. The transformer model comes in second place and spaCy is at the last position. Finally, a graphical user interface is created for the best system based on the large language model.",
    "source_database": "arxiv",
    "arxiv_id": "2411.18583v1"
  },
  {
    "title": "Intelligent Interaction Strategies for Context-Aware Cognitive Augmentation",
    "authors": [
      "Xiangrong Zhu",
      "Yuan Xu",
      "Tianjian Liu",
      "Jingwei Sun",
      "Yu Zhang",
      "Xin Tong"
    ],
    "year": "2025",
    "journal": "arXiv:2504.13684v1",
    "doi": "",
    "abstract": "Human cognition is constrained by processing limitations, leading to cognitive overload and inefficiencies in knowledge synthesis and decision-making. Large Language Models (LLMs) present an opportunity for cognitive augmentation, but their current reactive nature limits their real-world applicability. This position paper explores the potential of context-aware cognitive augmentation, where LLMs dynamically adapt to users' cognitive states and task environments to provide appropriate support. Through a think-aloud study in an exhibition setting, we examine how individuals interact with multi-modal information and identify key cognitive challenges in structuring, retrieving, and applying knowledge. Our findings highlight the need for AI-driven cognitive support systems that integrate real-time contextual awareness, personalized reasoning assistance, and socially adaptive interactions. We propose a framework for AI augmentation that seamlessly transitions between real-time cognitive support and post-experience knowledge organization, contributing to the design of more effective human-centered AI systems.",
    "source_database": "arxiv",
    "arxiv_id": "2504.13684v1"
  },
  {
    "title": "Factually: Exploring Wearable Fact-Checking for Augmented Truth Discernment",
    "authors": [
      "Chitralekha Gupta",
      "Hanjun Wu",
      "Praveen Sasikumar",
      "Shreyas Sridhar",
      "Priambudi Bagaskara",
      "Suranga Nanayakkara"
    ],
    "year": "2025",
    "journal": "arXiv:2504.17204v1",
    "doi": "",
    "abstract": "Wearable devices are transforming human capabilities by seamlessly augmenting cognitive functions. In this position paper, we propose a voice-based, interactive learning companion designed to amplify and extend cognitive abilities through informal learning. Our vision is threefold: (1) to enable users to discover new knowledge on-the-go through contextual interactive quizzes, fostering critical thinking and mindfulness, (2) to proactively detect misinformation, empowering users to critically assess information in real time, and (3) to provide spoken language correction and prompting hints for second language learning and effective communication. As an initial step toward this vision, we present Factually - a proactive, wearable fact-checking system integrated into devices like smartwatches or rings. Factually discreetly alerts users to potential falsehoods via vibrotactile feedback, helping them assess information critically. We demonstrate its utility through three illustrative scenarios, highlighting its potential to extend cognitive abilities for real-time misinformation detection. Early qualitative feedback suggests that Factually can enhance users' fact-checking capabilities, offering both practical and experiential benefits.",
    "source_database": "arxiv",
    "arxiv_id": "2504.17204v1"
  },
  {
    "title": "Designing AI Systems that Augment Human Performed vs. Demonstrated Critical Thinking",
    "authors": [
      "Katelyn Xiaoying Mei",
      "Nic Weber"
    ],
    "year": "2025",
    "journal": "arXiv:2504.14689v1",
    "doi": "",
    "abstract": "The recent rapid advancement of LLM-based AI systems has accelerated our search and production of information. While the advantages brought by these systems seemingly improve the performance or efficiency of human activities, they do not necessarily enhance human capabilities. Recent research has started to examine the impact of generative AI on individuals' cognitive abilities, especially critical thinking. Based on definitions of critical thinking across psychology and education, this position paper proposes the distinction between demonstrated and performed critical thinking in the era of generative AI and discusses the implication of this distinction in research and development of AI systems that aim to augment human critical thinking.",
    "source_database": "arxiv",
    "arxiv_id": "2504.14689v1"
  },
  {
    "title": "EVOR: Evolving Retrieval for Code Generation",
    "authors": [
      "Hongjin Su",
      "Shuyang Jiang",
      "Yuhang Lai",
      "Haoyuan Wu",
      "Boao Shi",
      "Che Liu",
      "Qian Liu",
      "Tao Yu"
    ],
    "year": "2024",
    "journal": "arXiv:2402.12317v2",
    "doi": "",
    "abstract": "Recently the retrieval-augmented generation (RAG) has been successfully applied in code generation. However, existing pipelines for retrieval-augmented code generation (RACG) employ static knowledge bases with a single source, limiting the adaptation capabilities of Large Language Models (LLMs) to domains they have insufficient knowledge of. In this work, we develop a novel pipeline, EVOR, that employs the synchronous evolution of both queries and diverse knowledge bases. On two realistic settings where the external knowledge is required to solve code generation tasks, we compile four new datasets associated with frequently updated libraries and long-tail programming languages, named EVOR-BENCH. Extensive experiments demonstrate that EVOR achieves two to four times of execution accuracy compared to other methods such as Reflexion (Shinn et al., 2024), DocPrompting (Zhou et al., 2023), etc. We demonstrate that EVOR is flexible and can be easily combined with them to achieve further improvement. Further analysis reveals that EVOR benefits from the synchronous evolution of queries and documents and the diverse information sources in the knowledge base. We hope that our studies will inspire more insights into the design of advanced RACG pipelines in future research. Our model, code, and data are available at https://arks-codegen.github.io.",
    "source_database": "arxiv",
    "arxiv_id": "2402.12317v2"
  },
  {
    "title": "Riddle Me This! Stealthy Membership Inference for Retrieval-Augmented Generation",
    "authors": [
      "Ali Naseh",
      "Yuefeng Peng",
      "Anshuman Suri",
      "Harsh Chaudhari",
      "Alina Oprea",
      "Amir Houmansadr"
    ],
    "year": "2025",
    "journal": "arXiv:2502.00306v2",
    "doi": "",
    "abstract": "Retrieval-Augmented Generation (RAG) enables Large Language Models (LLMs) to generate grounded responses by leveraging external knowledge databases without altering model parameters. Although the absence of weight tuning prevents leakage via model parameters, it introduces the risk of inference adversaries exploiting retrieved documents in the model's context. Existing methods for membership inference and data extraction often rely on jailbreaking or carefully crafted unnatural queries, which can be easily detected or thwarted with query rewriting techniques common in RAG systems. In this work, we present Interrogation Attack (IA), a membership inference technique targeting documents in the RAG datastore. By crafting natural-text queries that are answerable only with the target document's presence, our approach demonstrates successful inference with just 30 queries while remaining stealthy; straightforward detectors identify adversarial prompts from existing methods up to ~76x more frequently than those generated by our attack. We observe a 2x improvement in TPR@1%FPR over prior inference attacks across diverse RAG configurations, all while costing less than $0.02 per document inference.",
    "source_database": "arxiv",
    "arxiv_id": "2502.00306v2"
  },
  {
    "title": "Ragas: Automated Evaluation of Retrieval Augmented Generation",
    "authors": [
      "Shahul Es",
      "Jithin James",
      "Luis Espinosa-Anke",
      "Steven Schockaert"
    ],
    "year": "2023",
    "journal": "arXiv:2309.15217v2",
    "doi": "",
    "abstract": "We introduce Ragas (Retrieval Augmented Generation Assessment), a framework for reference-free evaluation of Retrieval Augmented Generation (RAG) pipelines. RAG systems are composed of a retrieval and an LLM based generation module, and provide LLMs with knowledge from a reference textual database, which enables them to act as a natural language layer between a user and textual databases, reducing the risk of hallucinations. Evaluating RAG architectures is, however, challenging because there are several dimensions to consider: the ability of the retrieval system to identify relevant and focused context passages, the ability of the LLM to exploit such passages in a faithful way, or the quality of the generation itself. With Ragas, we put forward a suite of metrics which can be used to evaluate these different dimensions \\textit{without having to rely on ground truth human annotations}. We posit that such a framework can crucially contribute to faster evaluation cycles of RAG architectures, which is especially important given the fast adoption of LLMs.",
    "source_database": "arxiv",
    "arxiv_id": "2309.15217v2"
  },
  {
    "title": "Aviation Safety Enhancement via NLP & Deep Learning: Classifying Flight Phases in ATSB Safety Reports",
    "authors": [
      "Aziida Nanyonga",
      "Hassan Wasswa",
      "Graham Wild"
    ],
    "year": "2025",
    "journal": "arXiv:2501.07923v1",
    "doi": "",
    "abstract": "Aviation safety is paramount, demanding precise analysis of safety occurrences during different flight phases. This study employs Natural Language Processing (NLP) and Deep Learning models, including LSTM, CNN, Bidirectional LSTM (BLSTM), and simple Recurrent Neural Networks (sRNN), to classify flight phases in safety reports from the Australian Transport Safety Bureau (ATSB). The models exhibited high accuracy, precision, recall, and F1 scores, with LSTM achieving the highest performance of 87%, 88%, 87%, and 88%, respectively. This performance highlights their effectiveness in automating safety occurrence analysis. The integration of NLP and Deep Learning technologies promises transformative enhancements in aviation safety analysis, enabling targeted safety measures and streamlined report handling.",
    "source_database": "arxiv",
    "arxiv_id": "2501.07923v1"
  },
  {
    "title": "RAC: Retrieval-Augmented Clarification for Faithful Conversational Search",
    "authors": [
      "Ahmed Rayane Kebir",
      "Vincent Guigue",
      "Lynda Said Lhadj",
      "Laure Soulier"
    ],
    "year": "2026",
    "journal": "arXiv:2601.11722v1",
    "doi": "",
    "abstract": "Clarification questions help conversational search systems resolve ambiguous or underspecified user queries. While prior work has focused on fluency and alignment with user intent, especially through facet extraction, much less attention has been paid to grounding clarifications in the underlying corpus. Without such grounding, systems risk asking questions that cannot be answered from the available documents. We introduce RAC (Retrieval-Augmented Clarification), a framework for generating corpus-faithful clarification questions. After comparing several indexing strategies for retrieval, we fine-tune a large language model to make optimal use of research context and to encourage the generation of evidence-based question. We then apply contrastive preference optimization to favor questions supported by retrieved passages over ungrounded alternatives. Evaluated on four benchmarks, RAC demonstrate significant improvements over baselines. In addition to LLM-as-Judge assessments, we introduce novel metrics derived from NLI and data-to-text to assess how well questions are anchored in the context, and we demonstrate that our approach consistently enhances faithfulness.",
    "source_database": "arxiv",
    "arxiv_id": "2601.11722v1"
  },
  {
    "title": "Engineering the RAG Stack: A Comprehensive Review of the Architecture and Trust Frameworks for Retrieval-Augmented Generation Systems",
    "authors": [
      "Dean Wampler",
      "Dave Nielson",
      "Alireza Seddighi"
    ],
    "year": "2025",
    "journal": "arXiv:2601.05264v1",
    "doi": "",
    "abstract": "This article provides a comprehensive systematic literature review of academic studies, industrial applications, and real-world deployments from 2018 to 2025, providing a practical guide and detailed overview of modern Retrieval-Augmented Generation (RAG) architectures. RAG offers a modular approach for integrating external knowledge without increasing the capacity of the model as LLM systems expand. Research and engineering practices have been fragmented as a result of the increasing diversity of RAG methodologies, which encompasses a variety of fusion mechanisms, retrieval strategies, and orchestration approaches. We provide quantitative assessment frameworks, analyze the implications for trust and alignment, and systematically consolidate existing RAG techniques into a unified taxonomy. This document is a practical framework for the deployment of resilient, secure, and domain-adaptable RAG systems, synthesizing insights from academic literature, industry reports, and technical implementation guides. It also functions as a technical reference.",
    "source_database": "arxiv",
    "arxiv_id": "2601.05264v1"
  },
  {
    "title": "MUST-RAG: MUSical Text Question Answering with Retrieval Augmented Generation",
    "authors": [
      "Daeyong Kwon",
      "SeungHeon Doh",
      "Juhan Nam"
    ],
    "year": "2025",
    "journal": "arXiv:2507.23334v2",
    "doi": "",
    "abstract": "Recent advancements in Large language models (LLMs) have demonstrated remarkable capabilities across diverse domains. While they exhibit strong zero-shot performance on various tasks, LLMs' effectiveness in music-related applications remains limited due to the relatively small proportion of music-specific knowledge in their training data. To address this limitation, we propose MusT-RAG, a comprehensive framework based on Retrieval Augmented Generation (RAG) to adapt general-purpose LLMs for text-only music question answering (MQA) tasks. RAG is a technique that provides external knowledge to LLMs by retrieving relevant context information when generating answers to questions. To optimize RAG for the music domain, we (1) propose MusWikiDB, a music-specialized vector database for the retrieval stage, and (2) utilizes context information during both inference and fine-tuning processes to effectively transform general-purpose LLMs into music-specific models. Our experiment demonstrates that MusT-RAG significantly outperforms traditional fine-tuning approaches in enhancing LLMs' music domain adaptation capabilities, showing consistent improvements across both in-domain and out-of-domain MQA benchmarks. Additionally, our MusWikiDB proves substantially more effective than general Wikipedia corpora, delivering superior performance and computational efficiency.",
    "source_database": "arxiv",
    "arxiv_id": "2507.23334v2"
  },
  {
    "title": "Lightweight Transformers for Clinical Natural Language Processing",
    "authors": [
      "Omid Rohanian",
      "Mohammadmahdi Nouriborji",
      "Hannah Jauncey",
      "Samaneh Kouchaki",
      "ISARIC Clinical Characterisation Group",
      "Lei Clifton",
      "Laura Merson",
      "David A. Clifton"
    ],
    "year": "2023",
    "journal": "arXiv:2302.04725v1",
    "doi": "10.1017/S1351324923000542",
    "abstract": "Specialised pre-trained language models are becoming more frequent in NLP since they can potentially outperform models trained on generic texts. BioBERT and BioClinicalBERT are two examples of such models that have shown promise in medical NLP tasks. Many of these models are overparametrised and resource-intensive, but thanks to techniques like Knowledge Distillation (KD), it is possible to create smaller versions that perform almost as well as their larger counterparts. In this work, we specifically focus on development of compact language models for processing clinical texts (i.e. progress notes, discharge summaries etc). We developed a number of efficient lightweight clinical transformers using knowledge distillation and continual learning, with the number of parameters ranging from 15 million to 65 million. These models performed comparably to larger models such as BioBERT and ClinicalBioBERT and significantly outperformed other compact models trained on general or biomedical data. Our extensive evaluation was done across several standard datasets and covered a wide range of clinical text-mining tasks, including Natural Language Inference, Relation Extraction, Named Entity Recognition, and Sequence Classification. To our knowledge, this is the first comprehensive study specifically focused on creating efficient and compact transformers for clinical NLP tasks. The models and code used in this study can be found on our Huggingface profile at https://huggingface.co/nlpie and Github page at https://github.com/nlpie-research/Lightweight-Clinical-Transformers, respectively, promoting reproducibility of our results.",
    "source_database": "arxiv",
    "arxiv_id": "2302.04725v1"
  },
  {
    "title": "Empathy Is Not What Changed: Clinical Assessment of Psychological Safety Across GPT Model Generations",
    "authors": [
      "Michael Keeman",
      "Anastasia Keeman"
    ],
    "year": "2026",
    "journal": "arXiv:2603.09997v1",
    "doi": "",
    "abstract": "When OpenAI deprecated GPT-4o in early 2026, thousands of users protested under #keep4o, claiming newer models had \"lost their empathy.\" No published study has tested this claim. We conducted the first clinical measurement, evaluating three OpenAI model generations (GPT-4o, o4-mini, GPT-5-mini) across 14 emotionally challenging conversational scenarios in mental health and AI companion domains, producing 2,100 scored AI responses assessed on six psychological safety dimensions using clinically-grounded rubrics.   Empathy scores are statistically indistinguishable across all three models (Kruskal-Wallis H=4.33, p=0.115). What changed is the safety posture: crisis detection improved monotonically from GPT-4o to GPT-5-mini (H=13.88, p=0.001), while advice safety declined (H=16.63, p<0.001). Per-turn trajectory analysis -- a novel methodological contribution -- reveals these shifts are sharpest during mid-conversation crisis moments invisible to aggregate scoring. In a self-harm scenario involving a minor, GPT-4o scored 3.6/10 on crisis detection during early disclosure turns; GPT-5-mini never dropped below 7.8.   What users perceived as \"lost empathy\" was a shift from a cautious model that missed crises to an alert model that sometimes says too much -- a trade-off with real consequences for vulnerable users, currently invisible to both the people who feel it and the developers who create it.",
    "source_database": "arxiv",
    "arxiv_id": "2603.09997v1"
  },
  {
    "title": "Utilizing Metadata for Better Retrieval-Augmented Generation",
    "authors": [
      "Raquib Bin Yousuf",
      "Shengzhe Xu",
      "Mandar Sharma",
      "Andrew Neeser",
      "Chris Latimer",
      "Naren Ramakrishnan"
    ],
    "year": "2026",
    "journal": "arXiv:2601.11863v1",
    "doi": "",
    "abstract": "Retrieval-Augmented Generation systems depend on retrieving semantically relevant document chunks to support accurate, grounded outputs from large language models. In structured and repetitive corpora such as regulatory filings, chunk similarity alone often fails to distinguish between documents with overlapping language. Practitioners often flatten metadata into input text as a heuristic, but the impact and trade-offs of this practice remain poorly understood. We present a systematic study of metadata-aware retrieval strategies, comparing plain-text baselines with approaches that embed metadata directly. Our evaluation spans metadata-as-text (prefix and suffix), a dual-encoder unified embedding that fuses metadata and content in a single index, dual-encoder late-fusion retrieval, and metadata-aware query reformulation. Across multiple retrieval metrics and question types, we find that prefixing and unified embeddings consistently outperform plain-text baselines, with the unified at times exceeding prefixing while being easier to maintain. Beyond empirical comparisons, we analyze embedding space, showing that metadata integration improves effectiveness by increasing intra-document cohesion, reducing inter-document confusion, and widening the separation between relevant and irrelevant chunks. Field-level ablations show that structural cues provide strong disambiguating signals. Our code, evaluation framework, and the RAGMATE-10K dataset are publicly hosted.",
    "source_database": "arxiv",
    "arxiv_id": "2601.11863v1"
  },
  {
    "title": "Investigating Retrieval-Augmented Generation in Quranic Studies: A Study of 13 Open-Source Large Language Models",
    "authors": [
      "Zahra Khalila",
      "Arbi Haza Nasution",
      "Winda Monika",
      "Aytug Onan",
      "Yohei Murakami",
      "Yasir Bin Ismail Radi",
      "Noor Mohammad Osmani"
    ],
    "year": "2025",
    "journal": "arXiv:2503.16581v1",
    "doi": "10.14569/IJACSA.2025.01602134",
    "abstract": "Accurate and contextually faithful responses are critical when applying large language models (LLMs) to sensitive and domain-specific tasks, such as answering queries related to quranic studies. General-purpose LLMs often struggle with hallucinations, where generated responses deviate from authoritative sources, raising concerns about their reliability in religious contexts. This challenge highlights the need for systems that can integrate domain-specific knowledge while maintaining response accuracy, relevance, and faithfulness. In this study, we investigate 13 open-source LLMs categorized into large (e.g., Llama3:70b, Gemma2:27b, QwQ:32b), medium (e.g., Gemma2:9b, Llama3:8b), and small (e.g., Llama3.2:3b, Phi3:3.8b). A Retrieval-Augmented Generation (RAG) is used to make up for the problems that come with using separate models. This research utilizes a descriptive dataset of Quranic surahs including the meanings, historical context, and qualities of the 114 surahs, allowing the model to gather relevant knowledge before responding. The models are evaluated using three key metrics set by human evaluators: context relevance, answer faithfulness, and answer relevance. The findings reveal that large models consistently outperform smaller models in capturing query semantics and producing accurate, contextually grounded responses. The Llama3.2:3b model, even though it is considered small, does very well on faithfulness (4.619) and relevance (4.857), showing the promise of smaller architectures that have been well optimized. This article examines the trade-offs between model size, computational efficiency, and response quality while using LLMs in domain-specific applications.",
    "source_database": "arxiv",
    "arxiv_id": "2503.16581v1"
  },
  {
    "title": "RAGPart & RAGMask: Retrieval-Stage Defenses Against Corpus Poisoning in Retrieval-Augmented Generation",
    "authors": [
      "Pankayaraj Pathmanathan",
      "Michael-Andrei Panaitescu-Liess",
      "Cho-Yu Jason Chiang",
      "Furong Huang"
    ],
    "year": "2025",
    "journal": "arXiv:2512.24268v1",
    "doi": "",
    "abstract": "Retrieval-Augmented Generation (RAG) has emerged as a promising paradigm to enhance large language models (LLMs) with external knowledge, reducing hallucinations and compensating for outdated information. However, recent studies have exposed a critical vulnerability in RAG pipelines corpus poisoning where adversaries inject malicious documents into the retrieval corpus to manipulate model outputs. In this work, we propose two complementary retrieval-stage defenses: RAGPart and RAGMask. Our defenses operate directly on the retriever, making them computationally lightweight and requiring no modification to the generation model. RAGPart leverages the inherent training dynamics of dense retrievers, exploiting document partitioning to mitigate the effect of poisoned points. In contrast, RAGMask identifies suspicious tokens based on significant similarity shifts under targeted token masking. Across two benchmarks, four poisoning strategies, and four state-of-the-art retrievers, our defenses consistently reduce attack success rates while preserving utility under benign conditions. We further introduce an interpretable attack to stress-test our defenses. Our findings highlight the potential and limitations of retrieval-stage defenses, providing practical insights for robust RAG deployments.",
    "source_database": "arxiv",
    "arxiv_id": "2512.24268v1"
  },
  {
    "title": "IGMiRAG: Intuition-Guided Retrieval-Augmented Generation with Adaptive Mining of In-Depth Memory",
    "authors": [
      "Xingliang Hou",
      "Yuyan Liu",
      "Qi Sun",
      "Haoxiu Wang",
      "Hao Hu",
      "Shaoyi Du",
      "Zhiqiang Tian"
    ],
    "year": "2026",
    "journal": "arXiv:2602.07525v1",
    "doi": "",
    "abstract": "Retrieval-augmented generation (RAG) equips large language models (LLMs) with reliable knowledge memory. To strengthen cross-text associations, recent research integrates graphs and hypergraphs into RAG to capture pairwise and multi-entity relations as structured links. However, their misaligned memory organization necessitates costly, disjointed retrieval. To address these limitations, we propose IGMiRAG, a framework inspired by human intuition-guided reasoning. It constructs a hierarchical heterogeneous hypergraph to align multi-granular knowledge, incorporating deductive pathways to simulate realistic memory structures. During querying, IGMiRAG distills intuitive strategies via a question parser to control mining depth and memory window, and activates instantaneous memories as anchors using dual-focus retrieval. Mirroring human intuition, the framework guides retrieval resource allocation dynamically. Furthermore, we design a bidirectional diffusion algorithm that navigates deductive paths to mine in-depth memories, emulating human reasoning processes. Extensive evaluations indicate IGMiRAG outperforms the state-of-the-art baseline by 4.8% EM and 5.0% F1 overall, with token costs adapting to task complexity (average 6.3k+, minimum 3.0k+). This work presents a cost-effective RAG paradigm that improves both efficiency and effectiveness.",
    "source_database": "arxiv",
    "arxiv_id": "2602.07525v1"
  },
  {
    "title": "Reconstructing Context: Evaluating Advanced Chunking Strategies for Retrieval-Augmented Generation",
    "authors": [
      "Carlo Merola",
      "Jaspinder Singh"
    ],
    "year": "2025",
    "journal": "arXiv:2504.19754v1",
    "doi": "",
    "abstract": "Retrieval-augmented generation (RAG) has become a transformative approach for enhancing large language models (LLMs) by grounding their outputs in external knowledge sources. Yet, a critical question persists: how can vast volumes of external knowledge be managed effectively within the input constraints of LLMs? Traditional methods address this by chunking external documents into smaller, fixed-size segments. While this approach alleviates input limitations, it often fragments context, resulting in incomplete retrieval and diminished coherence in generation. To overcome these shortcomings, two advanced techniques, late chunking and contextual retrieval, have been introduced, both aiming to preserve global context. Despite their potential, their comparative strengths and limitations remain unclear. This study presents a rigorous analysis of late chunking and contextual retrieval, evaluating their effectiveness and efficiency in optimizing RAG systems. Our results indicate that contextual retrieval preserves semantic coherence more effectively but requires greater computational resources. In contrast, late chunking offers higher efficiency but tends to sacrifice relevance and completeness.",
    "source_database": "arxiv",
    "arxiv_id": "2504.19754v1"
  },
  {
    "title": "CARROT: A Learned Cost-Constrained Retrieval Optimization System for RAG",
    "authors": [
      "Ziting Wang",
      "Haitao Yuan",
      "Wei Dong",
      "Gao Cong",
      "Feifei Li"
    ],
    "year": "2024",
    "journal": "arXiv:2411.00744v2",
    "doi": "",
    "abstract": "Large Language Models (LLMs) have demonstrated impressive ability in generation and reasoning tasks but struggle with handling up-to-date knowledge, leading to inaccuracies or hallucinations. Retrieval-Augmented Generation (RAG) mitigates this by retrieving and incorporating external knowledge into input prompts. In particular, due to LLMs' context window limitations and long-context hallucinations, only the most relevant \"chunks\" are retrieved. However, current RAG systems face three key challenges: (1) chunks are often retrieved independently without considering their relationships, such as redundancy and ordering; (2) the utility of chunks is non-monotonic, as adding more chunks can degrade quality; and (3) retrieval strategies fail to adapt to the unique characteristics of different queries. To overcome these challenges, we design a cost-constrained retrieval optimization framework for RAG. We adopt a Monte Carlo Tree Search (MCTS) based strategy to find the optimal chunk combination order, which considers the chunks' correlations. In addition, to address the non-monotonicity of chunk utility, instead of treating budget exhaustion as the termination condition, we design a utility computation strategy to identify the optimal chunk combination without necessarily exhausting the budget. Furthermore, we propose a configuration agent that predicts optimal configurations for each query domain, improving our framework's adaptability and efficiency. Experimental results demonstrate up to a 30% improvement over baseline models, highlighting the framework's effectiveness, scalability, and suitability. Our source code has been released at https://github.com/wang0702/CARROT.",
    "source_database": "arxiv",
    "arxiv_id": "2411.00744v2"
  },
  {
    "title": "Faithfulness and the Notion of Adversarial Sensitivity in NLP Explanations",
    "authors": [
      "Supriya Manna",
      "Niladri Sett"
    ],
    "year": "2024",
    "journal": "arXiv:2409.17774v2",
    "doi": "https://doi.org/10.18653/v1/2024.blackboxnlp-1.12",
    "abstract": "Faithfulness is arguably the most critical metric to assess the reliability of explainable AI. In NLP, current methods for faithfulness evaluation are fraught with discrepancies and biases, often failing to capture the true reasoning of models. We introduce Adversarial Sensitivity as a novel approach to faithfulness evaluation, focusing on the explainer's response when the model is under adversarial attack. Our method accounts for the faithfulness of explainers by capturing sensitivity to adversarial input changes. This work addresses significant limitations in existing evaluation techniques, and furthermore, quantifies faithfulness from a crucial yet underexplored paradigm.",
    "source_database": "arxiv",
    "arxiv_id": "2409.17774v2"
  },
  {
    "title": "Using Bottleneck Adapters to Identify Cancer in Clinical Notes under Low-Resource Constraints",
    "authors": [
      "Omid Rohanian",
      "Hannah Jauncey",
      "Mohammadmahdi Nouriborji",
      "Vinod Kumar Chauhan",
      "Bronner P. Gonçalves",
      "Christiana Kartsonaki",
      "ISARIC Clinical Characterisation Group",
      "Laura Merson",
      "David Clifton"
    ],
    "year": "2022",
    "journal": "arXiv:2210.09440v2",
    "doi": "",
    "abstract": "Processing information locked within clinical health records is a challenging task that remains an active area of research in biomedical NLP. In this work, we evaluate a broad set of machine learning techniques ranging from simple RNNs to specialised transformers such as BioBERT on a dataset containing clinical notes along with a set of annotations indicating whether a sample is cancer-related or not.   Furthermore, we specifically employ efficient fine-tuning methods from NLP, namely, bottleneck adapters and prompt tuning, to adapt the models to our specialised task. Our evaluations suggest that fine-tuning a frozen BERT model pre-trained on natural language and with bottleneck adapters outperforms all other strategies, including full fine-tuning of the specialised BioBERT model. Based on our findings, we suggest that using bottleneck adapters in low-resource situations with limited access to labelled data or processing capacity could be a viable strategy in biomedical text mining. The code used in the experiments are going to be made available at https://github.com/omidrohanian/bottleneck-adapters.",
    "source_database": "arxiv",
    "arxiv_id": "2210.09440v2"
  },
  {
    "title": "Refine Medical Diagnosis Using Generation Augmented Retrieval and Clinical Practice Guidelines",
    "authors": [
      "Wenhao Li",
      "Hongkuan Zhang",
      "Hongwei Zhang",
      "Zhengxu Li",
      "Zengjie Dong",
      "Yafan Chen",
      "Niranjan Bidargaddi",
      "Hong Liu"
    ],
    "year": "2025",
    "journal": "arXiv:2506.21615v1",
    "doi": "",
    "abstract": "Current medical language models, adapted from large language models (LLMs), typically predict ICD code-based diagnosis from electronic health records (EHRs) because these labels are readily available. However, ICD codes do not capture the nuanced, context-rich reasoning clinicians use for diagnosis. Clinicians synthesize diverse patient data and reference clinical practice guidelines (CPGs) to make evidence-based decisions. This misalignment limits the clinical utility of existing models. We introduce GARMLE-G, a Generation-Augmented Retrieval framework that grounds medical language model outputs in authoritative CPGs. Unlike conventional Retrieval-Augmented Generation based approaches, GARMLE-G enables hallucination-free outputs by directly retrieving authoritative guideline content without relying on model-generated text. It (1) integrates LLM predictions with EHR data to create semantically rich queries, (2) retrieves relevant CPG knowledge snippets via embedding similarity, and (3) fuses guideline content with model output to generate clinically aligned recommendations. A prototype system for hypertension diagnosis was developed and evaluated on multiple metrics, demonstrating superior retrieval precision, semantic relevance, and clinical guideline adherence compared to RAG-based baselines, while maintaining a lightweight architecture suitable for localized healthcare deployment. This work provides a scalable, low-cost, and hallucination-free method for grounding medical language models in evidence-based clinical practice, with strong potential for broader clinical deployment.",
    "source_database": "arxiv",
    "arxiv_id": "2506.21615v1"
  },
  {
    "title": "Inference-Time Safety For Code LLMs Via Retrieval-Augmented Revision",
    "authors": [
      "Manisha Mukherjee",
      "Vincent J. Hellendoorn"
    ],
    "year": "2026",
    "journal": "arXiv:2603.01494v1",
    "doi": "",
    "abstract": "Large Language Models (LLMs) are increasingly deployed for code generation in high-stakes software development, yet their limited transparency in security reasoning and brittleness to evolving vulnerability patterns raise critical trustworthiness concerns. Models trained on static datasets cannot readily adapt to newly discovered vulnerabilities or changing security standards without retraining, leading to the repeated generation of unsafe code.   We present a principled approach to trustworthy code generation by design that operates as an inference-time safety mechanism. Our approach employs retrieval-augmented generation to surface relevant security risks in generated code and retrieve related security discussions from a curated Stack Overflow knowledge base, which are then used to guide an LLM during code revision. This design emphasizes three aspects relevant to trustworthiness: (1) interpretability, through transparent safety interventions grounded in expert community explanations; (2) robustness, by allowing adaptation to evolving security practices without model retraining; and (3) safety alignment, through real-time intervention before unsafe code reaches deployment.   Across real-world and benchmark datasets, our approach improves the security of LLM-generated code compared to prompting alone, while introducing no new vulnerabilities as measured by static analysis. These results suggest that principled, retrieval-augmented inference-time interventions can serve as a complementary mechanism for improving the safety of LLM-based code generation, and highlight the ongoing value of community knowledge in supporting trustworthy AI deployment.",
    "source_database": "arxiv",
    "arxiv_id": "2603.01494v1"
  },
  {
    "title": "Blended RAG: Improving RAG (Retriever-Augmented Generation) Accuracy with Semantic Search and Hybrid Query-Based Retrievers",
    "authors": [
      "Kunal Sawarkar",
      "Abhilasha Mangal",
      "Shivam Raj Solanki"
    ],
    "year": "2024",
    "journal": "arXiv:2404.07220v2",
    "doi": "https://doi.org/10.1109/MIPR62202.2024.00031",
    "abstract": "Retrieval-Augmented Generation (RAG) is a prevalent approach to infuse a private knowledge base of documents with Large Language Models (LLM) to build Generative Q\\&A (Question-Answering) systems. However, RAG accuracy becomes increasingly challenging as the corpus of documents scales up, with Retrievers playing an outsized role in the overall RAG accuracy by extracting the most relevant document from the corpus to provide context to the LLM. In this paper, we propose the 'Blended RAG' method of leveraging semantic search techniques, such as Dense Vector indexes and Sparse Encoder indexes, blended with hybrid query strategies. Our study achieves better retrieval results and sets new benchmarks for IR (Information Retrieval) datasets like NQ and TREC-COVID datasets. We further extend such a 'Blended Retriever' to the RAG system to demonstrate far superior results on Generative Q\\&A datasets like SQUAD, even surpassing fine-tuning performance.",
    "source_database": "arxiv",
    "arxiv_id": "2404.07220v2"
  },
  {
    "title": "VeriCite: Towards Reliable Citations in Retrieval-Augmented Generation via Rigorous Verification",
    "authors": [
      "Haosheng Qian",
      "Yixing Fan",
      "Jiafeng Guo",
      "Ruqing Zhang",
      "Qi Chen",
      "Dawei Yin",
      "Xueqi Cheng"
    ],
    "year": "2025",
    "journal": "arXiv:2510.11394v1",
    "doi": "https://doi.org/10.1145/3767695.3769505",
    "abstract": "Retrieval-Augmented Generation (RAG) has emerged as a crucial approach for enhancing the responses of large language models (LLMs) with external knowledge sources. Despite the impressive performance in complex question-answering tasks, RAG still struggles with hallucinations. Attributing RAG-generated content through in-line citations has demonstrated potential in reducing hallucinations and facilitating human verification. Existing citation generation methods primarily rely on either fine-tuning the generator or employing post-processing approaches for citation matching. However, the former approach demands substantial annotated data and computational resources, while the latter often encounters difficulties in managing multiple citations and frequently produces suboptimal results. In this paper, we introduce a novel framework, called VeriCite, designed to rigorously validate supporting evidence and enhance answer attribution. Specifically, VeriCite breaks down into a three-stage generation: 1) The initial answer generation first generates a response based on all available contexts and has its claims verified through the NLI model; 2) the supporting evidence selection assesses the utility of each document and extracts useful supporting evidences; 3) the final answer refinement integrates the initial response and collected evidences to produce the final, refined answer. We conduct experiments across five open-source LLMs and four datasets, demonstrating that VeriCite can significantly improve citation quality while maintaining the correctness of the answers.",
    "source_database": "arxiv",
    "arxiv_id": "2510.11394v1"
  },
  {
    "title": "Relational Action Bases: Formalization, Effective Safety Verification, and Invariants (Extended Version)",
    "authors": [
      "Silvio Ghilardi",
      "Alessandro Gianola",
      "Marco Montali",
      "Andrey Rivkin"
    ],
    "year": "2022",
    "journal": "arXiv:2208.06377v2",
    "doi": "",
    "abstract": "Modeling and verification of dynamic systems operating over a relational representation of states are increasingly investigated problems in AI, Business Process Management, and Database Theory. To make these systems amenable to verification, the amount of information stored in each relational state needs to be bounded, or restrictions are imposed on the preconditions and effects of actions. We introduce the general framework of relational action bases (RABs), which generalizes existing models by lifting both these restrictions: unbounded relational states can be evolved through actions that can quantify both existentially and universally over the data, and that can exploit numerical datatypes with arithmetic predicates. We then study parameterized safety of RABs via (approximated) SMT-based backward search, singling out essential meta-properties of the resulting procedure, and showing how it can be realized by an off-the-shelf combination of existing verification modules of the state-of-the-art MCMT model checker. We demonstrate the effectiveness of this approach on a benchmark of data-aware business processes. Finally, we show how universal invariants can be exploited to make this procedure fully correct.",
    "source_database": "arxiv",
    "arxiv_id": "2208.06377v2"
  },
  {
    "title": "Reason and Verify: A Framework for Faithful Retrieval-Augmented Generation",
    "authors": [
      "Eeham Khan",
      "Luis Rodriguez",
      "Marc Queudot"
    ],
    "year": "2026",
    "journal": "arXiv:2603.10143v1",
    "doi": "",
    "abstract": "Retrieval-Augmented Generation (RAG) significantly improves the factuality of Large Language Models (LLMs), yet standard pipelines often lack mechanisms to verify intermediate reasoning, leaving them vulnerable to hallucinations in high-stakes domains. To address this, we propose a domain-specific RAG framework that integrates explicit reasoning and faithfulness verification. Our architecture augments standard retrieval with neural query rewriting, BGE-based cross-encoder reranking, and a rationale generation module that grounds sub-claims in specific evidence spans. We further introduce an eight-category verification taxonomy that enables fine-grained assessment of rationale faithfulness, distinguishing between explicit and implicit support patterns to facilitate structured error diagnosis. We evaluate this framework on the BioASQ and PubMedQA benchmarks, specifically analyzing the impact of dynamic in-context learning and reranking under constrained token budgets. Experiments demonstrate that explicit rationale generation improves accuracy over vanilla RAG baselines, while dynamic demonstration selection combined with robust reranking yields further gains in few-shot settings. Using Llama-3-8B-Instruct, our approach achieves 89.1% on BioASQ-Y/N and 73.0% on PubMedQA, competitive with systems using significantly larger models. Additionally, we perform a pilot study combining human expert assessment with LLM-based verification to explore how explicit rationale generation improves system transparency and enables more detailed diagnosis of retrieval failures in biomedical question answering.",
    "source_database": "arxiv",
    "arxiv_id": "2603.10143v1"
  },
  {
    "title": "Hybrid-Code v2: Zero-Hallucination Clinical ICD-10 Coding via Neuro-Symbolic Verification and Automated Knowledge Base Expansion",
    "authors": [
      "Yunguo Yu"
    ],
    "year": "2025",
    "journal": "arXiv:2512.23743v2",
    "doi": "",
    "abstract": "Automated clinical ICD-10 coding is a high-impact healthcare task requiring a balance between coverage, precision, and safety. While neural approaches achieve strong performance, they suffer from hallucination-generating invalid or unsupported codes-posing unacceptable risks in safety-critical clinical settings. Rule-based systems eliminate hallucination but lack scalability and coverage due to manual knowledge base (KB) curation.   We present Hybrid-Code v2, a neuro-symbolic framework that achieves zero Type-I hallucination by construction while maintaining competitive coverage and precision. The system integrates neural candidate generation with a symbolic KB verification layer that enforces validity constraints through multi-layer verification, including format, evidence grounding, negation detection, temporal consistency, and exclusion rules. In addition, we introduce an automated KB expansion mechanism that extracts and validates coding patterns from unlabeled clinical text, addressing the scalability limitations of rule-based systems.   Evaluated on the MIMIC-III dataset against ClinicalBERT, BioBERT, rule-based systems, and GPT-4, Hybrid-Code v2 achieves 85% coverage, 92% precision, and 0% Type-I hallucination, outperforming rule-based systems by +40% coverage while eliminating hallucination observed in neural baselines (6-18%). The proposed architecture provides a formal safety guarantee for syntactic validity while preserving strong empirical performance.   These results demonstrate that neuro-symbolic verification can enforce safety constraints in neural medical AI systems without sacrificing effectiveness, offering a generalizable design pattern for deploying trustworthy AI in safety-critical domains.",
    "source_database": "arxiv",
    "arxiv_id": "2512.23743v2"
  },
  {
    "title": "Video Enriched Retrieval Augmented Generation Using Aligned Video Captions",
    "authors": [
      "Kevin Dela Rosa"
    ],
    "year": "2024",
    "journal": "arXiv:2405.17706v1",
    "doi": "",
    "abstract": "In this work, we propose the use of \"aligned visual captions\" as a mechanism for integrating information contained within videos into retrieval augmented generation (RAG) based chat assistant systems. These captions are able to describe the visual and audio content of videos in a large corpus while having the advantage of being in a textual format that is both easy to reason about & incorporate into large language model (LLM) prompts, but also typically require less multimedia content to be inserted into the multimodal LLM context window, where typical configurations can aggressively fill up the context window by sampling video frames from the source video. Furthermore, visual captions can be adapted to specific use cases by prompting the original foundational model / captioner for particular visual details or fine tuning. In hopes of helping advancing progress in this area, we curate a dataset and describe automatic evaluation procedures on common RAG tasks.",
    "source_database": "arxiv",
    "arxiv_id": "2405.17706v1"
  },
  {
    "title": "Expert Mind: A Retrieval-Augmented Architecture for Expert Knowledge Preservation in the Energy Sector",
    "authors": [
      "Diego Ezequiel Cervera"
    ],
    "year": "2026",
    "journal": "arXiv:2603.14541v1",
    "doi": "",
    "abstract": "The departure of subject-matter experts from industrial organizations results in the irreversible loss of tacit knowledge that is rarely captured through conventional documentation practices. This paper proposes Expert Mind, an experimental system that leverages Retrieval-Augmented Generation (RAG), large language models (LLMs), and multimodal capture techniques to preserve, structure, and make queryable the deep expertise of organizational knowledge holders. Drawing on the specific context of the energy sector, where decades of operational experience risk being lost to an aging workforce, we describe the system architecture, processing pipeline, ethical framework, and evaluation methodology. The proposed system addresses the knowledge elicitation problem through structured interviews, think-aloud sessions, and text corpus ingestion, which are subsequently embedded into a vector store and queried through a conversational interface. Preliminary design considerations suggest Expert Mind can significantly reduce knowledge transfer latency and improve onboarding efficiency. Ethical dimensions including informed consent, intellectual property, and the right to erasure are addressed as first-class design constraints.",
    "source_database": "arxiv",
    "arxiv_id": "2603.14541v1"
  },
  {
    "title": "Tree of Reviews: A Tree-based Dynamic Iterative Retrieval Framework for Multi-hop Question Answering",
    "authors": [
      "Li Jiapeng",
      "Liu Runze",
      "Li Yabo",
      "Zhou Tong",
      "Li Mingling",
      "Chen Xiang"
    ],
    "year": "2024",
    "journal": "arXiv:2404.14464v1",
    "doi": "",
    "abstract": "Multi-hop question answering is a knowledge-intensive complex problem. Large Language Models (LLMs) use their Chain of Thoughts (CoT) capability to reason complex problems step by step, and retrieval-augmentation can effectively alleviate factual errors caused by outdated and unknown knowledge in LLMs. Recent works have introduced retrieval-augmentation in the CoT reasoning to solve multi-hop question answering. However, these chain methods have the following problems: 1) Retrieved irrelevant paragraphs may mislead the reasoning; 2) An error in the chain structure may lead to a cascade of errors.   In this paper, we propose a dynamic retrieval framework called Tree of Reviews (ToR), where the root node is the question, and the other nodes are paragraphs from retrieval, extending different reasoning paths from the root node to other nodes. Our framework dynamically decides to initiate a new search, reject, or accept based on the paragraphs on the reasoning paths. Compared to related work, we introduce a tree structure to handle each retrieved paragraph separately, alleviating the misleading effect of irrelevant paragraphs on the reasoning path; the diversity of reasoning path extension reduces the impact of a single reasoning error on the whole. We conducted experiments on three different multi-hop question answering datasets. The results show that compared to the baseline methods, ToR achieves state-of-the-art performance in both retrieval and response generation. In addition, we propose two tree-based search optimization strategies, pruning and effective expansion, to reduce time overhead and increase the diversity of path extension. We will release our code.",
    "source_database": "arxiv",
    "arxiv_id": "2404.14464v1"
  },
  {
    "title": "AlzheimerRAG: Multimodal Retrieval Augmented Generation for Clinical Use Cases using PubMed articles",
    "authors": [
      "Aritra Kumar Lahiri",
      "Qinmin Vivian Hu"
    ],
    "year": "2024",
    "journal": "arXiv:2412.16701v3",
    "doi": "https://doi.org/10.3390/make7030089",
    "abstract": "Recent advancements in generative AI have fostered the development of highly adept Large Language Models (LLMs) that integrate diverse data types to empower decision-making. Among these, multimodal retrieval-augmented generation (RAG) applications are promising because they combine the strengths of information retrieval and generative models, enhancing their utility across various domains, including clinical use cases. This paper introduces AlzheimerRAG, a Multimodal RAG application for clinical use cases, primarily focusing on Alzheimer's Disease case studies from PubMed articles. This application incorporates cross-modal attention fusion techniques to integrate textual and visual data processing by efficiently indexing and accessing vast amounts of biomedical literature. Our experimental results, compared to benchmarks such as BioASQ and PubMedQA, have yielded improved performance in the retrieval and synthesis of domain-specific information. We also present a case study using our multimodal RAG in various Alzheimer's clinical scenarios. We infer that AlzheimerRAG can generate responses with accuracy non-inferior to humans and with low rates of hallucination.",
    "source_database": "arxiv",
    "arxiv_id": "2412.16701v3"
  },
  {
    "title": "Synchronous Faithfulness Monitoring for Trustworthy Retrieval-Augmented Generation",
    "authors": [
      "Di Wu",
      "Jia-Chen Gu",
      "Fan Yin",
      "Nanyun Peng",
      "Kai-Wei Chang"
    ],
    "year": "2024",
    "journal": "arXiv:2406.13692v2",
    "doi": "",
    "abstract": "Retrieval-augmented language models (RALMs) have shown strong performance and wide applicability in knowledge-intensive tasks. However, there are significant trustworthiness concerns as RALMs are prone to generating unfaithful outputs, including baseless information or contradictions with the retrieved context. This paper proposes SynCheck, a lightweight monitor that leverages fine-grained decoding dynamics including sequence likelihood, uncertainty quantification, context influence, and semantic alignment to synchronously detect unfaithful sentences. By integrating efficiently measurable and complementary signals, SynCheck enables accurate and immediate feedback and intervention, achieving 0.85 AUROC in detecting faithfulness errors across six long-form retrieval-augmented generation tasks, improving prior best method by 4%. Leveraging SynCheck, we further introduce FOD, a faithfulness-oriented decoding algorithm guided by beam search for long-form retrieval-augmented generation. Empirical results demonstrate that FOD outperforms traditional strategies such as abstention, reranking, or contrastive decoding significantly in terms of faithfulness, achieving over 10% improvement across six datasets.",
    "source_database": "arxiv",
    "arxiv_id": "2406.13692v2"
  },
  {
    "title": "Retrieval-Augmented and Knowledge-Grounded Language Models for Faithful Clinical Medicine",
    "authors": [
      "Fenglin Liu",
      "Bang Yang",
      "Chenyu You",
      "Xian Wu",
      "Shen Ge",
      "Zhangdaihong Liu",
      "Xu Sun",
      "Yang Yang",
      "David A. Clifton"
    ],
    "year": "2022",
    "journal": "arXiv:2210.12777v4",
    "doi": "",
    "abstract": "Language models (LMs), including large language models (such as ChatGPT), have the potential to assist clinicians in generating various clinical notes. However, LMs are prone to produce ``hallucinations'', i.e., generated content that is not aligned with facts and knowledge. In this paper, we propose the Re$^3$Writer method with retrieval-augmented generation and knowledge-grounded reasoning to enable LMs to generate faithful clinical texts. We demonstrate the effectiveness of our method in generating patient discharge instructions. It requires the LMs not to only understand the patients' long clinical documents, i.e., the health records during hospitalization, but also to generate critical instructional information provided both to carers and to the patient at the time of discharge. The proposed Re$^3$Writer imitates the working patterns of physicians to first \\textbf{re}trieve related working experience from historical instructions written by physicians, then \\textbf{re}ason related medical knowledge. Finally, it \\textbf{re}fines the retrieved working experience and reasoned medical knowledge to extract useful information, which is used to generate the discharge instructions for previously-unseen patients. Our experiments show that, using our method, the performance of five representative LMs can be substantially boosted across all metrics. Meanwhile, we show results from human evaluations to measure the effectiveness in terms of fluency, faithfulness, and comprehensiveness.",
    "source_database": "arxiv",
    "arxiv_id": "2210.12777v4"
  },
  {
    "title": "Multi-Task Retrieval-Augmented Text Generation with Relevance Sampling",
    "authors": [
      "Sebastian Hofstätter",
      "Jiecao Chen",
      "Karthik Raman",
      "Hamed Zamani"
    ],
    "year": "2022",
    "journal": "arXiv:2207.03030v1",
    "doi": "",
    "abstract": "This paper studies multi-task training of retrieval-augmented generation models for knowledge-intensive tasks. We propose to clean the training set by utilizing a distinct property of knowledge-intensive generation: The connection of query-answer pairs to items in the knowledge base. We filter training examples via a threshold of confidence on the relevance labels, whether a pair is answerable by the knowledge base or not. We train a single Fusion-in-Decoder (FiD) generator on seven combined tasks of the KILT benchmark. The experimental results suggest that our simple yet effective approach substantially improves competitive baselines on two strongly imbalanced tasks; and shows either smaller improvements or no significant regression on the remaining tasks. Furthermore, we demonstrate our multi-task training with relevance label sampling scales well with increased model capacity and achieves state-of-the-art results in five out of seven KILT tasks.",
    "source_database": "arxiv",
    "arxiv_id": "2207.03030v1"
  },
  {
    "title": "Case Study: Runtime Safety Verification of Neural Network Controlled System",
    "authors": [
      "Frank Yang",
      "Sinong Simon Zhan",
      "Yixuan Wang",
      "Chao Huang",
      "Qi Zhu"
    ],
    "year": "2024",
    "journal": "arXiv:2408.08592v1",
    "doi": "",
    "abstract": "Neural networks are increasingly used in safety-critical applications such as robotics and autonomous vehicles. However, the deployment of neural-network-controlled systems (NNCSs) raises significant safety concerns. Many recent advances overlook critical aspects of verifying control and ensuring safety in real-time scenarios. This paper presents a case study on using POLAR-Express, a state-of-the-art NNCS reachability analysis tool, for runtime safety verification in a Turtlebot navigation system using LiDAR. The Turtlebot, equipped with a neural network controller for steering, operates in a complex environment with obstacles. We developed a safe online controller switching strategy that switches between the original NNCS controller and an obstacle avoidance controller based on the verification results. Our experiments, conducted in a ROS2 Flatland simulation environment, explore the capabilities and limitations of using POLAR-Express for runtime verification and demonstrate the effectiveness of our switching strategy.",
    "source_database": "arxiv",
    "arxiv_id": "2408.08592v1"
  },
  {
    "title": "Fast and Faithful: Real-Time Verification for Long-Document Retrieval-Augmented Generation Systems",
    "authors": [
      "Xunzhuo Liu",
      "Bowei He",
      "Xue Liu",
      "Haichen Zhang",
      "Huamin Chen"
    ],
    "year": "2026",
    "journal": "arXiv:2603.23508v1",
    "doi": "",
    "abstract": "Retrieval-augmented generation (RAG) is increasingly deployed in enterprise search and document-centric assistants, where responses must be grounded in long and complex source materials. In practice, verifying that generated answers faithfully reflect retrieved documents is difficult: large language models can check long contexts but are too slow and costly for interactive services, while lightweight classifiers operate within strict context limits and frequently miss evidence outside truncated passages. We present the design of a real-time verification component integrated into a production RAG pipeline that enables full-document grounding under latency constraints. The system processes documents up to 32K tokens and employs adaptive inference strategies to balance response time and verification coverage across workloads. We describe the architectural decisions, operational trade-offs, and evaluation methodology used to deploy the verifier, and show that full-context verification substantially improves detection of unsupported responses compared with truncated validation. Our experience highlights when long-context verification is necessary, why chunk-based checking often fails in real documents, and how latency budgets shape model design. These findings provide practical guidance for practitioners building reliable large-scale retrieval-augmented applications. (Model, benchmark, and code: https://huggingface.co/llm-semantic-router)",
    "source_database": "arxiv",
    "arxiv_id": "2603.23508v1"
  },
  {
    "title": "Unifying Qualitative and Quantitative Safety Verification of DNN-Controlled Systems",
    "authors": [
      "Dapeng Zhi",
      "Peixin Wang",
      "Si Liu",
      "Luke Ong",
      "Min Zhang"
    ],
    "year": "2024",
    "journal": "arXiv:2404.01769v1",
    "doi": "",
    "abstract": "The rapid advance of deep reinforcement learning techniques enables the oversight of safety-critical systems through the utilization of Deep Neural Networks (DNNs). This underscores the pressing need to promptly establish certified safety guarantees for such DNN-controlled systems. Most of the existing verification approaches rely on qualitative approaches, predominantly employing reachability analysis. However, qualitative verification proves inadequate for DNN-controlled systems as their behaviors exhibit stochastic tendencies when operating in open and adversarial environments. In this paper, we propose a novel framework for unifying both qualitative and quantitative safety verification problems of DNN-controlled systems. This is achieved by formulating the verification tasks as the synthesis of valid neural barrier certificates (NBCs). Initially, the framework seeks to establish almost-sure safety guarantees through qualitative verification. In cases where qualitative verification fails, our quantitative verification method is invoked, yielding precise lower and upper bounds on probabilistic safety across both infinite and finite time horizons. To facilitate the synthesis of NBCs, we introduce their $k$-inductive variants. We also devise a simulation-guided approach for training NBCs, aiming to achieve tightness in computing precise certified lower and upper bounds. We prototype our approach into a tool called $\\textsf{UniQQ}$ and showcase its efficacy on four classic DNN-controlled systems.",
    "source_database": "arxiv",
    "arxiv_id": "2404.01769v1"
  },
  {
    "title": "Faithfulness-Aware Uncertainty Quantification for Fact-Checking the Output of Retrieval Augmented Generation",
    "authors": [
      "Ekaterina Fadeeva",
      "Aleksandr Rubashevskii",
      "Dzianis Piatrashyn",
      "Roman Vashurin",
      "Shehzaad Dhuliawala",
      "Artem Shelmanov",
      "Timothy Baldwin",
      "Preslav Nakov",
      "Mrinmaya Sachan",
      "Maxim Panov"
    ],
    "year": "2025",
    "journal": "arXiv:2505.21072v3",
    "doi": "",
    "abstract": "Large Language Models (LLMs) enhanced with retrieval, an approach known as Retrieval-Augmented Generation (RAG), have achieved strong performance in open-domain question answering. However, RAG remains prone to hallucinations: factually incorrect outputs may arise from inaccuracies in the model's internal knowledge and the retrieved context. Existing approaches to mitigating hallucinations often conflate factuality with faithfulness to the retrieved evidence, incorrectly labeling factually correct statements as hallucinations if they are not explicitly supported by the retrieval. In this paper, we introduce FRANQ, a new method for hallucination detection in RAG outputs. FRANQ applies distinct uncertainty quantification (UQ) techniques to estimate factuality, conditioning on whether a statement is faithful to the retrieved context. To evaluate FRANQ and competing UQ methods, we construct a new long-form question answering dataset annotated for both factuality and faithfulness, combining automated labeling with manual validation of challenging cases. Extensive experiments across multiple datasets, tasks, and LLMs show that FRANQ achieves more accurate detection of factual errors in RAG-generated responses compared to existing approaches.",
    "source_database": "arxiv",
    "arxiv_id": "2505.21072v3"
  },
  {
    "title": "DeepCodeSeek: Real-Time API Retrieval for Context-Aware Code Generation",
    "authors": [
      "Esakkivel Esakkiraja",
      "Denis Akhiyarov",
      "Aditya Shanmugham",
      "Chitra Ganapathy"
    ],
    "year": "2025",
    "journal": "arXiv:2509.25716v1",
    "doi": "",
    "abstract": "Current search techniques are limited to standard RAG query-document applications. In this paper, we propose a novel technique to expand the code and index for predicting the required APIs, directly enabling high-quality, end-to-end code generation for auto-completion and agentic AI applications. We address the problem of API leaks in current code-to-code benchmark datasets by introducing a new dataset built from real-world ServiceNow Script Includes that capture the challenge of unclear API usage intent in the code. Our evaluation metrics show that this method achieves 87.86% top-40 retrieval accuracy, allowing the critical context with APIs needed for successful downstream code generation. To enable real-time predictions, we develop a comprehensive post-training pipeline that optimizes a compact 0.6B reranker through synthetic dataset generation, supervised fine-tuning, and reinforcement learning. This approach enables our compact reranker to outperform a much larger 8B model while maintaining 2.5x reduced latency, effectively addressing the nuances of enterprise-specific code without the computational overhead of larger models.",
    "source_database": "arxiv",
    "arxiv_id": "2509.25716v1"
  },
  {
    "title": "Retrieving, Rethinking and Revising: The Chain-of-Verification Can Improve Retrieval Augmented Generation",
    "authors": [
      "Bolei He",
      "Nuo Chen",
      "Xinran He",
      "Lingyong Yan",
      "Zhenkai Wei",
      "Jinchang Luo",
      "Zhen-Hua Ling"
    ],
    "year": "2024",
    "journal": "arXiv:2410.05801v1",
    "doi": "",
    "abstract": "Recent Retrieval Augmented Generation (RAG) aims to enhance Large Language Models (LLMs) by incorporating extensive knowledge retrieved from external sources. However, such approach encounters some challenges: Firstly, the original queries may not be suitable for precise retrieval, resulting in erroneous contextual knowledge; Secondly, the language model can easily generate inconsistent answer with external references due to their knowledge boundary limitation. To address these issues, we propose the chain-of-verification (CoV-RAG) to enhance the external retrieval correctness and internal generation consistency. Specifically, we integrate the verification module into the RAG, engaging in scoring, judgment, and rewriting. To correct external retrieval errors, CoV-RAG retrieves new knowledge using a revised query. To correct internal generation errors, we unify QA and verification tasks with a Chain-of-Thought (CoT) reasoning during training. Our comprehensive experiments across various LLMs demonstrate the effectiveness and adaptability compared with other strong baselines. Especially, our CoV-RAG can significantly surpass the state-of-the-art baselines using different LLM backbones.",
    "source_database": "arxiv",
    "arxiv_id": "2410.05801v1"
  }
]