[
  {
    "title": "\u201cA Midsummer Night\u2019s Dream\u201d quest for truth: From ChatGPT \u201challucinations\u201d to RAG reasoning and ACURAI precision \u2014 a scoping review on detection, minimizing, and (almost) complete error elimination and enhancing Large Language Models' reliability",
    "authors": [
      "A. Anghelescu",
      "Constantin Munteanu",
      "Lucia Ana Maria Anghelescu",
      "G. Onose"
    ],
    "year": "2025",
    "journal": "Balneo and PRM Research Journal",
    "doi": "10.12680/balneo.2025.847",
    "pmid": "",
    "abstract": "Like A Midsummer Night\u2019s Dream, large language models (LLMs) exhibit vast imagination, drawing on massive training datasets. However, they may fabricate or mix information, lacking mechanisms to verify real-world sources. Most commercial LLMs, including those used in medicine, remain prone to hallucinations\u2014plausible but false content. Retrieval-Augmented Generation (RAG) aims to address this by grounding LLM outputs in real-time access to verified sources like scientific databases. A 2023\u20132025 PubMed search identified 91 papers on RAG and LLM applications across biomedical domains; 78 were useful for our paper, addressing medical domains. RAG techniques significantly reduce hallucinations by ensuring that only validated information informs model outputs. ACURAI, an advanced system based on \u201cphrase dominance and discrete functional units (DFUs),\u201d further enhances LLM accuracy. Tested on a novel \u201cRAG-Truth Dataset Caveats,\u201d ACURAI eliminated 91\u2013100% of junk outputs in GPT-3.5 and GPT-4. While LLMs can resemble Puck (creative yet unreliable), ACURAI, aided by RAG, acts more like Theseus, grounding answers in verified data. This framework strengthens the possible role of LLMs in clinical diagnosis, academic writing, and patient education, offering a practical path toward safer and more accurate medical AI. Ultimately, human oversight remains key to interpreting and validating AI-generated outputs.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "An Agentic Hybrid LLM\u2013RAG Framework for Explainable Clinical Decision Support",
    "authors": [
      "Mohammed Kapadia",
      "Mohammed Memon",
      "P. Mishra",
      "S. Okuhara"
    ],
    "year": "2026",
    "journal": "Proceedings of the 18th International Conference on Agents and Artificial Intelligence",
    "doi": "10.5220/0014459100004052",
    "pmid": "",
    "abstract": "The fast evolution of Large Language Models (LLMs) has provided new opportunities to intelligent Clinical Decision Support Systems (CDSS), but such issues as hallucination, absent interpretability, and poor factual foundation still exist. This paper proposes a Hybrid LLM-Retrieval Augmented Generation (RAG) model of evidence-based clinical reasoning, which would combine transformer-based contextual understanding and retrieval-based factual verification. This system is an agentic design and consists of four collaborative components, namely Retriever, Transformer Encoder, Generator, and Evaluator Agents that together guarantee accuracy, interpretability, and transparency. The suggested framework transforms the diagnostic reasoning into a probabilistic optimisation problem, and the recommendations are conditionalized by multimodal patient data and top-k evidence obtained in the biomedical literature. A composite loss is a loss that optimises diagnostic accuracy, semantic consistency and factual faithfulness. Experimental validation on benchmark datasets, such as MIMIC-III, PubMedQA and ADReSSo 2021, performs better than current models, including BioBERT, GPT-3.5 and Med-PaLM 2, with 93.7% accuracy, 0.926 AUROC, and 68% reduction in rate of hallucination. The findings prove that the Hybrid LLM-RAG model is feasible in the context of aligning linguistic fluency and clinical reliability, developing a reliable AI-based decision support in healthcare-related applications. Although the findings are promising, they are achieved in controlled experimental conditions and do not demonstrate competitive performance assertion.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "MedTrust-RAG: Evidence Verification and Trust Alignment for Biomedical Question Answering",
    "authors": [
      "Yingpeng Ning",
      "Yuanyuan Sun",
      "Ling Luo",
      "Yanhua Wang",
      "Yuchen Pan",
      "Hongfei Lin"
    ],
    "year": "2025",
    "journal": "arXiv:2510.14400v2",
    "doi": "",
    "abstract": "Biomedical question answering (QA) requires accurate interpretation of complex medical knowledge. Large language models (LLMs) have shown promising capabilities in this domain, with retrieval-augmented generation (RAG) systems enhancing performance by incorporating external medical literature. However, RAG-based approaches in biomedical QA suffer from hallucinations due to post-retrieval noise and insufficient verification of retrieved evidence, undermining response reliability. We propose MedTrust-Guided Iterative RAG, a framework designed to enhance factual consistency and mitigate hallucinations in medical QA. Our method introduces three key innovations. First, it enforces citation-aware reasoning by requiring all generated content to be explicitly grounded in retrieved medical documents, with structured Negative Knowledge Assertions used when evidence is insufficient. Second, it employs an iterative retrieval-verification process, where a verification agent assesses evidence adequacy and refines queries through Medical Gap Analysis until reliable information is obtained. Third, it integrates the MedTrust-Align Module (MTAM) that combines verified positive examples with hallucination-aware negative samples, leveraging Direct Preference Optimization to reinforce citation-grounded reasoning while penalizing hallucination-prone response patterns.",
    "source_database": "arxiv",
    "arxiv_id": "2510.14400v2"
  },
  {
    "title": "LLMs and LVMs for agentic AI: a GPU-accelerated multimodal system architecture for RAG-grounded, explainable, and adaptive intelligence",
    "authors": [
      "Kiarash Ahi",
      "Chih-Hung Hsieh",
      "G. Fenger"
    ],
    "year": "2025",
    "journal": "",
    "doi": "10.1117/12.3078485",
    "pmid": "",
    "abstract": "This paper presents an architecture for an Agentic AI System that autonomously operates and manages complex workflows across enterprise and industrial software ecosystems such as Electronic Design Automation (EDA) tools (e.g., Siemens Calibre), Product Lifecycle Management (PLM) and Digital Twin platforms (e.g., Teamcenter Digital Reality Viewer), as well as knowledge-centric domains including HR analytics, financial modeling, healthcare diagnostics, and creative design platforms. This architecture leverages a multi-agent framework orchestrated by a central planner, integrating large language model (LLM) and large vision model (LVM) reasoning for multimodal understanding, retrieval-augmented generation (RAG) pipelines, and enterprise-grade governance to enable secure, explainable, and adaptive automation across both physical and virtual product lifecycle stages. The architecture is structured as a nine-layer intelligent stack, beginning with a natural language interface and extending through layers responsible for cognitive orchestration, specialized agents, contextual retrieval, reasoning, tool execution, security, access control, and feedback-driven learning. Users issue high-level intents\u2014such as \u201crun DRC and fix critical violations\u201d or \u201csynchronize the latest design update with the digital twin\u201d\u2014 which are interpreted by the planner agent and decomposed into sub-tasks. These are executed by specialized agents (e.g., simulation, review, or action agents), each interfacing securely with industrial tools and twin environments through sandboxed runtimes and version-controlled APIs. The planner dynamically adjusts task decomposition and agent routing based on resource constraints, latency budgets, and model confidence, enabling adaptive, performance-aware orchestration. Beyond industrial and engineering use cases, the same agentic architecture generalizes to broader enterprise workflows. In HR and finance, autonomous agents extract insights from structured and unstructured data, improve forecasting accuracy, and ensure regulatory compliance. In healthcare, multimodal reasoning that fuses text, imagery, and sensor data can assist clinicians in diagnosis and treatment planning while maintaining explainability. In creative and design environments, agentic co-pilots interpret user intent, generate assets, and optimize iterative design loops\u2014enhancing both productivity and human creativity. A core RAG layer grounds decisions in proprietary engineering knowledge (e.g., PDK rules, fab specifications, simulation logs, and historical twin data), while a chunk reranker ensures only the most relevant context is injected into LLM prompts. This RAG pipeline supports fast memory access, context pruning, and scalable grounding across high-volume logs and digital twin telemetry. This grounding layer can be extended to any domain where contextual reasoning over proprietary knowledge is critical\u2014ranging from clinical data repositories and enterprise ERPs to document archives and financial transaction graphs. To support this architecture\u2019s adaptive orchestration and multimodal agent execution, performance-optimized inference becomes critical. To meet the latency, throughput, and scalability demands of large-scale multimodal reasoning, the system incorporates GPU-accelerated inference pipelines, including ROI-guided compression and adaptive latent-space clustering to reduce computational overhead while preserving output fidelity. These GPU-accelerated strategies are based on the ROI-LCC framework, which integrates dynamic Region of Interest (ROI) selection, latentspace clustering, and learned GPU feature extraction to minimize redundancy and streamline computation. Outputs are processed through a guardrails and explainability (XAI) layer that filters unsafe content, validates decisions, and generates structured audit trails. The system includes a Human-in-the-Loop (HITL) mechanism to review high-impact or real-world synchronized actions before execution. These optimizations\u2014originally developed and validated on nanometer-resolution SEM imagery exhibiting nanoscale noise, low SNR, and extreme visual detail\u2014enable robust, high-throughput inference in compute-constrained scenarios such as EUV lithography and biomedical diagnostics. This architecture has been integrated into Calibre SEMSuite\u2122, demonstrating readiness for real-world deployment in precision-critical industrial environments. The architecture supports real-time telemetry, bias and drift detection, and a data flywheel that captures feedback and performance metrics to continuously refine agent behavior, prompt strategies, and model accuracy. Designed for hybrid on-prem/cloud deployment and compliant with RBAC/ABAC enterprise security policies, this system ensures scalability, transparency, and governance continuity across industrial, enterprise, and domain-specific ecosystems\u2014from design and manufacturing to financial analytics, healthcare diagnostics, HR operations, and creative content pipelines. Collectively, these capabilities position the architecture as a generalized substrate for enterprise-scale intelligence orchestration. It not only automates workflows but also augments human decision-making, improves analytical accuracy, and accelerates creativity across sectors\u2014bridging cognitive reasoning, multimodal perception, and secure execution. By unifying LLM reasoning and LVM orchestration, GPU-accelerated inference, grounded retrieval, digital twin synchronization, tool integration, and enterprise governance within a modular agentic framework, this system transforms traditional industrial software into an intelligent, auditable, and self-improving co-pilot\u2014accelerating design cycles, enhancing reliability, and bridging the gap between virtual models and physical systems through autonomous, explainable decision orchestration. These optimizations make the architecture suitable for deployment in latency-sensitive, compute-constrained industrial scenarios, including edge-assisted digital twin environments and high-throughput simulation workflows, as well as knowledge-driven enterprise systems that demand adaptive, explainable, and human-aligned intelligence.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Reason and Verify: A Framework for Faithful Retrieval-Augmented Generation",
    "authors": [
      "Eeham Khan",
      "Luis Rodriguez",
      "Marc Queudot"
    ],
    "year": "2026",
    "journal": "arXiv:2603.10143v1",
    "doi": "",
    "abstract": "Retrieval-Augmented Generation (RAG) significantly improves the factuality of Large Language Models (LLMs), yet standard pipelines often lack mechanisms to verify intermediate reasoning, leaving them vulnerable to hallucinations in high-stakes domains. To address this, we propose a domain-specific RAG framework that integrates explicit reasoning and faithfulness verification. Our architecture augments standard retrieval with neural query rewriting, BGE-based cross-encoder reranking, and a rationale generation module that grounds sub-claims in specific evidence spans. We further introduce an eight-category verification taxonomy that enables fine-grained assessment of rationale faithfulness, distinguishing between explicit and implicit support patterns to facilitate structured error diagnosis. We evaluate this framework on the BioASQ and PubMedQA benchmarks, specifically analyzing the impact of dynamic in-context learning and reranking under constrained token budgets. Experiments demonstrate that explicit rationale generation improves accuracy over vanilla RAG baselines, while dynamic demonstration selection combined with robust reranking yields further gains in few-shot settings. Using Llama-3-8B-Instruct, our approach achieves 89.1% on BioASQ-Y/N and 73.0% on PubMedQA, competitive with systems using significantly larger models. Additionally, we perform a pilot study combining human expert assessment with LLM-based verification to explore how explicit rationale generation improves system transparency and enables more detailed diagnosis of retrieval failures in biomedical question answering.",
    "source_database": "arxiv",
    "arxiv_id": "2603.10143v1"
  },
  {
    "title": "Retrieval-Augmented and Knowledge-Grounded Language Models for Faithful Clinical Medicine",
    "authors": [
      "Fenglin Liu",
      "Bang Yang",
      "Chenyu You",
      "Xian Wu",
      "Shen Ge",
      "Zhangdaihong Liu",
      "Xu Sun",
      "Yang Yang",
      "David A. Clifton"
    ],
    "year": "2022",
    "journal": "arXiv:2210.12777v4",
    "doi": "",
    "abstract": "Language models (LMs), including large language models (such as ChatGPT), have the potential to assist clinicians in generating various clinical notes. However, LMs are prone to produce ``hallucinations'', i.e., generated content that is not aligned with facts and knowledge. In this paper, we propose the Re$^3$Writer method with retrieval-augmented generation and knowledge-grounded reasoning to enable LMs to generate faithful clinical texts. We demonstrate the effectiveness of our method in generating patient discharge instructions. It requires the LMs not to only understand the patients' long clinical documents, i.e., the health records during hospitalization, but also to generate critical instructional information provided both to carers and to the patient at the time of discharge. The proposed Re$^3$Writer imitates the working patterns of physicians to first \\textbf{re}trieve related working experience from historical instructions written by physicians, then \\textbf{re}ason related medical knowledge. Finally, it \\textbf{re}fines the retrieved working experience and reasoned medical knowledge to extract useful information, which is used to generate the discharge instructions for previously-unseen patients. Our experiments show that, using our method, the performance of five representative LMs can be substantially boosted across all metrics. Meanwhile, we show results from human evaluations to measure the effectiveness in terms of fluency, faithfulness, and comprehensiveness.",
    "source_database": "arxiv",
    "arxiv_id": "2210.12777v4"
  },
  {
    "title": "Integrating Fine-Tuning and Retrieval-Augmented Generation for Healthcare AI Systems: A Scoping Review.",
    "authors": [
      "Collaco BG",
      "Srinivasagam P",
      "Gomez-Cabello CA",
      "Haider SA",
      "Genovese A",
      "Wood NG",
      "Bagaria S",
      "Lifson MA",
      "Forte AJ"
    ],
    "year": "2026",
    "journal": "Bioengineering (Basel, Switzerland)",
    "doi": "10.3390/bioengineering13020225",
    "pmid": "41749764",
    "abstract": "(1) Background: Large language models (LLMs) show promise in healthcare but are constrained by hallucinations, static knowledge, and limited domain specificity. Fine-tuning (FT) and retrieval-augmented generation (RAG) offer complementary solutions, with FT embedding domain reasoning and RAG enabling dynamic, up-to-date knowledge access. Hybrid FT + RAG frameworks have been proposed to improve factual accuracy and clinical reliability. This scoping review synthesizes current evidence on such hybrids in healthcare AI. (2) Methods: The search across PubMed, IEEE Xplore, Google Scholar, and Embase identified studies implementing explicit FT + RAG hybrids in healthcare or biomedical tasks. Eligible studies reported empirical evaluations of LLM performance or behavior. Data were extracted on base models, FT strategies, RAG architectures, applications, and performance outcomes. (3) Results: Seven studies met inclusion criteria. FT + RAG systems consistently outperformed FT-only or RAG-only approaches across QA, clinical summarization, report generation, and decision support tasks. Parameter-efficient FT methods (e.g., LoRA) were common, while RAG implementations varied (dense, hybrid, hierarchical, multimodal, federated). Reported benefits included improved accuracy, reduced hallucination, and greater clinician preference and feasibility in protected settings. (4) Conclusions: FT + RAG frameworks represent a promising direction for clinically grounded healthcare AI, combining domain-specific reasoning with transparent, up-to-date retrieval. Future work should prioritize standardized evaluation, workflow integration, and governance to enable safe deployment.",
    "source_database": "pubmed"
  },
  {
    "title": "RAG-Guardrails Integration for AI Content Control",
    "authors": [
      "R. More"
    ],
    "year": "2025",
    "journal": "Proceedings of the 2025 18th International Conference on Computer Science and Information Technology",
    "doi": "10.1145/3783862.3783896",
    "pmid": "",
    "abstract": "Generative AI, particularly large language models (LLMs), has shown remarkable potential across domains such as healthcare, legal services, and finance. However, their adoption is hindered by two persistent challenges: hallucination-where models generate factually incorrect information-and the risk of producing biased or unsafe content. This paper proposes a hybrid framework that integrates Retrieval-Augmented Generation (RAG) with NVIDIA NeMo Guardrails to address these concerns. RAG mitigates hallucinations by grounding model outputs in externally retrieved, trusted data sources, while NeMo Guardrails enforce domain-specific safety and compliance constraints through predefined behavioral policies. Empirical evaluations demonstrate that this combined approach reduces hallucinated content by 30\u201345% and improves safety and policy adherence across multiple enterprise use cases. The system exhibits strong potential for deployment in regulated, high-stakes environments. Future work will focus on enhancing real-time responsiveness and expanding multilingual and culturally adaptive capabilities. The proposed framework offers a scalable foundation for building trustworthy, domain-aligned generative AI solutions.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Context-Aware Retrieval-Augmented Generation for Artificial Intelligence in Urology.",
    "authors": [
      "Sriram A",
      "N M",
      "Sundan B",
      "Krishnamoorthy S"
    ],
    "year": "2025",
    "journal": "Cureus",
    "doi": "10.7759/cureus.88167",
    "pmid": "40821282",
    "abstract": "Background Artificial intelligence (AI) is increasingly being used in healthcare, particularly for interpreting complex medical queries. However, conventional AI models often generate inaccurate or irrelevant responses that are commonly termed hallucinations, which may compromise patient safety. To address this, our study introduces a modified retrieval-augmented generation (RAG) framework tailored for the urology domain to enhance contextual relevance and accuracy in AI-generated responses. Methodology We developed a context-aware RAG system integrating PubMedBERT embeddings for encoding and retrieving urological literature stored in a Pinecone vector database. The system uses named entity recognition for domain-specific query filtering and incorporates dynamic memory to retain contextual flow during interactions. Response generation is powered by the LLaMA3-8B model via LangChain. A custom dataset of urology-related queries was used for evaluation, with a large language model-based scoring using the Deepseek-R1 model. Results The proposed framework demonstrated a significant reduction in hallucinations, with responses being more contextually relevant and evidence-based. Compared to baseline models, our system achieved an 89% performance improvement in generating medically appropriate answers. Integration of memory modules and named entity filtering further improved precision and reliability. Conclusions Our RAG-enhanced system shows strong potential for clinical use by producing trustworthy, context-aware responses in urology. It addresses key challenges in medical AI, including hallucination mitigation and domain relevance. Future work will focus on reducing inference latency and improving automated validation without manual oversight.",
    "source_database": "pubmed"
  },
  {
    "title": "RAGing ahead in rheumatology: new language model architectures to tame artificial intelligence.",
    "authors": [
      "Benavent D",
      "Venerito V",
      "Michelena X"
    ],
    "year": "2025",
    "journal": "Therapeutic advances in musculoskeletal disease",
    "doi": "10.1177/1759720X251331529",
    "pmid": "40292012",
    "abstract": "Artificial intelligence (AI) is increasingly transforming rheumatology with research on disease detection, monitoring, and outcome prediction through the analysis of large datasets. The advent of generative models and large language models (LLMs) has expanded AI's capabilities, particularly in natural language processing (NLP) tasks such as question-answering and medical literature synthesis. While NLP has shown promise in identifying rheumatic diseases from electronic health records with high accuracy, LLMs face significant challenges, including hallucinations and a lack of domain-specific knowledge, which limit their reliability in specialized medical fields like rheumatology. Retrieval-augmented generation (RAG) emerges as a solution to these limitations by integrating LLMs with real-time access to external, domain-specific databases. RAG enhances the accuracy and relevance of AI-generated responses by retrieving pertinent information during the generation process, reducing hallucinations, and improving the trustworthiness of AI applications. This architecture allows for precise, context-aware outputs and can handle unstructured data effectively. Despite its success in other industries, the application of RAG in medicine, and specifically in rheumatology, remains underexplored. Potential applications in rheumatology include retrieving up-to-date clinical guidelines, summarizing complex patient histories from unstructured data, aiding in patient identification for clinical trials, enhancing pharmacovigilance efforts, and supporting personalized patient education. RAG also offers advantages in data privacy by enabling local data handling and reducing reliance on large, general-purpose models. Future directions involve integrating RAG with fine-tuned, smaller LLMs and exploring multimodal models that can process diverse data types. Challenges such as infrastructure costs, data privacy concerns, and the need for specialized evaluation metrics must be addressed. Nevertheless, RAG presents a promising opportunity to improve AI applications in rheumatology, offering a more precise, accountable, and sustainable approach to integrating advanced language models into clinical practice and research.",
    "source_database": "pubmed"
  },
  {
    "title": "Integrating Retrieval-Augmented Generation with Large Language Models in Nephrology: Advancing Practical Applications.",
    "authors": [
      "Miao J",
      "Thongprayoon C",
      "Suppadungsuk S",
      "Garcia Valencia OA",
      "Cheungpasitporn W"
    ],
    "year": "2024",
    "journal": "Medicina (Kaunas, Lithuania)",
    "doi": "10.3390/medicina60030445",
    "pmid": "38541171",
    "abstract": "The integration of large language models (LLMs) into healthcare, particularly in nephrology, represents a significant advancement in applying advanced technology to patient care, medical research, and education. These advanced models have progressed from simple text processors to tools capable of deep language understanding, offering innovative ways to handle health-related data, thus improving medical practice efficiency and effectiveness. A significant challenge in medical applications of LLMs is their imperfect accuracy and/or tendency to produce hallucinations-outputs that are factually incorrect or irrelevant. This issue is particularly critical in healthcare, where precision is essential, as inaccuracies can undermine the reliability of these models in crucial decision-making processes. To overcome these challenges, various strategies have been developed. One such strategy is prompt engineering, like the chain-of-thought approach, which directs LLMs towards more accurate responses by breaking down the problem into intermediate steps or reasoning sequences. Another one is the retrieval-augmented generation (RAG) strategy, which helps address hallucinations by integrating external data, enhancing output accuracy and relevance. Hence, RAG is favored for tasks requiring up-to-date, comprehensive information, such as in clinical decision making or educational applications. In this article, we showcase the creation of a specialized ChatGPT model integrated with a RAG system, tailored to align with the KDIGO 2023 guidelines for chronic kidney disease. This example demonstrates its potential in providing specialized, accurate medical advice, marking a step towards more reliable and efficient nephrology practices.",
    "source_database": "pubmed"
  },
  {
    "title": "Artificial Intelligence for Quantitative Finance: A RAG-Augmented Multi-Agent Framework for Robust Equity Strategy Discovery",
    "authors": [
      "Jianfei Wang",
      "Hualin Li"
    ],
    "year": "2025",
    "journal": "Proceedings of the 2025 9th International Conference on Computer Science and Artificial Intelligence",
    "doi": "10.1145/3788149.3788249",
    "pmid": "",
    "abstract": "This paper introduces an AI-driven multi-agent framework for automated quantitative strategy generation and validation, integrating large language model (LLM) agents with Retrieval-Augmented Generation (RAG) to enhance factual accuracy and research reliability. The system orchestrates specialized agents for market analysis, financial knowledge retrieval, feature engineering, strategy construction, backtesting, and performance interpretation, establishing a comprehensive autonomous investment research pipeline. By grounding each agent's reasoning in a curated financial knowledge base, the framework mitigates hallucination risks, improves decision consistency, and provides transparent explanations of trading logic and risk characteristics. Experimental evaluation on simulated equity market data demonstrates that the RAG-enhanced multi-agent system achieves superior performance, generating a +3.25% return with improved consistency compared to single-agent (+2.15%) and non-RAG multi-agent (+1.82%) variants. The RAG-augmented approach exhibits higher win rates (51.2% vs 48-49%) and more robust risk-adjusted returns, validating its effectiveness in producing interpretable and reliable trading strategies. These results underscore the transformative potential of knowledge-grounded multi-agent AI systems in modern quantitative finance, while highlighting the framework's adaptability across different market environments without dependency on specific asset classes or historical periods.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Refine Medical Diagnosis Using Generation Augmented Retrieval and Clinical Practice Guidelines.",
    "authors": [
      "Li W",
      "Zhang H",
      "Zhang H",
      "Li Z",
      "Dong Z",
      "Chen Y",
      "Bidargaddi N",
      "Liu H"
    ],
    "year": "2025",
    "journal": "IEEE journal of biomedical and health informatics",
    "doi": "10.1109/JBHI.2025.3641931",
    "pmid": "41364573",
    "abstract": "Current medical language models, adapted from large language models, typically predict ICD code-based diagnosis from electronic health records (EHRs) because these labels are readily available. However, ICD codes do not capture the nuanced, context-rich reasoning clinicians use for diagnosis. Clinicians synthesize diverse patient data and reference clinical practice guidelines (CPGs) to make evidence-based decisions. This misalignment limits the clinical utility of existing models. We introduce GARMLE-G, a Generation-Augmented Retrieval framework that grounds medical language model outputs in authoritative CPGs. Unlike conventional Retrieval-Augmented Generation based approaches, GARMLE-G enables hallucination-free outputs by directly retrieving authoritative guideline content without relying on model-generated text. It (1) integrates LLM predictions with EHR data to create semantically rich queries, (2) retrieves relevant CPG knowledge snippets via embedding similarity, and (3) fuses guideline content with model output to generate clinically aligned recommendations. A prototype system for hypertension and coronary heart disease diagnosis was developed and evaluated on multiple metrics, demonstrating superior retrieval precision, semantic relevance, and clinical guideline adherence compared to RAG-based baselines, while maintaining a lightweight architecture suitable for localized healthcare deployment. This work provides a scalable, low-cost, and hallucination-free method for grounding medical language models in evidence-based clinical practice, with strong potential for broader clinical deployment.",
    "source_database": "pubmed"
  },
  {
    "title": "Retrieval-augmented generation for interpreting clinical laboratory regulations using large language models.",
    "authors": [
      "Nanua S",
      "Steward R",
      "Neely B",
      "Datto M",
      "Youens K"
    ],
    "year": "2025",
    "journal": "Journal of pathology informatics",
    "doi": "10.1016/j.jpi.2025.100520",
    "pmid": "41244595",
    "abstract": "Large language models (LLMs) have demonstrated strong performance on general knowledge tasks, but they have important limitations as standalone tools for question answering in specialized domains where accuracy and consistency are critical. Retrieval-augmented generation (RAG) is a strategy in which LLM outputs are grounded in dynamically retrieved source documents, offering advantages in accuracy, explainability, and maintainability. We developed and evaluated a custom RAG system called Raven, designed to answer laboratory regulatory questions using the part of the Code of Federal Regulations (CFR) pertaining to laboratory (42 CFR Part 493) as an authoritative source. Raven employed a vector search pipeline and a LLM to generate grounded responses via a chatbot-style interface. The system was tested using 103 synthetic laboratory regulatory questions, 88 of which were explicitly addressed in the CFR. Compared to answers generated manually by a board-certified pathologist, Raven's responses were judged to be totally complete and correct in 92.0% of those 88 cases, with little irrelevant content and a low potential for regulatory or medical error. Performance declined significantly on questions not addressed in the CFR, confirming the system's grounding in the source documents. Most suboptimal responses were attributable to faulty source document retrieval rather than model hallucination or misinterpretation. These findings demonstrate that a basic RAG system can produce useful, accurate, and verifiable answers to complex regulatory questions. With appropriate safeguards and with thoughtful integration into user workflows, tools like Raven may serve as valuable decision-support systems in laboratory medicine and other knowledge-intensive healthcare domains.",
    "source_database": "pubmed"
  },
  {
    "title": "Retrieval-augmented generation elevates local LLM quality in radiology contrast media consultation.",
    "authors": [
      "Wada A",
      "Tanaka Y",
      "Nishizawa M",
      "Yamamoto A",
      "Akashi T",
      "Hagiwara A",
      "Hayakawa Y",
      "Kikuta J",
      "Shimoji K",
      "Sano K",
      "Kamagata K",
      "Nakanishi A",
      "Aoki S"
    ],
    "year": "2025",
    "journal": "NPJ digital medicine",
    "doi": "10.1038/s41746-025-01802-z",
    "pmid": "40604147",
    "abstract": "Large language models (LLMs) demonstrate significant potential in healthcare applications, but clinical deployment is limited by privacy concerns and insufficient medical domain training. This study investigated whether retrieval-augmented generation (RAG) can improve locally deployable LLM for radiology contrast media consultation. In 100 synthetic iodinated contrast media consultations we compared Llama 3.2-11B (baseline and RAG) with three cloud-based models-GPT-4o mini, Gemini 2.0 Flash and Claude 3.5 Haiku. A blinded radiologist ranked the five replies per case, and three LLM-based judges scored accuracy, safety, structure, tone, applicability and latency. Under controlled conditions, RAG eliminated hallucinations (0% vs 8%; \u03c7\u00b2\u208dYates\u208e = 6.38, p\u2009=\u20090.012) and improved mean rank by 1.3 (Z = -4.82, p\u2009<\u20090.001), though performance gaps with cloud models persist. The RAG-enhanced model remained faster (2.6\u2009s vs 4.9-7.3\u2009s) while the LLM-based judges preferred it over GPT-4o mini, though the radiologist ranked GPT-4o mini higher. RAG thus provides meaningful improvements for local clinical LLMs while maintaining the privacy benefits of on-premise deployment.",
    "source_database": "pubmed"
  },
  {
    "title": "Grounded by Experience: Generative Healthcare Prediction Augmented with Hierarchical Agentic Retrieval",
    "authors": [
      "Chuang Zhao",
      "Hui Tang",
      "Hongke Zhao",
      "Xiaofang Zhou",
      "Xiaomeng Li"
    ],
    "year": "2025",
    "journal": "arXiv:2511.13293v1",
    "doi": "",
    "abstract": "Accurate healthcare prediction is critical for improving patient outcomes and reducing operational costs. Bolstered by growing reasoning capabilities, large language models (LLMs) offer a promising path to enhance healthcare predictions by drawing on their rich parametric knowledge. However, LLMs are prone to factual inaccuracies due to limitations in the reliability and coverage of their embedded knowledge. While retrieval-augmented generation (RAG) frameworks, such as GraphRAG and its variants, have been proposed to mitigate these issues by incorporating external knowledge, they face two key challenges in the healthcare scenario: (1) identifying the clinical necessity to activate the retrieval mechanism, and (2) achieving synergy between the retriever and the generator to craft contextually appropriate retrievals. To address these challenges, we propose GHAR, a \\underline{g}enerative \\underline{h}ierarchical \\underline{a}gentic \\underline{R}AG framework that simultaneously resolves when to retrieve and how to optimize the collaboration between submodules in healthcare. Specifically, for the first challenge, we design a dual-agent architecture comprising Agent-Top and Agent-Low. Agent-Top acts as the primary physician, iteratively deciding whether to rely on parametric knowledge or to initiate retrieval, while Agent-Low acts as the consulting service, summarising all task-relevant knowledge once retrieval was triggered. To tackle the second challenge, we innovatively unify the optimization of both agents within a formal Markov Decision Process, designing diverse rewards to align their shared goal of accurate prediction while preserving their distinct roles. Extensive experiments on three benchmark datasets across three popular tasks demonstrate our superiority over state-of-the-art baselines, highlighting the potential of hierarchical agentic RAG in advancing healthcare systems.",
    "source_database": "arxiv",
    "arxiv_id": "2511.13293v1"
  },
  {
    "title": "Auto-GDA: Automatic Domain Adaptation for Efficient Grounding Verification in Retrieval Augmented Generation",
    "authors": [
      "Tobias Leemann",
      "Periklis Petridis",
      "Giuseppe Vietri",
      "Dionysis Manousakas",
      "Aaron Roth",
      "Serg\u00fcl Ayd\u00f6re"
    ],
    "year": "2024",
    "journal": "ArXiv",
    "doi": "10.48550/arXiv.2410.03461",
    "pmid": "",
    "abstract": "While retrieval-augmented generation (RAG) has been shown to enhance factuality of large language model (LLM) outputs, LLMs still suffer from hallucination, generating incorrect or irrelevant information. A common detection strategy involves prompting the LLM again to assess whether its response is grounded in the retrieved evidence, but this approach is costly. Alternatively, lightweight natural language inference (NLI) models for efficient grounding verification can be used at inference time. While existing pre-trained NLI models offer potential solutions, their performance remains subpar compared to larger models on realistic RAG inputs. RAG inputs are more complex than most datasets used for training NLI models and have characteristics specific to the underlying knowledge base, requiring adaptation of the NLI models to a specific target domain. Additionally, the lack of labeled instances in the target domain makes supervised domain adaptation, e.g., through fine-tuning, infeasible. To address these challenges, we introduce Automatic Generative Domain Adaptation (Auto-GDA). Our framework enables unsupervised domain adaptation through synthetic data generation. Unlike previous methods that rely on handcrafted filtering and augmentation strategies, Auto-GDA employs an iterative process to continuously improve the quality of generated samples using weak labels from less efficient teacher models and discrete optimization to select the most promising augmented samples. Experimental results demonstrate the effectiveness of our approach, with models fine-tuned on synthetic data using Auto-GDA often surpassing the performance of the teacher model and reaching the performance level of LLMs at 10% of their computational cost.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Faithfulness-Aware Uncertainty Quantification for Fact-Checking the Output of Retrieval Augmented Generation",
    "authors": [
      "Ekaterina Fadeeva",
      "Aleksandr Rubashevskii",
      "Dzianis Piatrashyn",
      "Roman Vashurin",
      "Shehzaad Dhuliawala",
      "Artem Shelmanov",
      "Timothy Baldwin",
      "Preslav Nakov",
      "Mrinmaya Sachan",
      "Maxim Panov"
    ],
    "year": "2025",
    "journal": "arXiv:2505.21072v3",
    "doi": "",
    "abstract": "Large Language Models (LLMs) enhanced with retrieval, an approach known as Retrieval-Augmented Generation (RAG), have achieved strong performance in open-domain question answering. However, RAG remains prone to hallucinations: factually incorrect outputs may arise from inaccuracies in the model's internal knowledge and the retrieved context. Existing approaches to mitigating hallucinations often conflate factuality with faithfulness to the retrieved evidence, incorrectly labeling factually correct statements as hallucinations if they are not explicitly supported by the retrieval. In this paper, we introduce FRANQ, a new method for hallucination detection in RAG outputs. FRANQ applies distinct uncertainty quantification (UQ) techniques to estimate factuality, conditioning on whether a statement is faithful to the retrieved context. To evaluate FRANQ and competing UQ methods, we construct a new long-form question answering dataset annotated for both factuality and faithfulness, combining automated labeling with manual validation of challenging cases. Extensive experiments across multiple datasets, tasks, and LLMs show that FRANQ achieves more accurate detection of factual errors in RAG-generated responses compared to existing approaches.",
    "source_database": "arxiv",
    "arxiv_id": "2505.21072v3"
  },
  {
    "title": "Enhancing medical AI with retrieval-augmented generation: A mini narrative review.",
    "authors": [
      "Gargari OK",
      "Habibi G"
    ],
    "year": "2025",
    "journal": "Digital health",
    "doi": "10.1177/20552076251337177",
    "pmid": "40343063",
    "abstract": "Retrieval-augmented generation (RAG) is a powerful technique in artificial intelligence (AI) and machine learning that enhances the capabilities of large language models (LLMs) by integrating external data sources, allowing for more accurate, contextually relevant responses. In medical applications, RAG has the potential to improve diagnostic accuracy, clinical decision support, and patient care. This narrative review explores the application of RAG across various medical domains, including guideline interpretation, diagnostic assistance, clinical trial eligibility screening, clinical information retrieval, and information extraction from scientific literature. Studies highlight the benefits of RAG in providing accurate, up-to-date information, improving clinical outcomes, and streamlining processes. Notable applications include GPT-4 models enhanced with RAG to interpret hepatologic guidelines, assist in differential diagnosis, and aid in clinical trial screening. Furthermore, RAG-based systems have demonstrated superior performance over traditional methods in tasks such as patient diagnosis, clinical decision-making, and medical information extraction. Despite its advantages, challenges remain, particularly in model evaluation, cost-efficiency, and reducing AI hallucinations. This review emphasizes the potential of RAG in advancing medical AI applications and advocates for further optimization of retrieval mechanisms, embedding models, and collaboration between AI researchers and healthcare professionals to maximize RAG's impact on medical practice.",
    "source_database": "pubmed"
  },
  {
    "title": "Leveraging long context in retrieval augmented language models for medical question answering.",
    "authors": [
      "Zhang G",
      "Xu Z",
      "Jin Q",
      "Chen F",
      "Fang Y",
      "Liu Y",
      "Rousseau JF",
      "Xu Z",
      "Lu Z",
      "Weng C",
      "Peng Y"
    ],
    "year": "2025",
    "journal": "NPJ digital medicine",
    "doi": "10.1038/s41746-025-01651-w",
    "pmid": "40316710",
    "abstract": "While holding great promise for improving and facilitating healthcare through applications of medical literature summarization, large language models (LLMs) struggle to produce up-to-date responses on evolving topics due to outdated knowledge or hallucination. Retrieval-augmented generation (RAG) is a pivotal innovation that improves the accuracy and relevance of LLM responses by integrating LLMs with a search engine and external sources of knowledge. However, the quality of RAG responses can be largely impacted by the rank and density of key information in the retrieval results, such as the \"lost-in-the-middle\" problem. In this work, we aim to improve the robustness and reliability of the RAG workflow in the medical domain. Specifically, we propose a map-reduce strategy, BriefContext, to combat the \"lost-in-the-middle\" issue without modifying the model weights. We demonstrated the advantage of the workflow with various LLM backbones and on multiple QA datasets. This method promises to improve the safety and reliability of LLMs deployed in healthcare domains by reducing the risk of misinformation, ensuring critical clinical content is retained in generated responses, and enabling more trustworthy use of LLMs in critical tasks such as medical question answering, clinical decision support, and patient-facing applications.",
    "source_database": "pubmed"
  },
  {
    "title": "Evidence-based artificial intelligence: Implementing retrieval-augmented generation models to enhance clinical decision support in plastic surgery.",
    "authors": [
      "Ozmen BB",
      "Mathur P"
    ],
    "year": "2025",
    "journal": "Journal of plastic, reconstructive & aesthetic surgery : JPRAS",
    "doi": "10.1016/j.bjps.2025.03.053",
    "pmid": "40174259",
    "abstract": "The rapid advancement of large language models (LLMs) has generated significant enthusiasm within healthcare, especially in supporting clinical decision-making and patient management. However, inherent limitations including hallucinations, outdated clinical context, and unreliable references pose serious concerns for their clinical utility. Retrieval-Augmented Generation (RAG) models address these limitations by integrating validated, curated medical literature directly into AI workflows, significantly enhancing the accuracy, relevance, and transparency of generated outputs. This viewpoint discusses how RAG frameworks can specifically benefit plastic and reconstructive surgery by providing contextually accurate, evidence-based, and clinically grounded support for decision-making. Potential clinical applications include clinical decision support, efficient evidence synthesis, customizable patient education, informed consent materials, multilingual capabilities, and structured surgical documentation. By querying specialized databases that incorporate contemporary guidelines and literature, RAG models can markedly reduce inaccuracies and increase the reliability of AI-generated responses. However, the implementation of RAG technology demands rigorous database curation, regular updating with guidelines from surgical societies, and ongoing validation to maintain clinical relevance. Addressing challenges related to data privacy, governance, ethical considerations, and user training remains critical for successful clinical adoption. In conclusion, RAG models represent a significant advancement in overcoming traditional LLM limitations, promoting transparency and clinical accuracy with great potential for plastic surgery. Plastic surgeons and researchers are encouraged to explore and integrate these innovative generative AI frameworks to enhance patient care, surgical outcomes, communication, documentation quality, and education.",
    "source_database": "pubmed"
  },
  {
    "title": "Utilizing large language models for gastroenterology research: a conceptual framework.",
    "authors": [
      "Berry P",
      "Dhanakshirur RR",
      "Khanna S"
    ],
    "year": "2025",
    "journal": "Therapeutic advances in gastroenterology",
    "doi": "10.1177/17562848251328577",
    "pmid": "40171241",
    "abstract": "Large language models (LLMs) transform healthcare by assisting clinicians with decision-making, research, and patient management. In gastroenterology, LLMs have shown potential in clinical decision support, data extraction, and patient education. However, challenges such as bias, hallucinations, integration with clinical workflows, and regulatory compliance must be addressed for safe and effective implementation. This manuscript presents a structured framework for integrating LLMs into gastroenterology, using Hepatitis C treatment as a real-world application. The framework outlines key steps to ensure accuracy, safety, and clinical relevance while mitigating risks associated with artificial intelligence (AI)-driven healthcare tools. The framework includes defining clinical goals, assembling a multidisciplinary team, data collection and preparation, model selection, fine-tuning, calibration, hallucination mitigation, user interface development, integration with electronic health records, real-world validation, and continuous improvement. Retrieval-augmented generation and fine-tuning approaches are evaluated for optimizing model adaptability. Bias detection, reinforcement learning from human feedback, and structured prompt engineering are incorporated to enhance reliability. Ethical and regulatory considerations, including the Health Insurance Portability and Accountability Act, General Data Protection Regulation, and AI-specific guidelines (DECIDE-AI, SPIRIT-AI, CONSORT-AI), are addressed to ensure responsible AI deployment. LLMs have the potential to enhance decision-making, research efficiency, and patient care in gastroenterology, but responsible deployment requires bias mitigation, transparency, and ongoing validation. Future research should focus on multi-institutional validation and AI-assisted clinical trials to establish LLMs as reliable tools in gastroenterology.",
    "source_database": "pubmed"
  },
  {
    "title": "Open-Source Retrieval Augmented Generation Framework for Retrieving Accurate Medication Insights from Formularies for African Healthcare Workers",
    "authors": [
      "Axum AI",
      " :",
      "J. Owoyemi",
      "S. Abubakar",
      "A. Owoyemi",
      "T. O. Togunwa",
      "F. C. Madubuko",
      "S. Oyatoye",
      "Z. Oyetolu",
      "K. Akyea",
      "A. O. Mohammed",
      "A. Adebakin"
    ],
    "year": "2025",
    "journal": "arXiv:2502.15722v1",
    "doi": "",
    "abstract": "Accessing accurate medication insights is vital for enhancing patient safety, minimizing errors, and supporting clinical decision-making. However, healthcare professionals in Africa often rely on manual and time-consuming processes to retrieve drug information, exacerbated by limited access to pharmacists due to brain drain and healthcare disparities. This paper presents \"Drug Insights,\" an open-source Retrieval-Augmented Generation (RAG) chatbot designed to streamline medication lookup for healthcare workers in Africa. By leveraging a corpus of Nigerian pharmaceutical data and advanced AI technologies, including Pinecone databases and GPT models, the system delivers accurate, context-specific responses with minimal hallucination. The chatbot integrates prompt engineering and S-BERT evaluation to optimize retrieval and response generation. Preliminary tests, including pharmacist feedback, affirm the tool's potential to improve drug information access while highlighting areas for enhancement, such as UI/UX refinement and extended corpus integration.",
    "source_database": "arxiv",
    "arxiv_id": "2502.15722v1"
  },
  {
    "title": "Chunking, Retrieval, and Re-ranking: An Empirical Evaluation of RAG Architectures for Policy Document Question Answering",
    "authors": [
      "A. Maharjan",
      "Umesh Yadav"
    ],
    "year": "2026",
    "journal": "ArXiv",
    "doi": "10.48550/arXiv.2601.15457",
    "pmid": "",
    "abstract": "The integration of Large Language Models (LLMs) into the public health policy sector offers a transformative approach to navigating the vast repositories of regulatory guidance maintained by agencies such as the Centers for Disease Control and Prevention (CDC). However, the propensity for LLMs to generate hallucinations, defined as plausible but factually incorrect assertions, presents a critical barrier to the adoption of these technologies in high-stakes environments where information integrity is non-negotiable. This empirical evaluation explores the effectiveness of Retrieval-Augmented Generation (RAG) architectures in mitigating these risks by grounding generative outputs in authoritative document context. Specifically, this study compares a baseline Vanilla LLM against Basic RAG and Advanced RAG pipelines utilizing cross-encoder re-ranking. The experimental framework employs a Mistral-7B-Instruct-v0.2 model and an all-MiniLM-L6-v2 embedding model to process a corpus of official CDC policy analytical frameworks and guidance documents. The analysis measures the impact of two distinct chunking strategies, recursive character-based and token-based semantic splitting, on system accuracy, measured through faithfulness and relevance scores across a curated set of complex policy scenarios. Quantitative findings indicate that while Basic RAG architectures provide a substantial improvement in faithfulness (0.621) over Vanilla baselines (0.347), the Advanced RAG configuration achieves a superior faithfulness average of 0.797. These results demonstrate that two-stage retrieval mechanisms are essential for achieving the precision required for domain-specific policy question answering, though structural constraints in document segmentation remain a significant bottleneck for multi-step reasoning tasks.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Medical Graph-RAG: Bilingual Graph-Based Reasoning for Cardiological Intelligence",
    "authors": [
      "Leen I. A. Shaqalaih",
      "Omar Belal",
      "Fatma K\u00fc\u00e7\u00fck",
      "Yi\u0121it Tuncer",
      "M. Ganiz"
    ],
    "year": "2025",
    "journal": "2025 International Conference on INnovations in Intelligent SysTems and Applications (INISTA)",
    "doi": "10.1109/INISTA68122.2025.11249583",
    "pmid": "",
    "abstract": "Clinical question answering requires factual accuracy, transparency, and evidence that is traceable. Despite recent progress, Large Language Models (LLMs) still hallucinate and struggle with specialized terminology. Retrieval-Augmented Generation (RAG) mitigates this by grounding answers in external sources, but conventional RAG neglects the rich relational structure of medical knowledge. Building on MedGraphRAG, an evidence-focused, graph-based RAG framework, we present the first bilingual (English-Turkish) adaptation. Our study differs from the original in three key ways. First, we substitute GPT-4o-mini for GPT-4 as the generator. Second, instead of the MedC-K repository used in the original work, instead curated a much smaller set of cardiology textbooks and open-access articles. Third, we additionally evaluate a medical specialist model: MedGemma. We further extend the system by translating MIMIC-IV clinical notes and a subset of UMLS concepts into Turkish, enabling Turkish medical graph construction and retrieval. Experiments show that in English, MedGraphRAG retains strong gains over baseline. In Turkish, performance degrades relative to English. We observe that the largest driver of performance may be due to the size/coverage of the upper repository (medical reports). MedGemma-27B underperforms GPT-4o-mini in our setup on text-only MCQs. Overall, results highlight the sensitivity of MedGraphRAG to the breadth of curated medical sources and provide the first systematic assessment for Turkish.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Harnessing Large Language Models in Neonatal Intraventricular Hemorrhage: Exploring Retrieval Augmented Generation Methodology for Prognostic Variable Discovery.",
    "authors": [
      "Arora T",
      "Beam K"
    ],
    "year": "2026",
    "journal": "American journal of perinatology",
    "doi": "10.1055/a-2838-5446",
    "pmid": "41871598",
    "abstract": "The objective of this study is to evaluate whether large language models (LLMs) can autonomously synthesize existing literature and accurately extract prognostic variables for neonatal intraventricular hemorrhage (IVH) and its outcomes while assessing their capability for clinical feature ranking and risk stratification.This pilot study employed a systematic literature review combined with retrieval-augmented generation (RAG) methodology. GPT 4 (OpenAI) and Claude Sonnet (4.0, Anthropic) were prompted to identify peer-reviewed studies utilizing machine learning and deep learning to predict IVH outcomes in preterm neonates. Data extraction was prompted to follow TRIPOD artificial intelligence (AI) guidelines, capturing study design, population characteristics, predictor variables, and outcome measures. Semi-automated RAG extraction was performed with manual validation to mitigate hallucination risk.LLMs initially identified 39 studies, with 28 meeting some or all the validation criteria after excluding references that were hallucinated. From these, 14 distinct prognostic predictors were extracted across four outcome domains: mortality, progression, complications, and resolution. Universal high-impact predictors included gestational age (13 mentions; 41%), birth weight (8 mentions, 25%), and Apgar scores (11 mentions, 34%). Variables were categorized into three clinical tiers based on frequency, outcome breadth, and modifiability. A preliminary risk stratification model demonstrated high-risk neonates (<28 weeks, <1,000g, Apgar <3) with estimated progression risk >70%, and mortality >50%, while low-risk neonates (>32 weeks, >1,500\u2009g, Apgar\u2009>\u20095) showed favorable trajectories.This study demonstrates that LLMs can synthesize medical literature and extract clinically relevant prognostic variables for neonatal IVH outcomes. 
However, LLM outputs were susceptible to hallucinations and incomplete data synthesis, underscoring the need for rigorous clinical oversight and human validation to ensure reliability. The identified universal predictors provide a foundation for developing AI-assisted clinical decision support tools. Notable research gaps include the complete absence of resolution prediction studies and limited investigation of complication predictors, highlighting opportunities for future investigation in precision neonatology. \u00b7 LLMs can synthesize medical literature. \u00b7 LLMs can assist in creating a prognostic ontology. \u00b7 Human oversight is critical when using LLMs for healthcare.",
    "source_database": "pubmed"
  },
  {
    "title": "Development and Evaluation of a Retrieval-Augmented Generation Chatbot for Orthopedic and Trauma Surgery Patient Education: Mixed-Methods Study.",
    "authors": [
      "Baur D",
      "Ansorg J",
      "Heyde CE",
      "Voelker A"
    ],
    "year": "2025",
    "journal": "JMIR AI",
    "doi": "10.2196/75262",
    "pmid": "41134117",
    "abstract": "Large language models are increasingly applied in health care for documentation, patient education, and clinical decision support. However, their factual reliability can be compromised by hallucinations and a lack of source traceability. Retrieval-augmented generation (RAG) enhances response accuracy by combining generative models with document retrieval mechanisms. While promising in medical contexts, RAG-based systems remain underexplored in orthopedic and trauma surgery patient education, particularly in non-English settings.",
    "source_database": "pubmed"
  },
  {
    "title": "MEGA-RAG: a retrieval-augmented generation framework with multi-evidence guided answer refinement for mitigating hallucinations of LLMs in public health.",
    "authors": [
      "Xu S",
      "Yan Z",
      "Dai C",
      "Wu F"
    ],
    "year": "2025",
    "journal": "Frontiers in public health",
    "doi": "10.3389/fpubh.2025.1635381",
    "pmid": "41132171",
    "abstract": "The increasing adoption of large language models (LLMs) in public health has raised significant concerns about hallucinations-factually inaccurate or misleading outputs that can compromise clinical communication and policy decisions.",
    "source_database": "pubmed"
  },
  {
    "title": "Contradictions in Context: Challenges for Retrieval-Augmented Generation in Healthcare",
    "authors": [
      "Saeedeh Javadi",
      "Sara Mirabi",
      "Manan Gangar",
      "Bahadorreza Ofoghi"
    ],
    "year": "2025",
    "journal": "arXiv:2511.06668v2",
    "doi": "",
    "abstract": "In high-stakes information domains such as healthcare, where large language models (LLMs) can produce hallucinations or misinformation, retrieval-augmented generation (RAG) has been proposed as a mitigation strategy, grounding model outputs in external, domain-specific documents. Yet, this approach can introduce errors when source documents contain outdated or contradictory information. This work investigates the performance of five LLMs in generating RAG-based responses to medicine-related queries. Our contributions are three-fold: i) the creation of a benchmark dataset using consumer medicine information documents from the Australian Therapeutic Goods Administration (TGA), where headings are repurposed as natural language questions, ii) the retrieval of PubMed abstracts using TGA headings, stratified across multiple publication years, to enable controlled temporal evaluation of outdated evidence, and iii) a comparative analysis of the frequency and impact of outdated or contradictory content on model-generated responses, assessing how LLMs integrate and reconcile temporally inconsistent information. Our findings show that contradictions between highly similar abstracts do, in fact, degrade performance, leading to inconsistencies and reduced factual accuracy in model answers. These results highlight that retrieval similarity alone is insufficient for reliable medical RAG and underscore the need for contradiction-aware filtering strategies to ensure trustworthy responses in high-stakes domains.",
    "source_database": "arxiv",
    "arxiv_id": "2511.06668v2"
  },
  {
    "title": "MedTrust-RAG: Evidence Verification and Trust Alignment for Biomedical Question Answering",
    "authors": [
      "Yingpeng Ning",
      "Yuanyuan Sun",
      "Ling Luo",
      "Yanhua Wang",
      "Yuchen Pan",
      "Hongfei Lin"
    ],
    "year": "2025",
    "journal": "2025 IEEE International Conference on Bioinformatics and Biomedicine (BIBM)",
    "doi": "10.1109/BIBM66473.2025.11356290",
    "pmid": "",
    "abstract": "Biomedical question answering (QA) requires precise interpretation of complex medical knowledge. Large language models (LLMs) and retrieval-augmented generation (RAG) leverage external medical literature but often produce hallucinations due to noisy retrieval and insufficient verification. We propose MedTrust-Guided Iterative RAG, a framework that improves factual consistency and reduces hallucinations in medical QA. It introduces three innovations. First, citation-aware reasoning grounds generation in retrieved documents and uses Negative Knowledge Assertions when evidence is missing. Second, an iterative retrieval-verification process refines queries through Medical Gap Analysis. Third, the MedTrust-Align Module (MTAM) applies Direct Preference Optimization to align generation with verified evidence and suppress hallucination-prone patterns.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "DeepSeek-Med-8B: Medical LLM for Chinese Diagnosis and Referral",
    "authors": [
      "Chenxing Li",
      "J. Mao",
      "Bin Liu",
      "Weiwei Luo"
    ],
    "year": "2025",
    "journal": "2025 8th International Conference on Computer Information Science and Application Technology (CISAT)",
    "doi": "10.1109/CISAT66811.2025.11181817",
    "pmid": "",
    "abstract": "The uneven distribution of medical resources in China poses significant challenges, especially in rural areas. While large language models (LLMs) offer potential for clinical support, existing systems like GPT-4 and Med-PaLM suffer from hallucinations, English-centric biases, and lack real-time physician integration. We present DeepSeek-Med-8B, a Chinese medical conversational agent based on the DeepSeek-R1-DistillLlama-8B architecture.DeepSeek-Med-8B is trained through: (i) Supervised Fine-Tuning (SFT) on curated Chinese medical corpora; (ii) Reinforcement Learning with AI and Doctor Feedback (RLAIF) for factuality, empathy, and referral quality; and (iii) Retrieval-Augmented Generation (RAG) for real-time grounding in physician databases.Across eight clinical tasks, DeepSeek-Med-8B achieves a top1 mean score of 66.9 on GPT-4o-based benchmarks and a 74% top-3 doctor match rate, outperforming rule-based baselines. The model runs efficiently on a single RTX 4090 GPU via INT8 quantization.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "FAIR-RAG: Faithful Adaptive Iterative Refinement for Retrieval-Augmented Generation",
    "authors": [
      "Mohammad Aghajani Asl",
      "Majid Asgari-Bidhendi",
      "Behrooz Minaei-Bidgoli"
    ],
    "year": "2025",
    "journal": "arXiv:2510.22344v1",
    "doi": "",
    "abstract": "While Retrieval-Augmented Generation (RAG) mitigates hallucination and knowledge staleness in Large Language Models (LLMs), existing frameworks often falter on complex, multi-hop queries that require synthesizing information from disparate sources. Current advanced RAG methods, employing iterative or adaptive strategies, lack a robust mechanism to systematically identify and fill evidence gaps, often propagating noise or failing to gather a comprehensive context. We introduce FAIR-RAG, a novel agentic framework that transforms the standard RAG pipeline into a dynamic, evidence-driven reasoning process. At its core is an Iterative Refinement Cycle governed by a module we term Structured Evidence Assessment (SEA). The SEA acts as an analytical gating mechanism: it deconstructs the initial query into a checklist of required findings and audits the aggregated evidence to identify confirmed facts and, critically, explicit informational gaps. These gaps provide a precise signal to an Adaptive Query Refinement agent, which generates new, targeted sub-queries to retrieve missing information. This cycle repeats until the evidence is verified as sufficient, ensuring a comprehensive context for a final, strictly faithful generation. We conducted experiments on challenging multi-hop QA benchmarks, including HotpotQA, 2WikiMultiHopQA, and MusiQue. In a unified experimental setup, FAIR-RAG significantly outperforms strong baselines. On HotpotQA, it achieves an F1-score of 0.453 -- an absolute improvement of 8.3 points over the strongest iterative baseline -- establishing a new state-of-the-art for this class of methods on these benchmarks. Our work demonstrates that a structured, evidence-driven refinement process with explicit gap analysis is crucial for unlocking reliable and accurate reasoning in advanced RAG systems for complex, knowledge-intensive tasks.",
    "source_database": "arxiv",
    "arxiv_id": "2510.22344v1"
  },
  {
    "title": "SimulRAG: Simulator-based RAG for Grounding LLMs in Long-form Scientific QA",
    "authors": [
      "Haozhou Xu",
      "Dongxia Wu",
      "Matteo Chinazzi",
      "Ruijia Niu",
      "Rose Yu",
      "Yi-An Ma"
    ],
    "year": "2025",
    "journal": "arXiv:2509.25459v1",
    "doi": "",
    "abstract": "Large language models (LLMs) show promise in solving scientific problems. They can help generate long-form answers for scientific questions, which are crucial for comprehensive understanding of complex phenomena that require detailed explanations spanning multiple interconnected concepts and evidence. However, LLMs often suffer from hallucination, especially in the challenging task of long-form scientific question answering. Retrieval-Augmented Generation (RAG) approaches can ground LLMs by incorporating external knowledge sources to improve trustworthiness. In this context, scientific simulators, which play a vital role in validating hypotheses, offer a particularly promising retrieval source to mitigate hallucination and enhance answer factuality. However, existing RAG approaches cannot be directly applied for scientific simulation-based retrieval due to two fundamental challenges: how to retrieve from scientific simulators, and how to efficiently verify and update long-form answers. To overcome these challenges, we propose the simulator-based RAG framework (SimulRAG) and provide a long-form scientific QA benchmark covering climate science and epidemiology with ground truth verified by both simulations and human annotators. In this framework, we propose a generalized simulator retrieval interface to transform between textual and numerical modalities. We further design a claim-level generation method that utilizes uncertainty estimation scores and simulator boundary assessment (UE+SBA) to efficiently verify and update claims. Extensive experiments demonstrate SimulRAG outperforms traditional RAG baselines by 30.4% in informativeness and 16.3% in factuality. UE+SBA further improves efficiency and quality for claim-level generation.",
    "source_database": "arxiv",
    "arxiv_id": "2509.25459v1"
  },
  {
    "title": "Current state of LLM Risks and AI Guardrails",
    "authors": [
      "Suriya Ganesh Ayyamperumal",
      "Limin Ge"
    ],
    "year": "2024",
    "journal": "arXiv:2406.12934v1",
    "doi": "",
    "abstract": "Large language models (LLMs) have become increasingly sophisticated, leading to widespread deployment in sensitive applications where safety and reliability are paramount. However, LLMs have inherent risks accompanying them, including bias, potential for unsafe actions, dataset poisoning, lack of explainability, hallucinations, and non-reproducibility. These risks necessitate the development of \"guardrails\" to align LLMs with desired behaviors and mitigate potential harm.   This work explores the risks associated with deploying LLMs and evaluates current approaches to implementing guardrails and model alignment techniques. We examine intrinsic and extrinsic bias evaluation methods and discuss the importance of fairness metrics for responsible AI development. The safety and reliability of agentic LLMs (those capable of real-world actions) are explored, emphasizing the need for testability, fail-safes, and situational awareness.   Technical strategies for securing LLMs are presented, including a layered protection model operating at external, secondary, and internal levels. System prompts, Retrieval-Augmented Generation (RAG) architectures, and techniques to minimize bias and protect privacy are highlighted.   Effective guardrail design requires a deep understanding of the LLM's intended use case, relevant regulations, and ethical considerations. Striking a balance between competing requirements, such as accuracy and privacy, remains an ongoing challenge. This work underscores the importance of continuous research and development to ensure the safe and responsible use of LLMs in real-world applications.",
    "source_database": "arxiv",
    "arxiv_id": "2406.12934v1"
  },
  {
    "title": "SimulRAG: Simulator-based RAG for Grounding LLMs in Long-form Scientific QA",
    "authors": [
      "Haozhou Xu",
      "D. Wu",
      "M. Chinazzi",
      "Ruijia Niu",
      "Rose Yu",
      "Yi-An Ma"
    ],
    "year": "2025",
    "journal": "ArXiv",
    "doi": "10.48550/arXiv.2509.25459",
    "pmid": "",
    "abstract": "Large language models (LLMs) show promise in solving scientific problems. They can help generate long-form answers for scientific questions, which are crucial for comprehensive understanding of complex phenomena that require detailed explanations spanning multiple interconnected concepts and evidence. However, LLMs often suffer from hallucination, especially in the challenging task of long-form scientific question answering. Retrieval-Augmented Generation (RAG) approaches can ground LLMs by incorporating external knowledge sources to improve trustworthiness. In this context, scientific simulators, which play a vital role in validating hypotheses, offer a particularly promising retrieval source to mitigate hallucination and enhance answer factuality. However, existing RAG approaches cannot be directly applied for scientific simulation-based retrieval due to two fundamental challenges: how to retrieve from scientific simulators, and how to efficiently verify and update long-form answers. To overcome these challenges, we propose the simulator-based RAG framework (SimulRAG) and provide a long-form scientific QA benchmark covering climate science and epidemiology with ground truth verified by both simulations and human annotators. In this framework, we propose a generalized simulator retrieval interface to transform between textual and numerical modalities. We further design a claim-level generation method that utilizes uncertainty estimation scores and simulator boundary assessment (UE+SBA) to efficiently verify and update claims. Extensive experiments demonstrate SimulRAG outperforms traditional RAG baselines by 30.4% in informativeness and 16.3% in factuality. UE+SBA further improves efficiency and quality for claim-level generation.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "HYPER-RAG: Evaluating Hyperparameter Trade-Offs in Biomedical Retrieval-Augmented Generation",
    "authors": [
      "Ankush Sil Sarma",
      "Pawan Kumar Singh"
    ],
    "year": "2025",
    "journal": "2025 IEEE Pune Section International Conference (PuneCon)",
    "doi": "10.1109/PuneCon67554.2025.11377827",
    "pmid": "",
    "abstract": "Retrieval-Augmented Generation (RAG) improves the factual accuracy of large language models by combining document retrieval with text generation. In biomedical question answering, where correctness is critical, the effect of key hyperparameters has not been studied in a systematic way. This paper presents an evaluation of RAG on the COVID-QA dataset with a focus on three retrievers (dense, BM25, hybrid), two retrieval depths (top- $\\mathrm{k}=1,3$), and optional reranking with a cross encoder. We use a single biomedical prompt and measure exact match (EM), F1 score, semantic similarity, groundedness, and latency. We also report a composite score that balances lexical accuracy, semantic similarity, and efficiency. Our results on a $\\mathbf{1 0 0}$-question subset show that reranking improves grounding at the cost of extra latency, and that increasing top-k improves recall but gives smaller gains after a point. The study highlights that multiple metrics are needed to judge biomedical RAG systems reliably and that careful tuning of retrieval and reranking settings can yield practical improvements under compute constraints.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "ModernBERT + ColBERT: Enhancing biomedical RAG through an advanced re-ranking retriever",
    "authors": [
      "Eduardo Mart'inez Rivera",
      "F. Menolascina"
    ],
    "year": "2025",
    "journal": "ArXiv",
    "doi": "10.48550/arXiv.2510.04757",
    "pmid": "",
    "abstract": "Retrieval-Augmented Generation (RAG) is a powerful technique for enriching Large Language Models (LLMs) with external knowledge, allowing for factually grounded responses, a critical requirement in high-stakes domains such as healthcare. However, the efficacy of RAG systems is fundamentally restricted by the performance of their retrieval module, since irrelevant or semantically misaligned documents directly compromise the accuracy of the final generated response. General-purpose dense retrievers can struggle with the nuanced language of specialised domains, while the high accuracy of in-domain models is often achieved at prohibitive computational costs. In this work, we aim to address this trade-off by developing and evaluating a two-stage retrieval architecture that combines a lightweight ModernBERT bidirectional encoder for efficient initial candidate retrieval with a ColBERTv2 late-interaction model for fine-grained re-ranking. We conduct comprehensive evaluations of our retriever module performance and RAG system performance in the biomedical context, fine-tuning the IR module using 10k question-passage pairs from PubMedQA. Our analysis of the retriever module confirmed the positive impact of the ColBERT re-ranker, which improved Recall@3 by up to 4.2 percentage points compared to its retrieve-only counterpart. When integrated into the biomedical RAG, our IR module leads to a state-of-the-art average accuracy of 0.4448 on the five tasks of the MIRAGE question-answering benchmark, outperforming strong baselines such as MedCPT (0.4436). Our ablation studies reveal that this performance is critically dependent on a joint fine-tuning process that aligns the retriever and re-ranker; otherwise, the re-ranker might degrade the performance.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Mitigating Artificial Intelligence Hallucinations in Education: A Comparative Study of Retrieval-Augmented Generation (RAG) and Large Language Models",
    "authors": [
      "Pei-hua Chen",
      "Yuen-Min Huang",
      "Ting\u2010Ting Wu",
      "Hsin\u2010Yu Lee"
    ],
    "year": "2025",
    "journal": "2025 7th International Conference on Modern Educational Technology (ICMET)",
    "doi": "10.1109/ICMET67594.2025.11451842",
    "pmid": "",
    "abstract": "The integration of Large Language Models (LLMs) into educational technologies promises to revolutionize personalized learning. However, their propensity for \u201challucination,\u201d the generation of factually incorrect or nonsensical information, poses a significant risk to knowledge integrity and student trust. To address this critical challenge, this paper investigates Retrieval-Augmented Generation (RAG), an architectural approach that mitigates hallucinations by grounding the model's responses in factual data. Before generating an answer, the RAG framework retrieves relevant information from a verified knowledge base, ensuring the output is contextually accurate. We conducted a comparative study between a standard LLM and a RAG-powered system whose knowledge base was populated with specific curriculum materials. The generated answers to domain-specific questions were assessed through two distinct lenses: a quantitative evaluation of factual accuracy by subject matter experts and a qualitative analysis of student perceptions gathered through semi-structured interviews. Our preliminary findings indicate that the RAG model demonstrates a marked improvement in factual accuracy and a significant reduction in hallucinatory content. Furthermore, students perceived the RAG-generated responses as more trustworthy and useful for their learning. This research provides empirical evidence for the necessity of adopting RAG in educational AI, offering a pathway toward developing more reliable and effective digital learning tools.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Optimizing Medical Question-Answering Systems: A Comparative Study of Fine-Tuned and Zero-Shot Large Language Models with RAG Framework",
    "authors": [
      "Tasnimul Hassan",
      "Md. Karim",
      "Haziq Jeelani",
      "Elham Behnam",
      "R. Green",
      "F. Syed"
    ],
    "year": "2025",
    "journal": "ArXiv",
    "doi": "10.48550/arXiv.2512.05863",
    "pmid": "",
    "abstract": "Medical question-answering (QA) systems can benefit from advances in large language models (LLMs), but directly applying LLMs to the clinical domain poses challenges such as maintaining factual accuracy and avoiding hallucinations. In this paper, we present a retrieval-augmented generation (RAG) based medical QA system that combines domain-specific knowledge retrieval with open-source LLMs to answer medical questions. We fine-tune two state-of-the-art open LLMs (LLaMA~2 and Falcon) using Low-Rank Adaptation (LoRA) for efficient domain specialization. The system retrieves relevant medical literature to ground the LLM's answers, thereby improving factual correctness and reducing hallucinations. We evaluate the approach on benchmark datasets (PubMedQA and MedMCQA) and show that retrieval augmentation yields measurable improvements in answer accuracy compared to using LLMs alone. Our fine-tuned LLaMA~2 model achieves 71.8% accuracy on PubMedQA, substantially improving over the 55.4% zero-shot baseline, while maintaining transparency by providing source references. We also detail the system design and fine-tuning methodology, demonstrating that grounding answers in retrieved evidence reduces unsupported content by approximately 60%. These results highlight the potential of RAG-augmented open-source LLMs for reliable biomedical QA, pointing toward practical clinical informatics applications.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Probabilistic distances-based hallucination detection in LLMs with RAG",
    "authors": [
      "Rodion Oblovatny",
      "Alexandra Kuleshova",
      "Konstantin Polev",
      "Alexey Zaytsev"
    ],
    "year": "2025",
    "journal": "arXiv:2506.09886v2",
    "doi": "",
    "abstract": "Detecting hallucinations in large language models (LLMs) is critical for their safety in many applications. Without proper detection, these systems often provide harmful, unreliable answers. In recent years, LLMs have been actively used in retrieval-augmented generation (RAG) settings. However, hallucinations remain even in this setting, and while numerous hallucination detection methods have been proposed, most approaches are not specifically designed for RAG systems. To overcome this limitation, we introduce a hallucination detection method based on estimating the distances between the distributions of prompt token embeddings and language model response token embeddings. The method examines the geometric structure of token hidden states to reliably extract a signal of factuality in text, while remaining friendly to long sequences. Extensive experiments demonstrate that our method achieves state-of-the-art or competitive performance. It also has transferability from solving the NLI task to the hallucination detection task, making it a fully unsupervised and efficient method with a competitive performance on the final task.",
    "source_database": "arxiv",
    "arxiv_id": "2506.09886v2"
  },
  {
    "title": "Reducing Hallucinations of Medical Multimodal Large Language Models with Visual Retrieval-Augmented Generation",
    "authors": [
      "Yun-Wei Chu",
      "Kai Zhang",
      "Christopher Malon",
      "Martin Renqiang Min"
    ],
    "year": "2025",
    "journal": "arXiv:2502.15040v1",
    "doi": "",
    "abstract": "Multimodal Large Language Models (MLLMs) have shown impressive performance in vision and text tasks. However, hallucination remains a major challenge, especially in fields like healthcare where details are critical. In this work, we show how MLLMs may be enhanced to support Visual RAG (V-RAG), a retrieval-augmented generation framework that incorporates both text and visual data from retrieved images. On the MIMIC-CXR chest X-ray report generation and Multicare medical image caption generation datasets, we show that Visual RAG improves the accuracy of entity probing, which asks whether a medical entities is grounded by an image. We show that the improvements extend both to frequent and rare entities, the latter of which may have less positive training data. Downstream, we apply V-RAG with entity probing to correct hallucinations and generate more clinically accurate X-ray reports, obtaining a higher RadGraph-F1 score.",
    "source_database": "arxiv",
    "arxiv_id": "2502.15040v1"
  },
  {
    "title": "Fast and Faithful: Real-Time Verification for Long-Document Retrieval-Augmented Generation Systems",
    "authors": [
      "Xunzhuo Liu",
      "Bowei He",
      "Xue Liu",
      "Haichen Zhang",
      "Huamin Chen"
    ],
    "year": "2026",
    "journal": "arXiv:2603.23508v1",
    "doi": "",
    "abstract": "Retrieval-augmented generation (RAG) is increasingly deployed in enterprise search and document-centric assistants, where responses must be grounded in long and complex source materials. In practice, verifying that generated answers faithfully reflect retrieved documents is difficult: large language models can check long contexts but are too slow and costly for interactive services, while lightweight classifiers operate within strict context limits and frequently miss evidence outside truncated passages. We present the design of a real-time verification component integrated into a production RAG pipeline that enables full-document grounding under latency constraints. The system processes documents up to 32K tokens and employs adaptive inference strategies to balance response time and verification coverage across workloads. We describe the architectural decisions, operational trade-offs, and evaluation methodology used to deploy the verifier, and show that full-context verification substantially improves detection of unsupported responses compared with truncated validation. Our experience highlights when long-context verification is necessary, why chunk-based checking often fails in real documents, and how latency budgets shape model design. These findings provide practical guidance for practitioners building reliable large-scale retrieval-augmented applications. (Model, benchmark, and code: https://huggingface.co/llm-semantic-router)",
    "source_database": "arxiv",
    "arxiv_id": "2603.23508v1"
  },
  {
    "title": "Enhancing Truth with AI: Evaluating ML, LLMs, and RAG in Combating Misinformation",
    "authors": [
      "Manisha",
      "Manisha Jailia"
    ],
    "year": "2025",
    "journal": "2025 2nd International Conference on Advanced Computing and Emerging Technologies (ACET)",
    "doi": "10.1109/ACET67282.2025.11430189",
    "pmid": "",
    "abstract": "In the digital age, misinformation is a new danger. It impacts how people talk about things, harms democracy, and changes how people think about the government. This study answers the important demand for better ways to find false information by comparing classic ML models with the newest LLMs and then creating a hybrid framework that contains the best parts of both. Unfortunately, classic classifiers like Naive Bayes and the Passive-Aggressive Classifier often miss content that is both hostile and full of context. LLMs like GPT-3.5 and GPT-4, on the other hand, are very accurate and know what words mean, but they need a lot of processing power and can make stuff up in their outputs. Adding Retrieval-Augmented Generation (RAG) to the LLM pipeline fixes these issues. Retrieval-Augmented Generation (RAG) plays a crucial role by grounding LLM outputs in verified external knowledge, reducing hallucination and improving factual accuracy. By combining retrieval with generation, RAG ensures that misinformation detection becomes more reliable, contextaware, and aligned with real-world evidence. Our hybrid technique works better than existing algorithms on benchmark datasets including LIAR, and Buzz Feed. It has an F1 score of 95.9% and an accuracy score of 96.3%. It also keeps inference times in check. Also, SHAP analysis and attention weight visualisation make things easier to comprehend. This study looks at the good and bad sides of the present ways of identifying misinformation and suggests a solution that is easy to comprehend and can be applied in the real world.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Navi: RAG-Powered LLM Chatbot for Academic Institutions",
    "authors": [
      "Amiya P Bovas"
    ],
    "year": "2025",
    "journal": "International Journal for Research in Applied Science and Engineering Technology",
    "doi": "10.22214/ijraset.2025.75655",
    "pmid": "",
    "abstract": "With the growing integration of artificial intelligence (AI) across educational ecosystems, there is an increasing\ndemand for intelligent conversational agents that can efficiently deliver reliable, domain-specific information to students,\nfaculty, and visitors. This research introduces Navi, an academic virtual assistant designed using a Large Language Model\n(LLM) combined with a Retrieval-Augmented Generation (RAG) framework to generate accurate and contextually grounded\nresponses [1], [5]. The chatbot incorporates the Mistral-7B-Instruct model [3] for response generation and leverages a FAISSbased vector database [4], where embeddings are produced using the all-MiniLM-L6-v2 sentence transformer model. When a\nuser submits a query, relevant document segments are retrieved from institutional data sources and integrated into the LLM\u2019s\nprompt, enabling precise, factual, and contextually aligned output.\nNavi offers a range of advanced capabilities, including natural language understanding, multi-turn contextual dialogues,\nmultilingual query handling, sentiment adaptation, speech-enabled interaction, and user-personalized responses. Performance\nevaluation through simulated academic queries indicates improved response accuracy, coherence, and informativeness,\nachieving an average relevance score between 0.7\u20130.85. The experimental results confirm that combining RAG with an LLM\nsubstantially reduces hallucinations, enhances factual grounding, and improves user satisfaction. Overall, Navi demonstrates a\nscalable and dependable framework for deploying AI-driven information assistants within educational institutions",
    "source_database": "semantic_scholar"
  },
  {
    "title": "LLM-Driven Learner Modeling and Personalized Learning Pathways: A Closed-Loop Framework and Engineering Design for Virtual Laboratories",
    "authors": [
      "Ruijie Wang",
      "Guangtao Xu"
    ],
    "year": "2025",
    "journal": "2025 International Conference on Educational Technology Management (ICETM)",
    "doi": "10.1109/ICETM67477.2025.11413398",
    "pmid": "",
    "abstract": "Focusing on virtual experiment teaching, this paper proposes a personalized learning closed-loop with LLM as the core. A simulation engine provides a verifiable factual baseline, while the LLM undertakes semantic interpretation, two-phase path way generation (skeleton-verification-refinement), fact-grounded judgement and feedback, and explanatory summarization. To enhance robustness and compliance, the framework employs retrieval-augmented generation (RAG), structured outputs, and a second-pass verifier as guardrails. At the learner-modeling layer, we fuse LLM semantic increments with BKT/IRT steady estimates to obtain a fine grained yet stable representation that drives adaptive replanning. The engineering design covers windowed reporting and fact checks, an orchestration service with template interfaces, result caching and tiered inference (small model first), minimal-necessary data collection with anonymization, and classroom-orien ted batching and rate limiting. Although large scale evaluation re mains for future work, the framework connects the key chain \u201cinterpretation\u2014modeling\u2014path\u2014judgement\u2014explanation,\u201d demonstrating interpretability, controllability, and deployment feasibility.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Towards Omni-RAG: Comprehensive Retrieval-Augmented Generation for Large Language Models in Medical Applications",
    "authors": [
      "Zhe Chen",
      "Yusheng Liao",
      "Shuyang Jiang",
      "Pingjie Wang",
      "Yiqiu Guo",
      "Yanfeng Wang",
      "Yu Wang"
    ],
    "year": "2025",
    "journal": "arXiv:2501.02460v3",
    "doi": "",
    "abstract": "Large language models hold promise for addressing medical challenges, such as medical diagnosis reasoning, research knowledge acquisition, clinical decision-making, and consumer health inquiry support. However, they often generate hallucinations due to limited medical knowledge. Incorporating external knowledge is therefore critical, which necessitates multi-source knowledge acquisition. We address this challenge by framing it as a source planning problem, which is to formulate context-appropriate queries tailored to the attributes of diverse sources. Existing approaches either overlook source planning or fail to achieve it effectively due to misalignment between the model's expectation of the sources and their actual content. To bridge this gap, we present MedOmniKB, a repository comprising multigenre and multi-structured medical knowledge sources. Leveraging these sources, we propose the Source Planning Optimisation method, which enhances multi-source utilisation. Our approach involves enabling an expert model to explore and evaluate potential plans while training a smaller model to learn source alignment. Experimental results demonstrate that our method substantially improves multi-source planning performance, enabling the optimised small model to achieve state-of-the-art results in leveraging diverse medical knowledge sources.",
    "source_database": "arxiv",
    "arxiv_id": "2501.02460v3"
  },
  {
    "title": "Large language models in clinical nutrition: an overview of its applications, capabilities, limitations, and potential future prospects.",
    "authors": [
      "Belkhouribchia J",
      "Pen JJ"
    ],
    "year": "2025",
    "journal": "Frontiers in nutrition",
    "doi": "10.3389/fnut.2025.1635682",
    "pmid": "40851903",
    "abstract": "The integration of large language models (LLMs) into clinical nutrition marks a transformative advancement, offering promising solutions for enhancing patient care, personalizing dietary recommendations, and supporting evidence-based clinical decision-making. Trained on extensive text corpora and powered by transformer-based architectures, LLMs demonstrate remarkable capabilities in natural language understanding and generation. This review provides an overview of their current and potential applications in clinical nutrition, focusing on key technologies including prompt engineering, fine-tuning, retrieval-augmented generation, and multimodal integration. These enhancements increase domain relevance, factual accuracy, and contextual responsiveness, enabling LLMs to deliver more reliable outputs in nutrition-related tasks. Recent studies have shown LLMs' utility in dietary planning, nutritional education, obesity management, and malnutrition risk assessment. Despite these advances, challenges remain. Limitations in reasoning, factual accuracy, and domain specificity, along with risks of bias and hallucination, underscore the need for rigorous validation and human oversight. Furthermore, ethical considerations, environmental costs, and infrastructural integration must be addressed before widespread adoption. Future directions include combining LLMs with predictive analytics, integrating them with electronic health records and wearables, and adapting them for multilingual, culturally sensitive dietary guidance. LLMs also hold potential as research and educational tools, assisting in literature synthesis and patient engagement. Their transformative promise depends on cross-disciplinary collaboration, responsible deployment, and clinician training. 
Ultimately, while LLMs are not a replacement for healthcare professionals, they offer powerful augmentation tools for delivering scalable, personalized, and data-driven nutritional care in an increasingly complex healthcare environment.",
    "source_database": "pubmed"
  },
  {
    "title": "AlzheimerRAG: Multimodal Retrieval Augmented Generation for Clinical Use Cases using PubMed articles",
    "authors": [
      "Aritra Kumar Lahiri",
      "Qinmin Vivian Hu"
    ],
    "year": "2024",
    "journal": "arXiv:2412.16701v3",
    "doi": "https://doi.org/10.3390/make7030089",
    "abstract": "Recent advancements in generative AI have fostered the development of highly adept Large Language Models (LLMs) that integrate diverse data types to empower decision-making. Among these, multimodal retrieval-augmented generation (RAG) applications are promising because they combine the strengths of information retrieval and generative models, enhancing their utility across various domains, including clinical use cases. This paper introduces AlzheimerRAG, a Multimodal RAG application for clinical use cases, primarily focusing on Alzheimer's Disease case studies from PubMed articles. This application incorporates cross-modal attention fusion techniques to integrate textual and visual data processing by efficiently indexing and accessing vast amounts of biomedical literature. Our experimental results, compared to benchmarks such as BioASQ and PubMedQA, have yielded improved performance in the retrieval and synthesis of domain-specific information. We also present a case study using our multimodal RAG in various Alzheimer's clinical scenarios. We infer that AlzheimerRAG can generate responses with accuracy non-inferior to humans and with low rates of hallucination.",
    "source_database": "arxiv",
    "arxiv_id": "2412.16701v3"
  },
  {
    "title": "Investigating Retrieval-Augmented Generation in Quranic Studies: A Study of 13 Open-Source Large Language Models",
    "authors": [
      "Zahra Khalila",
      "Arbi Haza Nasution",
      "Winda Monika",
      "Aytug Onan",
      "Yohei Murakami",
      "Yasir Bin Ismail Radi",
      "Noor Mohammad Osmani"
    ],
    "year": "2025",
    "journal": "arXiv:2503.16581v1",
    "doi": "https://doi.org/10.14569/IJACSA.2025.01602134",
    "abstract": "Accurate and contextually faithful responses are critical when applying large language models (LLMs) to sensitive and domain-specific tasks, such as answering queries related to quranic studies. General-purpose LLMs often struggle with hallucinations, where generated responses deviate from authoritative sources, raising concerns about their reliability in religious contexts. This challenge highlights the need for systems that can integrate domain-specific knowledge while maintaining response accuracy, relevance, and faithfulness. In this study, we investigate 13 open-source LLMs categorized into large (e.g., Llama3:70b, Gemma2:27b, QwQ:32b), medium (e.g., Gemma2:9b, Llama3:8b), and small (e.g., Llama3.2:3b, Phi3:3.8b). A Retrieval-Augmented Generation (RAG) is used to make up for the problems that come with using separate models. This research utilizes a descriptive dataset of Quranic surahs including the meanings, historical context, and qualities of the 114 surahs, allowing the model to gather relevant knowledge before responding. The models are evaluated using three key metrics set by human evaluators: context relevance, answer faithfulness, and answer relevance. The findings reveal that large models consistently outperform smaller models in capturing query semantics and producing accurate, contextually grounded responses. The Llama3.2:3b model, even though it is considered small, does very well on faithfulness (4.619) and relevance (4.857), showing the promise of smaller architectures that have been well optimized. This article examines the trade-offs between model size, computational efficiency, and response quality while using LLMs in domain-specific applications.",
    "source_database": "arxiv",
    "arxiv_id": "2503.16581v1"
  },
  {
    "title": "Is Conformal Factuality for RAG-based LLMs Robust? Novel Metrics and Systematic Insights",
    "authors": [
      "Yi Chen",
      "Daiwei Chen",
      "Sukrut Madhav Chikodikar",
      "Caitlyn Heqi Yin",
      "Ramya Korlakai Vinayak"
    ],
    "year": "2026",
    "journal": "arXiv:2603.16817v1",
    "doi": "",
    "abstract": "Large language models (LLMs) frequently hallucinate, limiting their reliability in knowledge-intensive applications. Retrieval-augmented generation (RAG) and conformal factuality have emerged as potential ways to address this limitation. While RAG aims to ground responses in retrieved evidence, it provides no statistical guarantee that the final output is correct. Conformal factuality filtering offers distribution-free statistical reliability by scoring and filtering atomic claims using a threshold calibrated on held-out data, however, the informativeness of the final output is not guaranteed. We systematically analyze the reliability and usefulness of conformal factuality for RAG-based LLMs across generation, scoring, calibration, robustness, and efficiency. We propose novel informativeness-aware metrics that better reflect task utility under conformal filtering. Across three benchmarks and multiple model families, we find that (i) conformal filtering suffers from low usefulness at high factuality levels due to vacuous outputs, (ii) conformal factuality guarantee is not robust to distribution shifts and distractors, highlighting the limitation that requires calibration data to closely match deployment conditions, and (iii) lightweight entailment-based verifiers match or outperform LLM-based model confidence scorers while requiring over $100\\times$ fewer FLOPs. Overall, our results expose factuality-informativeness trade-offs and fragility of conformal filtering framework under distribution shifts and distractors, highlighting the need for new approaches for reliability with robustness and usefulness as key metrics, and provide actionable guidance for building RAG pipelines that are both reliable and computationally efficient.",
    "source_database": "arxiv",
    "arxiv_id": "2603.16817v1"
  },
  {
    "title": "GROUNDEDKG-RAG: Grounded Knowledge Graph Index for Long-document Question Answering",
    "authors": [
      "Tianyi Zhang",
      "Andreas Marfurt"
    ],
    "year": "2026",
    "journal": "arXiv:2604.04359v1",
    "doi": "",
    "abstract": "Retrieval-augmented generation (RAG) systems have been widely adopted in contemporary large language models (LLMs) due to their ability to improve generation quality while reducing the required input context length. In this work, we focus on RAG systems for long-document question answering. Current approaches suffer from a heavy reliance on LLM descriptions resulting in high resource consumption and latency, repetitive content across hierarchical levels, and hallucinations due to no or limited grounding in the source text. To improve both efficiency and factual accuracy through grounding, we propose GroundedKG-RAG, a RAG system in which the knowledge graph is explicitly extracted from and grounded in the source document. Specifically, we define nodes in GroundedKG as entities and actions, and edges as temporal or semantic relations, with each node and edge grounded in the original sentences. We construct GroundedKG from semantic role labeling (SRL) and abstract meaning representation (AMR) parses and then embed it for retrieval. During querying, we apply the same transformation to the query and retrieve the most relevant sentences from the grounded source text for question answering. We evaluate GroundedKG-RAG on examples from the NarrativeQA dataset and find that it performs on par with a state-of-the art proprietary long-context model at smaller cost and outperforms a competitive baseline. Additionally, our GroundedKG is interpretable and readable by humans, facilitating auditing of results and error analysis.",
    "source_database": "arxiv",
    "arxiv_id": "2604.04359v1"
  },
  {
    "title": "A Systematic Review of Key Retrieval-Augmented Generation (RAG) Systems: Progress, Gaps, and Future Directions",
    "authors": [
      "Agada Joseph Oche",
      "Ademola Glory Folashade",
      "Tirthankar Ghosal",
      "Arpan Biswas"
    ],
    "year": "2025",
    "journal": "ArXiv",
    "doi": "10.48550/arXiv.2507.18910",
    "pmid": "",
    "abstract": "Retrieval-Augmented Generation (RAG) represents a major advancement in natural language processing (NLP), combining large language models (LLMs) with information retrieval systems to enhance factual grounding, accuracy, and contextual relevance. This paper presents a comprehensive systematic review of RAG, tracing its evolution from early developments in open domain question answering to recent state-of-the-art implementations across diverse applications. The review begins by outlining the motivations behind RAG, particularly its ability to mitigate hallucinations and outdated knowledge in parametric models. Core technical components-retrieval mechanisms, sequence-to-sequence generation models, and fusion strategies are examined in detail. A year-by-year analysis highlights key milestones and research trends, providing insight into RAG's rapid growth. The paper further explores the deployment of RAG in enterprise systems, addressing practical challenges related to retrieval of proprietary data, security, and scalability. A comparative evaluation of RAG implementations is conducted, benchmarking performance on retrieval accuracy, generation fluency, latency, and computational efficiency. Persistent challenges such as retrieval quality, privacy concerns, and integration overhead are critically assessed. Finally, the review highlights emerging solutions, including hybrid retrieval approaches, privacy-preserving techniques, optimized fusion strategies, and agentic RAG architectures. These innovations point toward a future of more reliable, efficient, and context-aware knowledge-intensive NLP systems.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Turk-LettuceDetect: A Hallucination Detection Model for Turkish RAG Applications",
    "authors": [
      "Selva Tas",
      "Mahmut El Huseyni",
      "\u00d6zay Ezerceli",
      "Reyhan Bayraktar",
      "Fatma Bet\u00fcl Terzioglu"
    ],
    "year": "2025",
    "journal": "2025 3rd International Conference on Foundation and Large Language Models (FLLM)",
    "doi": "10.1109/FLLM67465.2025.11390913",
    "pmid": "",
    "abstract": "The widespread adoption of Large Language Models (LLMs) has been hindered by their tendency to hallucinate, generating plausible but factually incorrect information. While Retrieval-Augmented Generation (RAG) systems attempt to address this issue by grounding responses in external knowledge, hallucination remains a persistent challenge, particularly for morphologically complex, low-resource languages like Turkish. This paper introduces Turk-LettuceDetect, the first suite of hallucination detection models specifically designed for Turkish RAG applications. Building on the LettuceDetect framework, we formulate hallucination detection as a token-level classification task and fine-tune three distinct encoder architectures: a Turkishspecific ModernBERT, TurkEmbed4STS, and multilingual EuroBERT. These models were trained on a machine-translated version of the RAGTruth benchmark dataset containing 17,790 instances across question answering, data-to-text generation, and summarization tasks. Our experimental results show that the ModernBERT-based model achieves an F1-score of 0.7266 on the complete test set, with particularly strong performance on structured tasks. The models maintain computational efficiency while supporting long contexts up to 8,192 tokens, making them suitable for real-time deployment. Comparative analysis reveals that while state-of-the-art LLMs demonstrate high recall, they suffer from low precision due to over-generation of hallucinated content, underscoring the necessity of specialized detection mechanisms. By releasing our models and translated dataset, this work addresses a critical gap in multilingual NLP and establishes a foundation for developing more reliable and trustworthy AI applications for Turkish and other languages.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "A Survey of Retrieval-Augmented Generation (RAG) for Large Language Models",
    "authors": [
      "Yusong Ma",
      "Hongxuan Nie",
      "Chao Chen",
      "Jiujie Zhang",
      "Jiali Jiang",
      "Bisheng Wang",
      "Yuqi Xia"
    ],
    "year": "2025",
    "journal": "2025 International Conference on Trustworthy Big Data and Artificial Intelligence (ICTBAI)",
    "doi": "10.1109/ICTBAI68361.2025.00008",
    "pmid": "",
    "abstract": "While Large Language Models (LLMs) are revolutionary, their deployment is constrained by inherent limitations such as factual hallucination and static knowledge. This survey systematically reviews Retrieval-Augmented Generation (RAG), a key paradigm for addressing these challenges by grounding LLMs in external, verifiable knowledge. To overcome the flaws of standalone models, RAG integrates LLMs with updatable knowledge bases, a hybrid approach that significantly enhances output accuracy and trustworthiness. Our primary finding is the technology\u2019s clear evolutionary trajectory, which we structure into three stages: Naive, Advanced, and Modular RAG. This progression demonstrates a shift away from monolithic parametric memory towards intelligent systems that interact with external data. By summarizing the field\u2019s progression, key challenges like retriever-generator alignment, and future directions such as integration with agentic architectures, this work concludes that RAG is a crucial technology for propelling AI to be more evidence-based and capable of complex reasoning.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "THaMES: An End-to-End Tool for Hallucination Mitigation and Evaluation in Large Language Models",
    "authors": [
      "Mengfei Liang",
      "Archish Arun",
      "Zekun Wu",
      "Cristian Munoz",
      "Jonathan Lutch",
      "Emre Kazim",
      "Adriano Koshiyama",
      "Philip Treleaven"
    ],
    "year": "2024",
    "journal": "arXiv:2409.11353v3",
    "doi": "",
    "abstract": "Hallucination, the generation of factually incorrect content, is a growing challenge in Large Language Models (LLMs). Existing detection and mitigation methods are often isolated and insufficient for domain-specific needs, lacking a standardized pipeline. This paper introduces THaMES (Tool for Hallucination Mitigations and EvaluationS), an integrated framework and library addressing this gap. THaMES offers an end-to-end solution for evaluating and mitigating hallucinations in LLMs, featuring automated test set generation, multifaceted benchmarking, and adaptable mitigation strategies. It automates test set creation from any corpus, ensuring high data quality, diversity, and cost-efficiency through techniques like batch processing, weighted sampling, and counterfactual validation. THaMES assesses a model's ability to detect and reduce hallucinations across various tasks, including text generation and binary classification, applying optimal mitigation strategies like In-Context Learning (ICL), Retrieval Augmented Generation (RAG), and Parameter-Efficient Fine-tuning (PEFT). Evaluations of state-of-the-art LLMs using a knowledge base of academic papers, political news, and Wikipedia reveal that commercial models like GPT-4o benefit more from RAG than ICL, while open-weight models like Llama-3.1-8B-Instruct and Mistral-Nemo gain more from ICL. Additionally, PEFT significantly enhances the performance of Llama-3.1-8B-Instruct in both evaluation tasks.",
    "source_database": "arxiv",
    "arxiv_id": "2409.11353v3"
  },
  {
    "title": "fastbmRAG: A Fast Graph-Based RAG Framework for Efficient Processing of Large-Scale Biomedical Literature",
    "authors": [
      "Guofeng Meng",
      "Li Shen",
      "Qiuyan Zhong",
      "Wei Wang",
      "Haizhou Zhang",
      "Xiaozhen Wang"
    ],
    "year": "2025",
    "journal": "arXiv:2511.10014v1",
    "doi": "",
    "abstract": "Large language models (LLMs) are rapidly transforming various domains, including biomedicine and healthcare, and demonstrate remarkable potential from scientific research to new drug discovery. Graph-based retrieval-augmented generation (RAG) systems, as a useful application of LLMs, can improve contextual reasoning through structured entity and relationship identification from long-context knowledge, e.g. biomedical literature. Even though many advantages over naive RAGs, most of graph-based RAGs are computationally intensive, which limits their application to large-scale dataset. To address this issue, we introduce fastbmRAG, an fast graph-based RAG optimized for biomedical literature. Utilizing well organized structure of biomedical papers, fastbmRAG divides the construction of knowledge graph into two stages, first drafting graphs using abstracts; and second, refining them using main texts guided by vector-based entity linking, which minimizes redundancy and computational load. Our evaluations demonstrate that fastbmRAG is over 10x faster than existing graph-RAG tools and achieve superior coverage and accuracy to input knowledge. FastbmRAG provides a fast solution for quickly understanding, summarizing, and answering questions about biomedical literature on a large scale. FastbmRAG is public available in https://github.com/menggf/fastbmRAG.",
    "source_database": "arxiv",
    "arxiv_id": "2511.10014v1"
  },
  {
    "title": "Engineering RAG Systems for Real-World Applications: Design, Development, and Evaluation",
    "authors": [
      "Md Toufique Hasan",
      "Muhammad Waseem",
      "Kai-Kristian Kemell",
      "A. Khan",
      "Mika Saari",
      "Pekka Abrahamsson"
    ],
    "year": "2025",
    "journal": "ArXiv",
    "doi": "10.1007/978-3-032-04200-2_10",
    "pmid": "",
    "abstract": "Retrieval-Augmented Generation (RAG) systems are emerging as a key approach for grounding Large Language Models (LLMs) in external knowledge, addressing limitations in factual accuracy and contextual relevance. However, there is a lack of empirical studies that report on the development of RAG-based implementations grounded in real-world use cases, evaluated through general user involvement, and accompanied by systematic documentation of lessons learned. This paper presents five domain-specific RAG applications developed for real-world scenarios across governance, cybersecurity, agriculture, industrial research, and medical diagnostics. Each system incorporates multilingual OCR, semantic retrieval via vector embeddings, and domain-adapted LLMs, deployed through local servers or cloud APIs to meet distinct user needs. A web-based evaluation involving a total of 100 participants assessed the systems across six dimensions: (i) Ease of Use, (ii) Relevance, (iii) Transparency, (iv) Responsiveness, (v) Accuracy, and (vi) Likelihood of Recommendation. Based on user feedback and our development experience, we documented twelve key lessons learned, highlighting technical, operational, and ethical challenges affecting the reliability and usability of RAG systems in practice.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Promises and challenges of applying large language models in the healthcare domain.",
    "authors": [
      "Wang Q",
      "Gong Z",
      "Lai Z",
      "Bu L",
      "Dahlweid FM",
      "Sun H"
    ],
    "year": "2026",
    "journal": "Frontiers in digital health",
    "doi": "10.3389/fdgth.2026.1772274",
    "pmid": "41924178",
    "abstract": "Large language models are rapidly moving from theoretical concepts to active clinical pilots. Current approaches diverge between general-purpose models, which adapt to healthcare via prompt engineering, and domain-specific models, which prioritize deep alignment with medical knowledge graphs to ensure safety. Despite reported benefits in documentation efficiency and diagnostic reasoning, significant challenges remain regarding hallucination, privacy, and the validity of evaluation metrics. This Mini Review synthesizes current evidence, contrasts these two modeling paradigms, highlights key controversies, and maps out future development routes including retrieval-augmented generation and agentic architectures.",
    "source_database": "pubmed"
  },
  {
    "title": "Large language models for clinical decision support in gastroenterology and hepatology.",
    "authors": [
      "Wiest IC",
      "Bhat M",
      "Clusmann J",
      "Schneider CV",
      "Jiang X",
      "Kather JN"
    ],
    "year": "2025",
    "journal": "Nature reviews. Gastroenterology & hepatology",
    "doi": "10.1038/s41575-025-01108-1",
    "pmid": "40846793",
    "abstract": "Clinical decision making in gastroenterology and hepatology has become increasingly complex and challenging for physicians. This growing complexity can be addressed by computational tools that support clinical decisions. Although numerous clinical decision support systems (CDSS) have emerged, they have faced difficulties with real-world performance and generalizability, resulting in limited clinical adoption. Generative artificial intelligence (AI), particularly large language models (LLMs), are introducing new possibilities for CDSS by offering more flexible and adaptable support that better reflects complex clinical scenarios. LLMs can process unstructured text, including patient data and medical guidelines, and integrate various information sources with high accuracy, especially when augmented with retrieval-augmented generation. Thus, LLMs can provide dynamic, context-specific support by generating personalized treatment recommendations, identifying potential complications based on patient history, and enabling natural language interactions with health-care providers. However, important challenges persist, particularly regarding biases, hallucinations, interoperability barriers, and proper training of health-care providers. We examine the parallel evolution of the complexity in clinical management in gastroenterology and hepatology, and the technical developments leading to current generative AI models. We discuss how these advances are converging to create effective CDSS, providing a conceptual basis for further development and clinical adoption of these systems.",
    "source_database": "pubmed"
  },
  {
    "title": "MultiHop-RAG: Benchmarking Retrieval-Augmented Generation for Multi-Hop Queries",
    "authors": [
      "Yixuan Tang",
      "Yi Yang"
    ],
    "year": "2024",
    "journal": "arXiv:2401.15391v1",
    "doi": "",
    "abstract": "Retrieval-augmented generation (RAG) augments large language models (LLM) by retrieving relevant knowledge, showing promising potential in mitigating LLM hallucinations and enhancing response quality, thereby facilitating the great adoption of LLMs in practice. However, we find that existing RAG systems are inadequate in answering multi-hop queries, which require retrieving and reasoning over multiple pieces of supporting evidence. Furthermore, to our knowledge, no existing RAG benchmarking dataset focuses on multi-hop queries. In this paper, we develop a novel dataset, MultiHop-RAG, which consists of a knowledge base, a large collection of multi-hop queries, their ground-truth answers, and the associated supporting evidence. We detail the procedure of building the dataset, utilizing an English news article dataset as the underlying RAG knowledge base. We demonstrate the benchmarking utility of MultiHop-RAG in two experiments. The first experiment compares different embedding models for retrieving evidence for multi-hop queries. In the second experiment, we examine the capabilities of various state-of-the-art LLMs, including GPT-4, PaLM, and Llama2-70B, in reasoning and answering multi-hop queries given the evidence. Both experiments reveal that existing RAG methods perform unsatisfactorily in retrieving and answering multi-hop queries. We hope MultiHop-RAG will be a valuable resource for the community in developing effective RAG systems, thereby facilitating greater adoption of LLMs in practice. The MultiHop-RAG and implemented RAG system is publicly available at https://github.com/yixuantt/MultiHop-RAG/.",
    "source_database": "arxiv",
    "arxiv_id": "2401.15391v1"
  },
  {
    "title": "Less Finetuning, Better Retrieval: Rethinking LLM Adaptation for Biomedical Retrievers via Synthetic Data and Model Merging",
    "authors": [
      "Sameh Khattab",
      "Jean-Philippe Corbeil",
      "Osman Alperen Koras",
      "Amin Dada",
      "Julian Friedrich",
      "Fran\u00e7ois Beaulieu",
      "Paul Vozila",
      "J. Kleesiek"
    ],
    "year": "2026",
    "journal": "ArXiv",
    "doi": "10.48550/arXiv.2602.04731",
    "pmid": "",
    "abstract": "Retrieval-augmented generation (RAG) has become the backbone of grounding Large Language Models (LLMs), improving knowledge updates and reducing hallucinations. Recently, LLM-based retriever models have shown state-of-the-art performance for RAG applications. However, several technical aspects remain underexplored on how to adapt general-purpose LLMs into effective domain-specific retrievers, especially in specialized domains such as biomedicine. We present Synthesize-Train-Merge (STM), a modular framework that enhances decoder-only LLMs with synthetic hard negatives, retrieval prompt optimization, and model merging. Experiments on a subset of 12 medical and general tasks from the MTEB benchmark show STM boosts task-specific experts by up to 23.5\\% (average 7.5\\%) and produces merged models that outperform both single experts and strong baselines without extensive pretraining. Our results demonstrate a scalable, efficient path for turning general LLMs into high-performing, domain-specialized retrievers, preserving general-domain capabilities while excelling on specialized tasks.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Stream RAG: Instant and Accurate Spoken Dialogue Systems with Streaming Tool Usage",
    "authors": [
      "Siddhant Arora",
      "Haidar Khan",
      "Kai Sun",
      "Xin Dong",
      "Sajal Choudhary",
      "Seungwhan Moon",
      "Xinyuan Zhang",
      "Adithya Sagar",
      "S. Appini",
      "Kaushik Patnaik",
      "Sanat Sharma",
      "Shinji Watanabe",
      "Anuj Kumar",
      "Ahmed A Aly",
      "Yue Liu",
      "Florian Metze",
      "Zhaojiang Lin"
    ],
    "year": "2025",
    "journal": "ArXiv",
    "doi": "10.48550/arXiv.2510.02044",
    "pmid": "",
    "abstract": "End-to-end speech-in speech-out dialogue systems are emerging as a powerful alternative to traditional ASR-LLM-TTS pipelines, generating more natural, expressive responses with significantly lower latency. However, these systems remain prone to hallucinations due to limited factual grounding. While text-based dialogue systems address this challenge by integrating tools such as web search and knowledge graph APIs, we introduce the first approach to extend tool use directly into speech-in speech-out systems. A key challenge is that tool integration substantially increases response latency, disrupting conversational flow. To mitigate this, we propose Streaming Retrieval-Augmented Generation (Streaming RAG), a novel framework that reduces user-perceived latency by predicting tool queries in parallel with user speech, even before the user finishes speaking. Specifically, we develop a post-training pipeline that teaches the model when to issue tool calls during ongoing speech and how to generate spoken summaries that fuse audio queries with retrieved text results, thereby improving both accuracy and responsiveness. To evaluate our approach, we construct AudioCRAG, a benchmark created by converting queries from the publicly available CRAG dataset into speech form. Experimental results demonstrate that our streaming RAG approach increases QA accuracy by up to 200% relative (from 11.1% to 34.2% absolute) and further enhances user experience by reducing tool use latency by 20%. Importantly, our streaming RAG approach is modality-agnostic and can be applied equally to typed input, paving the way for more agentic, real-time AI assistants.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Generate but Verify: Answering with Faithfulness in RAG-based Question Answering",
    "authors": [
      "Simone Filice",
      "Elad Haramaty",
      "Guy Horowitz",
      "Zohar S. Karnin",
      "L. Lewin-Eytan",
      "Alex Shtoff"
    ],
    "year": "2025",
    "journal": "",
    "doi": "10.18653/v1/2025.ijcnlp-long.56",
    "pmid": "",
    "abstract": "Retrieval-Augmented Generation (RAG) enhances LLMs by grounding answers in retrieved passages, which is key in factual Question Answering. However, generated answers may still be unfaithful to the passages, either due to retrieval or generation errors. Many RAG downstream applications rely on assessing answer faithfulness for applying fallback strategies, yet address it implicitly, without a consistent evaluation methodology. We introduce the task of Answering with Faithfulness (AwF), which brings faithfulness prediction to the forefront, explicitly coupling it with answer generation. We define variants of the precision and recall metrics tailored to this task, facilitating direct evaluation and comparison of different AwF methods. We then demonstrate, both theoretically and empirically, that for RAG applications using AwF as a sub-procedure, an improvement to the AwF metrics translates to an improvement to the downstream performance. This results in improved performance for recently published results.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Hallucination Detection with Small Language Models",
    "authors": [
      "Ming Cheung"
    ],
    "year": "2025",
    "journal": "arXiv:2506.22486v1",
    "doi": "",
    "abstract": "Since the introduction of ChatGPT, large language models (LLMs) have demonstrated significant utility in various tasks, such as answering questions through retrieval-augmented generation. Context can be retrieved using a vectorized database, serving as a foundation for LLMs to generate responses. However, hallucinations in responses can undermine the reliability of LLMs in practical applications, and they are not easily detectable in the absence of ground truth, particularly in question-and-answer scenarios. This paper proposes a framework that integrates multiple small language models to verify responses generated by LLMs using the retrieved context from a vectorized database. By breaking down the responses into individual sentences and utilizing the probability of generating \"Yes\" tokens from the outputs of multiple models for a given set of questions, responses, and relevant context, hallucinations can be detected. The proposed framework is validated through experiments with real datasets comprising over 100 sets of questions, answers, and contexts, including responses with fully and partially correct sentences. The results demonstrate a 10\\% improvement in F1 scores for detecting correct responses compared to hallucinations, indicating that multiple small language models can be effectively employed for answer verification, providing a scalable and efficient solution for both academic and practical applications.",
    "source_database": "arxiv",
    "arxiv_id": "2506.22486v1"
  },
  {
    "title": "ChatTogoVar: a TogoVar-based retrieval-augmented generation system for precise genomic variant interpretation.",
    "authors": [
      "Mitsuhashi N",
      "Fujiwara T",
      "Yamaguchi A"
    ],
    "year": "2026",
    "journal": "Human genome variation",
    "doi": "10.1038/s41439-026-00344-4",
    "pmid": "41956998",
    "abstract": "Large language models (LLMs) have recently been adopted to assist in the interpretation of human genomic variants. However, general-purpose LLMs can produce incorrect outputs (commonly termed 'hallucinations'), particularly on specialized queries, raising concerns about their reliability for variant interpretation. Here, to mitigate this risk, we developed ChatTogoVar, a retrieval-augmented generation system that queries TogoVar, a variant database that integrates information, such as allele frequency and clinical significance, and incorporates the retrieved results into prompts. We constructed a benchmark of 150 questions sampled from a predefined pool of 1500 template-variant combinations (50 templates \u00d7 30 variants). For large-scale assessment, we used the full 1500-question pool for automated LLM-based scoring. ChatTogoVar achieved the highest score for 135/150 questions, outperforming both a general-purpose LLM and an existing specialized system. Furthermore, automatic evaluation of all 1500 questions by an LLM confirmed the same trend. These results suggest that integrating a reliable variant database with an LLM can improve the accuracy of variant interpretation and that ChatTogoVar may serve as a practical tool to support genomic medicine and personalized healthcare.",
    "source_database": "pubmed"
  },
  {
    "title": "Retrieval augmented generation for large language models in healthcare: A systematic review.",
    "authors": [
      "Amugongo LM",
      "Mascheroni P",
      "Brooks S",
      "Doering S",
      "Seidel J"
    ],
    "year": "2025",
    "journal": "PLOS digital health",
    "doi": "10.1371/journal.pdig.0000877",
    "pmid": "40498738",
    "abstract": "Large Language Models (LLMs) have demonstrated promising capabilities to solve complex tasks in critical sectors such as healthcare. However, LLMs are limited by their training data which is often outdated, the tendency to generate inaccurate (\"hallucinated\") content and a lack of transparency in the content they generate. To address these limitations, retrieval augmented generation (RAG) grounds the responses of LLMs by exposing them to external knowledge sources. However, in the healthcare domain there is currently a lack of systematic understanding of which datasets, RAG methodologies and evaluation frameworks are available. This review aims to bridge this gap by assessing RAG-based approaches employed by LLMs in healthcare, focusing on the different steps of retrieval, augmentation and generation. Additionally, we identify the limitations, strengths and gaps in the existing literature. Our synthesis shows that 78.9% of studies used English datasets and 21.1% of the datasets are in Chinese. We find that a range of techniques are employed RAG-based LLMs in healthcare, including Naive RAG, Advanced RAG, and Modular RAG. Surprisingly, proprietary models such as GPT-3.5/4 are the most used for RAG applications in healthcare. We find that there is a lack of standardised evaluation frameworks for RAG-based applications. In addition, the majority of the studies do not assess or address ethical considerations related to RAG in healthcare. It is important to account for ethical challenges that are inherent when AI systems are implemented in the clinical setting. Lastly, we highlight the need for further research and development to ensure responsible and effective adoption of RAG in the medical domain.",
    "source_database": "pubmed"
  },
  {
    "title": "Riddle Me This! Stealthy Membership Inference for Retrieval-Augmented Generation",
    "authors": [
      "Ali Naseh",
      "Yuefeng Peng",
      "Anshuman Suri",
      "Harsh Chaudhari",
      "Alina Oprea",
      "Amir Houmansadr"
    ],
    "year": "2025",
    "journal": "arXiv:2502.00306v2",
    "doi": "",
    "abstract": "Retrieval-Augmented Generation (RAG) enables Large Language Models (LLMs) to generate grounded responses by leveraging external knowledge databases without altering model parameters. Although the absence of weight tuning prevents leakage via model parameters, it introduces the risk of inference adversaries exploiting retrieved documents in the model's context. Existing methods for membership inference and data extraction often rely on jailbreaking or carefully crafted unnatural queries, which can be easily detected or thwarted with query rewriting techniques common in RAG systems. In this work, we present Interrogation Attack (IA), a membership inference technique targeting documents in the RAG datastore. By crafting natural-text queries that are answerable only with the target document's presence, our approach demonstrates successful inference with just 30 queries while remaining stealthy; straightforward detectors identify adversarial prompts from existing methods up to ~76x more frequently than those generated by our attack. We observe a 2x improvement in TPR@1%FPR over prior inference attacks across diverse RAG configurations, all while costing less than $0.02 per document inference.",
    "source_database": "arxiv",
    "arxiv_id": "2502.00306v2"
  },
  {
    "title": "RAGPart & RAGMask: Retrieval-Stage Defenses Against Corpus Poisoning in Retrieval-Augmented Generation",
    "authors": [
      "Pankayaraj Pathmanathan",
      "Michael-Andrei Panaitescu-Liess",
      "Cho-Yu Jason Chiang",
      "Furong Huang"
    ],
    "year": "2025",
    "journal": "arXiv:2512.24268v1",
    "doi": "",
    "abstract": "Retrieval-Augmented Generation (RAG) has emerged as a promising paradigm to enhance large language models (LLMs) with external knowledge, reducing hallucinations and compensating for outdated information. However, recent studies have exposed a critical vulnerability in RAG pipelines corpus poisoning where adversaries inject malicious documents into the retrieval corpus to manipulate model outputs. In this work, we propose two complementary retrieval-stage defenses: RAGPart and RAGMask. Our defenses operate directly on the retriever, making them computationally lightweight and requiring no modification to the generation model. RAGPart leverages the inherent training dynamics of dense retrievers, exploiting document partitioning to mitigate the effect of poisoned points. In contrast, RAGMask identifies suspicious tokens based on significant similarity shifts under targeted token masking. Across two benchmarks, four poisoning strategies, and four state-of-the-art retrievers, our defenses consistently reduce attack success rates while preserving utility under benign conditions. We further introduce an interpretable attack to stress-test our defenses. Our findings highlight the potential and limitations of retrieval-stage defenses, providing practical insights for robust RAG deployments.",
    "source_database": "arxiv",
    "arxiv_id": "2512.24268v1"
  },
  {
    "title": "MultiRAG: A Knowledge-guided Framework for Mitigating Hallucination in Multi-source Retrieval Augmented Generation",
    "authors": [
      "Wenlong Wu",
      "Haofen Wang",
      "Bohan Li",
      "Peixuan Huang",
      "Xinzhe Zhao",
      "Lei Liang"
    ],
    "year": "2025",
    "journal": "arXiv:2508.03553v1",
    "doi": "https://doi.org/10.1109/ICDE65448.2025.00230",
    "abstract": "Retrieval Augmented Generation (RAG) has emerged as a promising solution to address hallucination issues in Large Language Models (LLMs). However, the integration of multiple retrieval sources, while potentially more informative, introduces new challenges that can paradoxically exacerbate hallucination problems. These challenges manifest primarily in two aspects: the sparse distribution of multi-source data that hinders the capture of logical relationships and the inherent inconsistencies among different sources that lead to information conflicts. To address these challenges, we propose MultiRAG, a novel framework designed to mitigate hallucination in multi-source retrieval-augmented generation through knowledge-guided approaches. Our framework introduces two key innovations: (1) a knowledge construction module that employs multi-source line graphs to efficiently aggregate logical relationships across different knowledge sources, effectively addressing the sparse data distribution issue; and (2) a sophisticated retrieval module that implements a multi-level confidence calculation mechanism, performing both graph-level and node-level assessments to identify and eliminate unreliable information nodes, thereby reducing hallucinations caused by inter-source inconsistencies. Extensive experiments on four multi-domain query datasets and two multi-hop QA datasets demonstrate that MultiRAG significantly enhances the reliability and efficiency of knowledge retrieval in complex multi-source scenarios. \\textcolor{blue}{Our code is available in https://github.com/wuwenlong123/MultiRAG.",
    "source_database": "arxiv",
    "arxiv_id": "2508.03553v1"
  },
  {
    "title": "CARROT: A Learned Cost-Constrained Retrieval Optimization System for RAG",
    "authors": [
      "Ziting Wang",
      "Haitao Yuan",
      "Wei Dong",
      "Gao Cong",
      "Feifei Li"
    ],
    "year": "2024",
    "journal": "arXiv:2411.00744v2",
    "doi": "",
    "abstract": "Large Language Models (LLMs) have demonstrated impressive ability in generation and reasoning tasks but struggle with handling up-to-date knowledge, leading to inaccuracies or hallucinations. Retrieval-Augmented Generation (RAG) mitigates this by retrieving and incorporating external knowledge into input prompts. In particular, due to LLMs' context window limitations and long-context hallucinations, only the most relevant \"chunks\" are retrieved. However, current RAG systems face three key challenges: (1) chunks are often retrieved independently without considering their relationships, such as redundancy and ordering; (2) the utility of chunks is non-monotonic, as adding more chunks can degrade quality; and (3) retrieval strategies fail to adapt to the unique characteristics of different queries. To overcome these challenges, we design a cost-constrained retrieval optimization framework for RAG. We adopt a Monte Carlo Tree Search (MCTS) based strategy to find the optimal chunk combination order, which considers the chunks' correlations. In addition, to address the non-monotonicity of chunk utility, instead of treating budget exhaustion as the termination condition, we design a utility computation strategy to identify the optimal chunk combination without necessarily exhausting the budget. Furthermore, we propose a configuration agent that predicts optimal configurations for each query domain, improving our framework's adaptability and efficiency. Experimental results demonstrate up to a 30% improvement over baseline models, highlighting the framework's effectiveness, scalability, and suitability. Our source code has been released at https://github.com/wang0702/CARROT.",
    "source_database": "arxiv",
    "arxiv_id": "2411.00744v2"
  },
  {
    "title": "Hybrid Retrieval for Hallucination Mitigation in Large Language Models: A Comparative Analysis",
    "authors": [
      "Chandana Sree Mala",
      "Gizem Gezici",
      "Fosca Giannotti"
    ],
    "year": "2025",
    "journal": "arXiv:2504.05324v1",
    "doi": "",
    "abstract": "Large Language Models (LLMs) excel in language comprehension and generation but are prone to hallucinations, producing factually incorrect or unsupported outputs. Retrieval Augmented Generation (RAG) systems address this issue by grounding LLM responses with external knowledge. This study evaluates the relationship between retriever effectiveness and hallucination reduction in LLMs using three retrieval approaches: sparse retrieval based on BM25 keyword search, dense retrieval using semantic search with Sentence Transformers, and a proposed hybrid retrieval module. The hybrid module incorporates query expansion and combines the results of sparse and dense retrievers through a dynamically weighted Reciprocal Rank Fusion score. Using the HaluBench dataset, a benchmark for hallucinations in question answering tasks, we assess retrieval performance with metrics such as mean average precision and normalised discounted cumulative gain, focusing on the relevance of the top three retrieved documents. Results show that the hybrid retriever achieves better relevance scores, outperforming both sparse and dense retrievers. Further evaluation of LLM-generated answers against ground truth using metrics such as accuracy, hallucination rate, and rejection rate reveals that the hybrid retriever achieves the highest accuracy on fails, the lowest hallucination rate, and the lowest rejection rate. These findings highlight the hybrid retriever's ability to enhance retrieval relevance, reduce hallucination rates, and improve LLM reliability, emphasising the importance of advanced retrieval techniques in mitigating hallucinations and improving response accuracy.",
    "source_database": "arxiv",
    "arxiv_id": "2504.05324v1"
  },
  {
    "title": "KRAGEN: a knowledge graph-enhanced RAG framework for biomedical problem solving using large language models",
    "authors": [
      "Nicholas Matsumoto",
      "Jay Moran",
      "Hyunjun Choi",
      "Miguel E. Hernandez",
      "Mythreye Venkatesan",
      "Z. Wang",
      "Jason H. Moore"
    ],
    "year": "2024",
    "journal": "Bioinformatics",
    "doi": "10.1093/bioinformatics/btae353",
    "pmid": "38830083",
    "abstract": "Abstract Motivation Answering and solving complex problems using a large language model (LLM) given a certain domain such as biomedicine is a challenging task that requires both factual consistency and logic, and LLMs often suffer from some major limitations, such as hallucinating false or irrelevant information, or being influenced by noisy data. These issues can compromise the trustworthiness, accuracy, and compliance of LLM-generated text and insights. Results Knowledge Retrieval Augmented Generation ENgine (KRAGEN) is a new tool that combines knowledge graphs, Retrieval Augmented Generation (RAG), and advanced prompting techniques to solve complex problems with natural language. KRAGEN converts knowledge graphs into a vector database and uses RAG to retrieve relevant facts from it. KRAGEN uses advanced prompting techniques: namely graph-of-thoughts (GoT), to dynamically break down a complex problem into smaller subproblems, and proceeds to solve each subproblem by using the relevant knowledge through the RAG framework, which limits the hallucinations, and finally, consolidates the subproblems and provides a solution. KRAGEN\u2019s graph visualization allows the user to interact with and evaluate the quality of the solution\u2019s GoT structure and logic. Availability and implementation KRAGEN is deployed by running its custom Docker containers. KRAGEN is available as open-source from GitHub at: https://github.com/EpistasisLab/KRAGEN.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Towards a Multi-Agent System Based on LLM and RAG for Automated and Customizable Urban Diagnostics",
    "authors": [
      "Rida Azmi",
      "Ebnou Abdem Seyid Abdellahi",
      "Mariem Bounabi",
      "J\u00e9r\u00f4me Chenal",
      "Mohammed Hlal",
      "Elbachir Diop"
    ],
    "year": "2025",
    "journal": "2025 International Conference on Intelligent Systems: Theories and Applications (SITA)",
    "doi": "10.1109/SITA67914.2025.11273206",
    "pmid": "",
    "abstract": "The increasing complexity and dynamism of urban environments necessitate advanced tools for comprehensive and timely diagnostics. Traditional methods are often labor-intensive, fragmented, and struggle to synthesize the vast, heterogeneous data streams generated by modern cities. This paper presents a novel theoretical framework for a multi-agent system that synergistically integrates Large Language Models (LLMs) and Retrieval-Augmented Generation (RAG) to deliver automated and customizable urban diagnostics. The proposed system employs a modular, plug-and-play architecture orchestrated by a core LLM, which coordinates a team of specialized agents for tasks including data extraction, analysis, auto-debugging, and report generation. A key innovation is the use of a handbook driven RAG mechanism, where structured technical guides for various data sources and thematic domains serve as a verifiable knowledge base, grounding the system's outputs in factual, domain-specific information. This knowledge-driven approach enables the dynamic generation of code, the handling of diverse data formats, and the assembly of complex diagnostic reports tailored to user specifications provided in natural language. By outlining the system's architecture, workflow, knowledge management strategy, and core theoretical principles, this paper establishes a foundational contribution towards developing more intelligent, adaptive, and reliable systems for urban planning and governance.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Conceptual Design of an LLM-Based Tech Product Recommendation System Using LangChain, LangGraph, Firecrawl, and n8n with RAG, Fine-Tuning, Prompt Engineering, and KNN with Cosine Similarity",
    "authors": [
      "Mrs. Abha Pathak",
      "Mrs. Tejaswini Mali",
      "Mr. Sanket Rathod",
      "Mr. Niraj Rane",
      "Ms. Aditi Rakh",
      "Ms. Swapnali Pimpare"
    ],
    "year": "2025",
    "journal": "International Journal of Advanced Research in Science, Communication and Technology",
    "doi": "10.48175/ijarsct-29973",
    "pmid": "",
    "abstract": "Choosing the right technology product has become increasingly difficult for consumers due to limited technical knowledge, rapidly evolving specifications, and the overwhelming number of available options. Traditional recommendation systems rely on static filters or keyword-based searches, often producing incomplete or context-insensitive results. This paper proposes a conceptual design for an AI-driven recommendation framework that leverages Large Language Models (LLMs) to deliver accurate, explainable, and personalized product suggestions. The system integrates LangChain and LangGraph to manage reasoning, tool orchestration, and multi-step control flow, while product similarity is computed using K-Nearest Neighbors (KNN) with cosine similarity. To ensure factual grounding and reduce hallucination, the design incorporates Retrieval-Augmented Generation (RAG), complemented by fine-tuning and prompt engineering for domain-specific alignment. A continuously updated product knowledge base, maintained through automated web scraping using Firecrawl and workflow synchronization via n8n, supports real-time data accuracy. The proposed framework enables natural-language interaction and aims to provide reliable recommendations for devices such as smartphones, laptops, and wearables, offering a scalable and modular foundation for next-generation tech product advisory systems",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Finetune-RAG: Fine-Tuning Language Models to Resist Hallucination in Retrieval-Augmented Generation",
    "authors": [
      "Zhan Peng Lee",
      "A. Lin",
      "Calvin Tan"
    ],
    "year": "2025",
    "journal": "ArXiv",
    "doi": "10.48550/arXiv.2505.10792",
    "pmid": "",
    "abstract": "Retrieval-Augmented Generation (RAG) has emerged as a powerful framework to improve factuality in large language models (LLMs) by grounding their outputs in retrieved documents. However, ensuring perfect retrieval of relevant information remains challenging, and when irrelevant content is passed downstream to an LLM, it can lead to hallucinations. In this work, we propose Finetune-RAG, a simple and effective fine-tuning approach that features the first-of-its-kind RAG training dataset constructed to mimic real-world imperfections. Experimental results show that Finetune-RAG improves factual accuracy by 21.2% over the base model. We also propose Bench-RAG, an LLM-as-a-judge evaluation pipeline that stress tests models under realistic imperfect retrieval scenarios. Our codebase and dataset are fully open sourced for community use.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "RAGalyst: Automated Human-Aligned Agentic Evaluation for Domain-Specific RAG",
    "authors": [
      "Joshua Gao",
      "Quoc Huy Pham",
      "Subin Varghese",
      "Silwal Saurav",
      "Vedhus Hoskere"
    ],
    "year": "2025",
    "journal": "ArXiv",
    "doi": "10.48550/arXiv.2511.04502",
    "pmid": "",
    "abstract": "Retrieval-Augmented Generation (RAG) is a critical technique for grounding Large Language Models (LLMs) in factual evidence, yet evaluating RAG systems in specialized, safety-critical domains remains a significant challenge. Existing evaluation frameworks often rely on heuristic-based metrics that fail to capture domain-specific nuances and other works utilize LLM-as-a-Judge approaches that lack validated alignment with human judgment. This paper introduces RAGalyst, an automated, human-aligned agentic framework designed for the rigorous evaluation of domain-specific RAG systems. RAGalyst features an agentic pipeline that generates high-quality, synthetic question-answering (QA) datasets from source documents, incorporating an agentic filtering step to ensure data fidelity. The framework refines two key LLM-as-a-Judge metrics-Answer Correctness and Answerability-using prompt optimization to achieve a strong correlation with human annotations. Applying this framework to evaluate various RAG components across three distinct domains (military operations, cybersecurity, and bridge engineering), we find that performance is highly context-dependent. No single embedding model, LLM, or hyperparameter configuration proves universally optimal. Additionally, we provide an analysis on the most common low Answer Correctness reasons in RAG. These findings highlight the necessity of a systematic evaluation framework like RAGalyst, which empowers practitioners to uncover domain-specific trade-offs and make informed design choices for building reliable and effective RAG systems. RAGalyst is available on our Github.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "A Hybrid GNN-LLM Framework for Correlating Cybersecurity Incidents",
    "authors": [
      "Lina Baha",
      "Amine Mammasse",
      "Oualid Saci"
    ],
    "year": "2025",
    "journal": "2025 Fourth International Conference on Theoretical and Applicative Aspects of Computer Science (ICTAACS)",
    "doi": "10.1109/ICTAACS69003.2025.11399321",
    "pmid": "",
    "abstract": "High volumes of alerts from Intrusion Detection Systems (IDS) cause significant \"alert fatigue\" among security analysts, hindering the identification of genuine incidents. Existing automated correlation methods often lack the semantic context and explainability needed for effective response. This paper presents a hybrid framework that integrates Graph Neural Networks (GNNs) and Large Language Models (LLMs) to correlate and explain security alerts. The approach constructs an alert graph using hybrid node features that fuse structured data with semantic embeddings. A GraphSAGE model is trained for link prediction to identify correlated alerts. Crucially, the framework implements a feedback loop where the GNN\u2019s predictions serve as factual grounding for a Retrieval-Augmented Generation (RAG) module, producing human-readable justifications and actionable recommendations. Evaluated on the CIC-IDS 2017 dataset, the model achieves an Area Under the Curve (AUC) of 0.9731 and an accuracy of 90.02%. We demonstrate its ability to group alerts into coherent incidents, bridging the gap between automated detection and human-centric incident response.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Rational Synthesizers or Heuristic Followers? Analyzing LLMs in RAG-based Question-Answering",
    "authors": [
      "Atharv Naphade"
    ],
    "year": "2026",
    "journal": "ArXiv",
    "doi": "10.48550/arXiv.2601.06189",
    "pmid": "",
    "abstract": "Retrieval-Augmented Generation (RAG) is the prevailing paradigm for grounding Large Language Models (LLMs), yet the mechanisms governing how models integrate groups of conflicting retrieved evidence remain opaque. Does an LLM answer a certain way because the evidence is factually strong, because of a prior belief, or merely because it is repeated frequently? To answer this, we introduce GroupQA, a curated dataset of 1,635 controversial questions paired with 15,058 diversely-sourced evidence documents, annotated for stance and qualitative strength. Through controlled experiments, we characterize group-level evidence aggregation dynamics: Paraphrasing an argument can be more persuasive than providing distinct independent support; Models favor evidence presented first rather than last, and Larger models are increasingly resistant to adapt to presented evidence. Additionally, we find that LLM explanations to group-based answers are unfaithful. Together, we show that LLMs behave consistently as vulnerable heuristic followers, with direct implications for improving RAG system design.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Benchmarking Vector, Graph and Hybrid Retrieval Augmented Generation (RAG) Pipelines for Open Radio Access Networks (ORAN)",
    "authors": [
      "Sarat Ahmad",
      "Zeinab Nezami",
      "Maryam Hafeez",
      "S. A. R. Zaidi"
    ],
    "year": "2025",
    "journal": "2025 IEEE 36th International Symposium on Personal, Indoor and Mobile Radio Communications (PIMRC)",
    "doi": "10.1109/PIMRC62392.2025.11274810",
    "pmid": "",
    "abstract": "Generative AI (GenAI) is expected to play a pivotal role in enabling autonomous optimization in future wireless networks. Within the ORAN architecture, Large Language Models (LLMs) can be specialized to generate xApps and rApps by leveraging specifications and API definitions from the RAN Intelligent Controller (RIC) platform. However, fine-tuning base LLMs for telecom-specific tasks remains expensive and resource-intensive. Retrieval-Augmented Generation (RAG) offers a practical alternative through in-context learning, enabling domain adaptation without full retraining. While traditional RAG systems rely on vector-based retrieval, emerging variants such as GraphRAG and Hybrid GraphRAG incorporate knowledge graphs or dual retrieval strategies to support multi-hop reasoning and improve factual grounding. Despite their promise, these methods lack systematic, metric-driven evaluations, particularly in high-stakes domains such as ORAN. In this study, we conduct a comparative evaluation of Vector RAG, GraphRAG, and Hybrid GraphRAG using ORAN specifications. We assess performance across varying question complexities using established generation metrics: faithfulness, answer relevance, context relevance, and factual correctness. Results show that both GraphRAG and Hybrid GraphRAG outperform traditional RAG. Hybrid GraphRAG improves factual correctness by 8%, while GraphRAG improves context relevance by 11%.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Reviewing Clinical Knowledge in Medical Large Language Models: Training and Beyond",
    "authors": [
      "Qiyuan Li",
      "Haijiang Liu",
      "Caicai Guo",
      "Chao Gao",
      "Deyu Chen",
      "Meng Wang",
      "Feng Gao",
      "Frank van Harmelen",
      "Jinguang Gu"
    ],
    "year": "2025",
    "journal": "arXiv:2502.20988v2",
    "doi": "https://doi.org/10.1016/j.knosys.2025.114215",
    "abstract": "The large-scale development of large language models (LLMs) in medical contexts, such as diagnostic assistance and treatment recommendations, necessitates that these models possess accurate medical knowledge and deliver traceable decision-making processes. Clinical knowledge, encompassing the insights gained from research on the causes, prognosis, diagnosis, and treatment of diseases, has been extensively examined within real-world medical practices. Recently, there has been a notable increase in research efforts aimed at integrating this type of knowledge into LLMs, encompassing not only traditional text and multimodal data integration but also technologies such as knowledge graphs (KGs) and retrieval-augmented generation (RAG). In this paper, we review the various initiatives to embed clinical knowledge into training-based, KG-supported, and RAG-assisted LLMs. We begin by gathering reliable knowledge sources from the medical domain, including databases and datasets. Next, we evaluate implementations for integrating clinical knowledge through specialized datasets and collaborations with external knowledge sources such as KGs and relevant documentation. Furthermore, we discuss the applications of the developed medical LLMs in the industrial sector to assess the disparity between models developed in academic settings and those in industry. We conclude the survey by presenting evaluation systems applicable to relevant tasks and identifying potential challenges facing this field. In this review, we do not aim for completeness, since any ostensibly complete review would soon be outdated. Our goal is to illustrate diversity by selecting representative and accessible items from current research and industry practices, reflecting real-world situations rather than claiming completeness. Thus, we emphasize showcasing diverse approaches.",
    "source_database": "arxiv",
    "arxiv_id": "2502.20988v2"
  },
  {
    "title": "VeriCite: Towards Reliable Citations in Retrieval-Augmented Generation via Rigorous Verification",
    "authors": [
      "Haosheng Qian",
      "Yixing Fan",
      "Jiafeng Guo",
      "Ruqing Zhang",
      "Qi Chen",
      "Dawei Yin",
      "Xueqi Cheng"
    ],
    "year": "2025",
    "journal": "arXiv:2510.11394v1",
    "doi": "https://doi.org/10.1145/3767695.3769505",
    "abstract": "Retrieval-Augmented Generation (RAG) has emerged as a crucial approach for enhancing the responses of large language models (LLMs) with external knowledge sources. Despite the impressive performance in complex question-answering tasks, RAG still struggles with hallucinations. Attributing RAG-generated content through in-line citations has demonstrated potential in reducing hallucinations and facilitating human verification. Existing citation generation methods primarily rely on either fine-tuning the generator or employing post-processing approaches for citation matching. However, the former approach demands substantial annotated data and computational resources, while the latter often encounters difficulties in managing multiple citations and frequently produces suboptimal results. In this paper, we introduce a novel framework, called VeriCite, designed to rigorously validate supporting evidence and enhance answer attribution. Specifically, VeriCite breaks down into a three-stage generation: 1) The initial answer generation first generates a response based on all available contexts and has its claims verified through the NLI model; 2) the supporting evidence selection assesses the utility of each document and extracts useful supporting evidences; 3) the final answer refinement integrates the initial response and collected evidences to produce the final, refined answer.We conduct experiments across five open-source LLMs and four datasets, demonstrating that VeriCite can significantly improve citation quality while maintaining the correctness of the answers.",
    "source_database": "arxiv",
    "arxiv_id": "2510.11394v1"
  },
  {
    "title": "To Retrieve or Not to Retrieve? Uncertainty Detection for Dynamic Retrieval Augmented Generation",
    "authors": [
      "Kaustubh D. Dhole"
    ],
    "year": "2025",
    "journal": "arXiv:2501.09292v3",
    "doi": "",
    "abstract": "Retrieval-Augmented Generation equips large language models with the capability to retrieve external knowledge, thereby mitigating hallucinations by incorporating information beyond the model's intrinsic abilities. However, most prior works have focused on invoking retrieval deterministically, which makes it unsuitable for tasks such as long-form question answering. Instead, dynamically performing retrieval by invoking it only when the underlying LLM lacks the required knowledge can be more efficient. In this context, we delve deeper into the question, \"To Retrieve or Not to Retrieve?\" by exploring multiple uncertainty detection methods. We evaluate these methods for the task of long-form question answering, employing dynamic retrieval, and present our comparisons. Our findings suggest that uncertainty detection metrics, such as Degree Matrix Jaccard and Eccentricity, can reduce the number of retrieval calls by almost half, with only a slight reduction in question-answering accuracy.",
    "source_database": "arxiv",
    "arxiv_id": "2501.09292v3"
  },
  {
    "title": "The Geometry of Queries: Query-Based Innovations in Retrieval-Augmented Generation for Healthcare QA",
    "authors": [
      "Eric Yang",
      "Jonathan Amar",
      "Jong Ha Lee",
      "Bhawesh Kumar",
      "Yugang Jia"
    ],
    "year": "2024",
    "journal": "arXiv:2407.18044v2",
    "doi": "",
    "abstract": "Deploying Large Language Models (LLMs) for healthcare question answering requires robust methods to ensure accuracy and reliability. This work introduces Query-Based Retrieval Augmented Generation (QB-RAG), a framework for enhancing Retrieval-Augmented Generation (RAG) systems in healthcare question-answering by pre-aligning user queries with a database of curated, answerable questions derived from healthcare content. A key component of QB-RAG is an LLM-based filtering mechanism that ensures that only relevant and answerable questions are included in the database, enabling reliable reference query generation at scale. We provide theoretical motivation for QB-RAG, conduct a comparative analysis of existing retrieval enhancement techniques, and introduce a generalizable, comprehensive evaluation framework that assesses both the retrieval effectiveness and the quality of the generated response based on faithfulness, relevance, and adherence to the guideline. Our empirical evaluation on a healthcare data set demonstrates the superior performance of QB-RAG compared to existing retrieval methods, highlighting its practical value in building trustworthy digital health applications for health question-answering.",
    "source_database": "arxiv",
    "arxiv_id": "2407.18044v2"
  },
  {
    "title": "Hybrid-Code v2: Zero-Hallucination Clinical ICD-10 Coding via Neuro-Symbolic Verification and Automated Knowledge Base Expansion",
    "authors": [
      "Yunguo Yu"
    ],
    "year": "2025",
    "journal": "arXiv:2512.23743v2",
    "doi": "",
    "abstract": "Automated clinical ICD-10 coding is a high-impact healthcare task requiring a balance between coverage, precision, and safety. While neural approaches achieve strong performance, they suffer from hallucination-generating invalid or unsupported codes-posing unacceptable risks in safety-critical clinical settings. Rule-based systems eliminate hallucination but lack scalability and coverage due to manual knowledge base (KB) curation.   We present Hybrid-Code v2, a neuro-symbolic framework that achieves zero Type-I hallucination by construction while maintaining competitive coverage and precision. The system integrates neural candidate generation with a symbolic KB verification layer that enforces validity constraints through multi-layer verification, including format, evidence grounding, negation detection, temporal consistency, and exclusion rules. In addition, we introduce an automated KB expansion mechanism that extracts and validates coding patterns from unlabeled clinical text, addressing the scalability limitations of rule-based systems.   Evaluated on the MIMIC-III dataset against ClinicalBERT, BioBERT, rule-based systems, and GPT-4, Hybrid-Code v2 achieves 85% coverage, 92% precision, and 0% Type-I hallucination, outperforming rule-based systems by +40% coverage while eliminating hallucination observed in neural baselines (6-18%). The proposed architecture provides a formal safety guarantee for syntactic validity while preserving strong empirical performance.   These results demonstrate that neuro-symbolic verification can enforce safety constraints in neural medical AI systems without sacrificing effectiveness, offering a generalizable design pattern for deploying trustworthy AI in safety-critical domains.",
    "source_database": "arxiv",
    "arxiv_id": "2512.23743v2"
  },
  {
    "title": "Retrieval Augmented Thought Process for Private Data Handling in Healthcare",
    "authors": [
      "Thomas Pouplin",
      "Hao Sun",
      "Samuel Holt",
      "Mihaela van der Schaar"
    ],
    "year": "2024",
    "journal": "arXiv:2402.07812v2",
    "doi": "",
    "abstract": "Large Language Models (LLMs) have demonstrated the strong potential to assist both clinicians and the general public with their extensive medical knowledge. However, their application in healthcare is constrained due to concerns about the privacy of data used in training, which prevents the integration of private and personal information because of security and ethical issues. Moreover, if their capabilities can be enhanced with information retrieval to access up-to-date knowledge, the current integration of LLMs with Information retrieval lacks robustness to imperfect retrieval, which can hinder their effectiveness and even reduce overall performance. In this work, we address this challenge by introducing the Retrieval-Augmented Thought Process (RATP). Given access to external knowledge, RATP formulates the thought generation of LLMs as a multiple-step decision process. To optimise such a thought process, RATP leverages Monte-Carlo Tree Search and learns a proxy reward function that permits cost-efficient inference. On a private dataset of electronic medical records, deliberately excluded from any LLM training set, RATP achieves 35% additional accuracy compared to in-context retrieval-augmented generation for the question-answering task.",
    "source_database": "arxiv",
    "arxiv_id": "2402.07812v2"
  },
  {
    "title": "Explainable Depression Detection in Clinical Interviews with Personalized Retrieval-Augmented Generation",
    "authors": [
      "Linhai Zhang",
      "Ziyang Gao",
      "Deyu Zhou",
      "Yulan He"
    ],
    "year": "2025",
    "journal": "arXiv:2503.01315v1",
    "doi": "",
    "abstract": "Depression is a widespread mental health disorder, and clinical interviews are the gold standard for assessment. However, their reliance on scarce professionals highlights the need for automated detection. Current systems mainly employ black-box neural networks, which lack interpretability, which is crucial in mental health contexts. Some attempts to improve interpretability use post-hoc LLM generation but suffer from hallucination. To address these limitations, we propose RED, a Retrieval-augmented generation framework for Explainable depression Detection. RED retrieves evidence from clinical interview transcripts, providing explanations for predictions. Traditional query-based retrieval systems use a one-size-fits-all approach, which may not be optimal for depression detection, as user backgrounds and situations vary. We introduce a personalized query generation module that combines standard queries with user-specific background inferred by LLMs, tailoring retrieval to individual contexts. Additionally, to enhance LLM performance in social intelligence, we augment LLMs by retrieving relevant knowledge from a social intelligence datastore using an event-centric retriever. Experimental results on the real-world benchmark demonstrate RED's effectiveness compared to neural networks and LLM-based baselines.",
    "source_database": "arxiv",
    "arxiv_id": "2503.01315v1"
  },
  {
    "title": "PlainQAFact: Retrieval-augmented Factual Consistency Evaluation Metric for Biomedical Plain Language Summarization",
    "authors": [
      "Zhiwen You",
      "Yue Guo"
    ],
    "year": "2025",
    "journal": "arXiv:2503.08890v4",
    "doi": "https://doi.org/10.1016/j.jbi.2026.105019",
    "abstract": "Hallucinated outputs from large language models (LLMs) pose risks in the medical domain, especially for lay audiences making health-related decisions. Existing automatic factual consistency evaluation methods, such as entailment- and question-answering (QA) -based, struggle with plain language summarization (PLS) due to elaborative explanation phenomenon, which introduces external content (e.g., definitions, background, examples) absent from the scientific abstract to enhance comprehension. To address this, we introduce PlainQAFact, an automatic factual consistency evaluation metric trained on a fine-grained, human-annotated dataset PlainFact, for evaluating factual consistency of both source-simplified and elaborately explained sentences. PlainQAFact first classifies sentence type, then applies a retrieval-augmented QA scoring method. Empirical results show that existing evaluation metrics fail to evaluate the factual consistency in PLS, especially for elaborative explanations, whereas PlainQAFact consistently outperforms them across all evaluation settings. We further analyze PlainQAFact's effectiveness across external knowledge sources, answer extraction strategies, answer overlap measures, and document granularity levels, refining its overall factual consistency assessment. Taken together, our work presents a sentence-aware, retrieval-augmented metric targeted at elaborative explanations in biomedical PLS tasks, providing the community with both a new benchmark and a practical evaluation tool to advance reliable and safe plain language communication in the medical domain. PlainQAFact and PlainFact are available at: https://github.com/zhiwenyou103/PlainQAFact",
    "source_database": "arxiv",
    "arxiv_id": "2503.08890v4"
  },
  {
    "title": "P-RAG: Prompt-Enhanced Parametric RAG with LoRA and Selective CoT for Biomedical and Multi-Hop QA",
    "authors": [
      "Xingda Lyu",
      "Gongfu Lyu",
      "Zitai Yan",
      "Yuxin Jiang"
    ],
    "year": "2026",
    "journal": "arXiv:2602.15874v1",
    "doi": "https://doi.org/10.54254/2755-2721/2025.AST28253",
    "abstract": "Large Language Models (LLMs) demonstrate remarkable capabilities but remain limited by their reliance on static training data. Retrieval-Augmented Generation (RAG) addresses this constraint by retrieving external knowledge during inference, though it still depends heavily on knowledge base quality. To explore potential improvements, we evaluated three RAG variants-Standard RAG, DA-RAG, and our proposed Prompt-Enhanced Parametric RAG (P-RAG), a hybrid architecture that integrates parametric knowledge within the LLM and retrieved evidence, guided by Chain-of-Thought (CoT) prompting and Low-Rank Adaptation (LoRA) fine-tuning-on both general and biomedical datasets. Using LLaMA-3.2-1B-Instruct fine-tuned via LoRA, we evaluate on PubMedQA and 2WikiMultihopQA. P-RAG outperforms Standard RAG on PubMedQA by 10.47 percentage points in F1 (93.33% vs. 82.86%; 12.64% relative). On 2WikiMultihopQA, P-RAG nearly doubles the overall score vs. Standard RAG (33.44% vs. 17.83%) and achieves 44.03% on the Compare subset (with 42.74% Bridge, 21.84% Inference, 8.60% Compose). CoT prompting substantially improves multi-hop reasoning but yields mixed results for simpler, single-hop queries. These findings underscore P-RAG's potential for accurate, scalable, and contextually adaptive biomedical question answering. Our contributions include: (1) LoRA-based fine-tuning of LLaMA-3.2-1B-Instruct for biomedical QA, (2) introduction of P-RAG with Chain-of-Thought prompting, and (3) state-of-the-art results on PubMedQA and 2WikiMultihopQA.",
    "source_database": "arxiv",
    "arxiv_id": "2602.15874v1"
  },
  {
    "title": "GNN-RAG: Graph Neural Retrieval for Large Language Model Reasoning",
    "authors": [
      "Costas Mavromatis",
      "George Karypis"
    ],
    "year": "2024",
    "journal": "ArXiv",
    "doi": "10.48550/arXiv.2405.20139",
    "pmid": "",
    "abstract": "Knowledge Graphs (KGs) represent human-crafted factual knowledge in the form of triplets (head, relation, tail), which collectively form a graph. Question Answering over KGs (KGQA) is the task of answering natural questions grounding the reasoning to the information provided by the KG. Large Language Models (LLMs) are the state-of-the-art models for QA tasks due to their remarkable ability to understand natural language. On the other hand, Graph Neural Networks (GNNs) have been widely used for KGQA as they can handle the complex graph information stored in the KG. In this work, we introduce GNN-RAG, a novel method for combining language understanding abilities of LLMs with the reasoning abilities of GNNs in a retrieval-augmented generation (RAG) style. First, a GNN reasons over a dense KG subgraph to retrieve answer candidates for a given question. Second, the shortest paths in the KG that connect question entities and answer candidates are extracted to represent KG reasoning paths. The extracted paths are verbalized and given as input for LLM reasoning with RAG. In our GNN-RAG framework, the GNN acts as a dense subgraph reasoner to extract useful graph information, while the LLM leverages its natural language processing ability for ultimate KGQA. Furthermore, we develop a retrieval augmentation (RA) technique to further boost KGQA performance with GNN-RAG. Experimental results show that GNN-RAG achieves state-of-the-art performance in two widely used KGQA benchmarks (WebQSP and CWQ), outperforming or matching GPT-4 performance with a 7B tuned LLM. In addition, GNN-RAG excels on multi-hop and multi-entity questions outperforming competing approaches by 8.9--15.5% points at answer F1.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Synchronous Faithfulness Monitoring for Trustworthy Retrieval-Augmented Generation",
    "authors": [
      "Di Wu",
      "Jia-Chen Gu",
      "Fan Yin",
      "Nanyun Peng",
      "Kai-Wei Chang"
    ],
    "year": "2024",
    "journal": "arXiv:2406.13692v2",
    "doi": "",
    "abstract": "Retrieval-augmented language models (RALMs) have shown strong performance and wide applicability in knowledge-intensive tasks. However, there are significant trustworthiness concerns as RALMs are prone to generating unfaithful outputs, including baseless information or contradictions with the retrieved context. This paper proposes SynCheck, a lightweight monitor that leverages fine-grained decoding dynamics including sequence likelihood, uncertainty quantification, context influence, and semantic alignment to synchronously detect unfaithful sentences. By integrating efficiently measurable and complementary signals, SynCheck enables accurate and immediate feedback and intervention, achieving 0.85 AUROC in detecting faithfulness errors across six long-form retrieval-augmented generation tasks, improving prior best method by 4%. Leveraging SynCheck, we further introduce FOD, a faithfulness-oriented decoding algorithm guided by beam search for long-form retrieval-augmented generation. Empirical results demonstrate that FOD outperforms traditional strategies such as abstention, reranking, or contrastive decoding significantly in terms of faithfulness, achieving over 10% improvement across six datasets.",
    "source_database": "arxiv",
    "arxiv_id": "2406.13692v2"
  },
  {
    "title": "The Development and Evaluation of a Retrieval-Augmented Generation Large Language Model Virtual Assistant for Postoperative Instructions.",
    "authors": [
      "Haider SA",
      "Prabha S",
      "Gomez Cabello CA",
      "Genovese A",
      "Collaco B",
      "Wood N",
      "London J",
      "Bagaria S",
      "Tao C",
      "Forte AJ"
    ],
    "year": "2025",
    "journal": "Bioengineering (Basel, Switzerland)",
    "doi": "10.3390/bioengineering12111219",
    "pmid": "41301175",
    "abstract": "During postoperative recovery, patients and their caregivers often lack crucial information, leading to numerous repetitive inquiries that burden healthcare providers. Traditional discharge materials, including paper handouts and patient portals, are often static, overwhelming, or underutilized, leading to patient overwhelm and contributing to unnecessary ER visits and overall healthcare overutilization. Conversational chatbots offer a solution, but Natural Language Processing (NLP) systems are often inflexible and limited in understanding, while powerful Large Language Models (LLMs) are prone to generating \"hallucinations\".",
    "source_database": "pubmed"
  },
  {
    "title": "Reducing hallucination in structured outputs via Retrieval-Augmented Generation",
    "authors": [
      "Patrice B\u00e9chard",
      "Orlando Marquez Ayala"
    ],
    "year": "2024",
    "journal": "arXiv:2404.08189v1",
    "doi": "https://doi.org/10.18653/v1/2024.naacl-industry.19",
    "abstract": "A common and fundamental limitation of Generative AI (GenAI) is its propensity to hallucinate. While large language models (LLM) have taken the world by storm, without eliminating or at least reducing hallucinations, real-world GenAI systems may face challenges in user adoption. In the process of deploying an enterprise application that produces workflows based on natural language requirements, we devised a system leveraging Retrieval Augmented Generation (RAG) to greatly improve the quality of the structured output that represents such workflows. Thanks to our implementation of RAG, our proposed system significantly reduces hallucinations in the output and improves the generalization of our LLM in out-of-domain settings. In addition, we show that using a small, well-trained retriever encoder can reduce the size of the accompanying LLM, thereby making deployments of LLM-based systems less resource-intensive.",
    "source_database": "arxiv",
    "arxiv_id": "2404.08189v1"
  },
  {
    "title": "Enhancing Critical Thinking with AI: A Tailored Warning System for RAG Models",
    "authors": [
      "Xuyang Zhu",
      "Sejoon Chang",
      "Andrew Kuik"
    ],
    "year": "2025",
    "journal": "arXiv:2504.16883v1",
    "doi": "",
    "abstract": "Retrieval-Augmented Generation (RAG) systems offer a powerful approach to enhancing large language model (LLM) outputs by incorporating fact-checked, contextually relevant information. However, fairness and reliability concerns persist, as hallucinations can emerge at both the retrieval and generation stages, affecting users' reasoning and decision-making. Our research explores how tailored warning messages -- whose content depends on the specific context of hallucination -- shape user reasoning and actions in an educational quiz setting. Preliminary findings suggest that while warnings improve accuracy and awareness of high-level hallucinations, they may also introduce cognitive friction, leading to confusion and diminished trust in the system. By examining these interactions, this work contributes to the broader goal of AI-augmented reasoning: developing systems that actively support human reflection, critical thinking, and informed decision-making rather than passive information consumption.",
    "source_database": "arxiv",
    "arxiv_id": "2504.16883v1"
  },
  {
    "title": "Detecting Hallucination and Coverage Errors in Retrieval Augmented Generation for Controversial Topics",
    "authors": [
      "Tyler A. Chang",
      "Katrin Tomanek",
      "Jessica Hoffmann",
      "Nithum Thain",
      "Erin van Liemt",
      "Kathleen Meier-Hellstern",
      "Lucas Dixon"
    ],
    "year": "2024",
    "journal": "arXiv:2403.08904v1",
    "doi": "",
    "abstract": "We explore a strategy to handle controversial topics in LLM-based chatbots based on Wikipedia's Neutral Point of View (NPOV) principle: acknowledge the absence of a single true answer and surface multiple perspectives. We frame this as retrieval augmented generation, where perspectives are retrieved from a knowledge base and the LLM is tasked with generating a fluent and faithful response from the given perspectives. As a starting point, we use a deterministic retrieval system and then focus on common LLM failure modes that arise during this approach to text generation, namely hallucination and coverage errors. We propose and evaluate three methods to detect such errors based on (1) word-overlap, (2) salience, and (3) LLM-based classifiers. Our results demonstrate that LLM-based classifiers, even when trained only on synthetic errors, achieve high error detection performance, with ROC AUC scores of 95.3% for hallucination and 90.5% for coverage error detection on unambiguous error cases. We show that when no training data is available, our other methods still yield good results on hallucination (84.0%) and coverage error (85.2%) detection.",
    "source_database": "arxiv",
    "arxiv_id": "2403.08904v1"
  },
  {
    "title": "RAG System for Supporting Japanese Litigation Procedures: Faithful Response Generation Complying with Legal Norms",
    "authors": [
      "Yuya Ishihara",
      "Atsushi Keyaki",
      "Hiroaki Yamada",
      "Ryutaro Ohara",
      "Mihoko Sumida"
    ],
    "year": "2025",
    "journal": "arXiv:2511.22858v1",
    "doi": "",
    "abstract": "This study discusses the essential components that a Retrieval-Augmented Generation (RAG)-based LLM system should possess in order to support Japanese medical litigation procedures complying with legal norms. In litigation, expert commissioners, such as physicians, architects, accountants, and engineers, provide specialized knowledge to help judges clarify points of dispute. When considering the substitution of these expert roles with a RAG-based LLM system, the constraint of strict adherence to legal norms is imposed. Specifically, three requirements arise: (1) the retrieval module must retrieve appropriate external knowledge relevant to the disputed issues in accordance with the principle prohibiting the use of private knowledge, (2) the responses generated must originate from the context provided by the RAG and remain faithful to that context, and (3) the retrieval module must reference external knowledge with appropriate timestamps corresponding to the issues at hand. This paper discusses the design of a RAG-based LLM system that satisfies these requirements.",
    "source_database": "arxiv",
    "arxiv_id": "2511.22858v1"
  },
  {
    "title": "FIT-RAG: Black-Box RAG with Factual Information and Token Reduction",
    "authors": [
      "Yuren Mao",
      "Xuemei Dong",
      "Wenyi Xu",
      "Yunjun Gao",
      "Bin Wei",
      "Ying Zhang"
    ],
    "year": "2024",
    "journal": "arXiv:2403.14374v1",
    "doi": "",
    "abstract": "Due to the extraordinarily large number of parameters, fine-tuning Large Language Models (LLMs) to update long-tail or out-of-date knowledge is impractical in lots of applications. To avoid fine-tuning, we can alternatively treat a LLM as a black-box (i.e., freeze the parameters of the LLM) and augment it with a Retrieval-Augmented Generation (RAG) system, namely black-box RAG. Recently, black-box RAG has achieved success in knowledge-intensive tasks and has gained much attention. Existing black-box RAG methods typically fine-tune the retriever to cater to LLMs' preferences and concatenate all the retrieved documents as the input, which suffers from two issues: (1) Ignorance of Factual Information. The LLM preferred documents may not contain the factual information for the given question, which can mislead the retriever and hurt the effectiveness of black-box RAG; (2) Waste of Tokens. Simply concatenating all the retrieved documents brings large amounts of unnecessary tokens for LLMs, which degenerates the efficiency of black-box RAG. To address these issues, this paper proposes a novel black-box RAG framework which utilizes the factual information in the retrieval and reduces the number of tokens for augmentation, dubbed FIT-RAG. FIT-RAG utilizes the factual information by constructing a bi-label document scorer. Besides, it reduces the tokens by introducing a self-knowledge recognizer and a sub-document-level token reducer. FIT-RAG achieves both superior effectiveness and efficiency, which is validated by extensive experiments across three open-domain question-answering datasets: TriviaQA, NQ and PopQA. FIT-RAG can improve the answering accuracy of Llama2-13B-Chat by 14.3\\% on TriviaQA, 19.9\\% on NQ and 27.5\\% on PopQA, respectively. Furthermore, it can save approximately half of the tokens on average across the three datasets.",
    "source_database": "arxiv",
    "arxiv_id": "2403.14374v1"
  },
  {
    "title": "Enterprise GenAI: LLM Deployment on AWS",
    "authors": [
      "Sufiyan Shaikh"
    ],
    "year": "2026",
    "journal": "International Journal for Research in Applied Science and Engineering Technology",
    "doi": "10.22214/ijraset.2026.77762",
    "pmid": "",
    "abstract": "Generative AI and Large Language Models (LLMs) have transitioned from experimental prototypes to critical\nenterprise assets, requiring robust, scalable, and secure deployment frameworks. This paper presents a comprehensive survey of\nLLM deployment strategies on Amazon Web Services (AWS), focusing on the shift from consumer-grade to enterprise-ready\narchitectures. We analyze the AWS Generative AI stack, specifically comparing managed serverless approaches via Amazon\nBedrock with customizable infrastructure through Amazon SageMaker. The survey highlights key architectural patterns,\nincluding Retrieval-Augmented Generation (RAG) for grounding models in proprietary data and multi-agent systems for\ncomplex task orchestration. Furthermore, we examine the critical role of LLMOps in managing the model lifecycle, ensuring\nsecurity through Guardrails, and optimizing costs via quantization and provisioned throughput. By synthesizing real-world case\nstudies and performance metrics, this paper provides a scalable roadmap for organizations to implement production-grade\nGenerative AI solutions that maintain data sovereignty and operational excellence.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Principled Context Engineering for RAG: Statistical Guarantees via Conformal Prediction",
    "authors": [
      "Debashish Chakraborty",
      "Eugene Yang",
      "Daniel Khashabi",
      "Dawn J. Lawrie",
      "Kevin Duh"
    ],
    "year": "2025",
    "journal": "ArXiv",
    "doi": "10.1007/978-3-032-21300-6_45",
    "pmid": "",
    "abstract": "Retrieval-Augmented Generation (RAG) enhances factual grounding in large language models (LLMs) by incorporating retrieved evidence, but LLM accuracy declines when long or noisy contexts exceed the model's effective attention span. Existing pre-generation filters rely on heuristics or uncalibrated LLM confidence scores, offering no statistical control over retained evidence. We evaluate and demonstrate context engineering through conformal prediction, a coverage-controlled filtering framework that removes irrelevant content while preserving recall of supporting evidence. Using both embedding- and LLM-based scoring functions, we test this approach on the NeuCLIR and RAGTIME collections. Conformal filtering consistently meets its target coverage, ensuring that a specified fraction of relevant snippets are retained, and reduces retained context by 2-3x relative to unfiltered retrieval. On NeuCLIR, downstream factual accuracy measured by ARGUE F1 improves under strict filtering and remains stable at moderate coverage, indicating that most discarded material is redundant or irrelevant. These results demonstrate that conformal prediction enables reliable, coverage-controlled context reduction in RAG, offering a model-agnostic and principled approach to context engineering.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "GRACE-RAG: Graph Retrieval with Adaptive Chunk Extraction for Long-Context Question Answering",
    "authors": [
      "Tianwei Huang",
      "Shuai Lei",
      "Askar Hamdulla",
      "Chunxiao Gao",
      "Huaping Zhang"
    ],
    "year": "2026",
    "journal": "2026 International Conference on Communication Networks and Machine Learning (CNML)",
    "doi": "10.1109/CNML68938.2026.11452294",
    "pmid": "",
    "abstract": "Retrieval-augmented generation (RAG) improves factuality by grounding large language models (LLMs) on external corpora, but it still struggles with multi-hop reasoning and long-context overload. We propose GRACE-RAG, a two-stage framework that (i) builds a fine-grained chunk\u2013sentence\u2013entity graph via dynamic chunking during offline indexing, (ii) performs query-aware entity activation and personalized PageRank for associative retrieval online, and (iii) adaptively compresses retrieved evidence under a token budget before answer generation. Experiments on seven QA benchmarks show consistent gains over strong graph-based RAG baselines.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Design and Implementation of a RAG Chatbot System for Scientific Research Institutes",
    "authors": [
      "Igor Radulovi\u0107",
      "Jovana Mitri\u0107",
      "Katarina Kovijani\u0107",
      "Mija Ljuka",
      "Nejra Merdovi\u0107",
      "Mad\u017eida Hundur Hiyari",
      "A. Badnjevi\u0107"
    ],
    "year": "2026",
    "journal": "2026 30th International Conference on Information Technology (IT)",
    "doi": "10.1109/IT67293.2026.11435604",
    "pmid": "",
    "abstract": "This paper presents the design and implementation of a prototype chatbot system based on the Retrieval-Augmented Generation (RAG) architecture, applied in a scientific research institute to improve knowledge access. The system combines semantic search over a vector knowledge base with response generation using large language models, enabling contextually relevant institutional information. A case study was conducted to evaluate the prototype in a real-world environment. Results indicate improved factual grounding compared to an LLM-only baseline within the evaluated dataset, although the evaluation was limited to a small set of queries and a single institutional document collection.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "MedHallu: A Comprehensive Benchmark for Detecting Medical Hallucinations in Large Language Models",
    "authors": [
      "Shrey Pandit",
      "Jiawei Xu",
      "Junyuan Hong",
      "Zhangyang Wang",
      "Tianlong Chen",
      "Kaidi Xu",
      "Ying Ding"
    ],
    "year": "2025",
    "journal": "arXiv:2502.14302v1",
    "doi": "",
    "abstract": "Advancements in Large Language Models (LLMs) and their increasing use in medical question-answering necessitate rigorous evaluation of their reliability. A critical challenge lies in hallucination, where models generate plausible yet factually incorrect outputs. In the medical domain, this poses serious risks to patient safety and clinical decision-making. To address this, we introduce MedHallu, the first benchmark specifically designed for medical hallucination detection. MedHallu comprises 10,000 high-quality question-answer pairs derived from PubMedQA, with hallucinated answers systematically generated through a controlled pipeline. Our experiments show that state-of-the-art LLMs, including GPT-4o, Llama-3.1, and the medically fine-tuned UltraMedical, struggle with this binary hallucination detection task, with the best model achieving an F1 score as low as 0.625 for detecting \"hard\" category hallucinations. Using bidirectional entailment clustering, we show that harder-to-detect hallucinations are semantically closer to ground truth. Through experiments, we also show incorporating domain-specific knowledge and introducing a \"not sure\" category as one of the answer categories improves the precision and F1 scores by up to 38% relative to baselines.",
    "source_database": "arxiv",
    "arxiv_id": "2502.14302v1"
  },
  {
    "title": "Exploring Patient Perspectives, Engagement, and Output Quality in Doctor-Supervised Use of Artificial Intelligence During Informed Consent Consultation With ChatGPT and Retrieval Augmented Generation (RAG): Quantitative Exploratory Study.",
    "authors": [
      "Donner S",
      "Knauer P",
      "Kienzle A",
      "Dinneen J",
      "Burger J",
      "Perka C",
      "Donner S"
    ],
    "year": "2025",
    "journal": "Journal of medical Internet research",
    "doi": "10.2196/73717",
    "pmid": "41124695",
    "abstract": "Comprehensive preoperative education is essential for optimizing outcomes and ensuring informed consent in patients undergoing total hip arthroplasty (THA). Emerging artificial intelligence (AI) tools, such as ChatGPT, offer scalable support for patient education, but their clinical application requires rigorous evaluation to ensure accuracy, safety, and trust.",
    "source_database": "pubmed"
  },
  {
    "title": "Enhancing Large Language Models with Domain-specific Retrieval Augment Generation: A Case Study on Long-form Consumer Health Question Answering in Ophthalmology.",
    "authors": [
      "Gilson A",
      "Ai X",
      "Arunachalam T",
      "Chen Z",
      "Cheong KX",
      "Dave A",
      "Duic C",
      "Kibe M",
      "Kaminaka A",
      "Prasad M",
      "Siddig F",
      "Singer M",
      "Wong W",
      "Jin Q",
      "Keenan TDL",
      "Hu X",
      "Chew EY",
      "Lu Z",
      "Xu H",
      "Adelman RA",
      "Tham YC",
      "Chen Q"
    ],
    "year": "2024",
    "journal": "ArXiv",
    "doi": "",
    "pmid": "41031070",
    "abstract": "Despite the potential of Large Language Models (LLMs) in medicine, they may generate responses lacking supporting evidence or based on hallucinated evidence. While Retrieval Augment Generation (RAG) is popular to address this issue, few studies implemented and evaluated RAG in downstream domain-specific applications. We developed a RAG pipeline with ~70,000 ophthalmology-specific documents that retrieve relevant documents to augment LLMs during inference time. In a case study on long-form consumer health questions, we systematically evaluated the responses - including over 500 references - of LLMs with and without RAG on 100 questions with 10 healthcare professionals. The evaluation focuses on factuality of evidence, selection and ranking of evidence, attribution of evidence, and answer accuracy and completeness. LLMs without RAG provided 252 references in total. Of which, 45.3% hallucinated, 34.1% consisted of minor errors, and 20.6% were correct. In contrast, LLMs with RAG significantly improved accuracy (54.5% being correct) and reduced error rates (18.8% with minor hallucinations and 26.7% with errors). 62.5% of the top 10 documents retrieved by RAG were selected as the top references in the LLM response, with an average ranking of 4.9. The use of RAG also improved evidence attribution (increasing from 1.85 to 2.49 on a 5-point scale, P<0.001), albeit with slight decreases in accuracy (from 3.52 to 3.23, P=0.03) and completeness (from 3.47 to 3.27, P=0.17). The results demonstrate that LLMs frequently exhibited hallucinated and erroneous evidence in the responses, raising concerns for downstream applications in the medical domain. RAG substantially reduced the proportion of such evidence but encountered challenges. 
In contrast to existing studies, the results highlight that (1) LLMs may not select top-ranked documents by RAG, which results in hallucinated evidence remaining, (2) LLMs may miss top-ranked documents by RAG, and (3) irrelevant documents by RAG downgrade response accuracy and completeness, especially in challenging tasks such as long-form question answering. In conclusion, in long-form medical question answering, the RAG approach demonstrated improved effectiveness over non-RAG approach. Nevertheless, there are still challenges in evidence retrieval, selection, and attribution, highlighting the need for further development in domain-specific LLM and RAG techniques.",
    "source_database": "pubmed"
  },
  {
    "title": "Automated Literature Review Using NLP Techniques and LLM-Based Retrieval-Augmented Generation",
    "authors": [
      "Nurshat Fateh Ali",
      "Md. Mahdi Mohtasim",
      "Shakil Mosharrof",
      "T. Gopi Krishna"
    ],
    "year": "2024",
    "journal": "arXiv:2411.18583v1",
    "doi": "",
    "abstract": "This research presents and compares multiple approaches to automate the generation of literature reviews using several Natural Language Processing (NLP) techniques and retrieval-augmented generation (RAG) with a Large Language Model (LLM). The ever-increasing number of research articles provides a huge challenge for manual literature review. It has resulted in an increased demand for automation. Developing a system capable of automatically generating the literature reviews from only the PDF files as input is the primary objective of this research work. The effectiveness of several Natural Language Processing (NLP) strategies, such as the frequency-based method (spaCy), the transformer model (Simple T5), and retrieval-augmented generation (RAG) with Large Language Model (GPT-3.5-turbo), is evaluated to meet the primary objective. The SciTLDR dataset is chosen for this research experiment and three distinct techniques are utilized to implement three different systems for auto-generating the literature reviews. The ROUGE scores are used for the evaluation of all three systems. Based on the evaluation, the Large Language Model GPT-3.5-turbo achieved the highest ROUGE-1 score, 0.364. The transformer model comes in second place and spaCy is at the last position. Finally, a graphical user interface is created for the best system based on the large language model.",
    "source_database": "arxiv",
    "arxiv_id": "2411.18583v1"
  },
  {
    "title": "Utilizing Metadata for Better Retrieval-Augmented Generation",
    "authors": [
      "Raquib Bin Yousuf",
      "Shengzhe Xu",
      "Mandar Sharma",
      "Andrew Neeser",
      "Chris Latimer",
      "Naren Ramakrishnan"
    ],
    "year": "2026",
    "journal": "arXiv:2601.11863v1",
    "doi": "",
    "abstract": "Retrieval-Augmented Generation systems depend on retrieving semantically relevant document chunks to support accurate, grounded outputs from large language models. In structured and repetitive corpora such as regulatory filings, chunk similarity alone often fails to distinguish between documents with overlapping language. Practitioners often flatten metadata into input text as a heuristic, but the impact and trade-offs of this practice remain poorly understood. We present a systematic study of metadata-aware retrieval strategies, comparing plain-text baselines with approaches that embed metadata directly. Our evaluation spans metadata-as-text (prefix and suffix), a dual-encoder unified embedding that fuses metadata and content in a single index, dual-encoder late-fusion retrieval, and metadata-aware query reformulation. Across multiple retrieval metrics and question types, we find that prefixing and unified embeddings consistently outperform plain-text baselines, with the unified at times exceeding prefixing while being easier to maintain. Beyond empirical comparisons, we analyze embedding space, showing that metadata integration improves effectiveness by increasing intra-document cohesion, reducing inter-document confusion, and widening the separation between relevant and irrelevant chunks. Field-level ablations show that structural cues provide strong disambiguating signals. Our code, evaluation framework, and the RAGMATE-10K dataset are publicly hosted.",
    "source_database": "arxiv",
    "arxiv_id": "2601.11863v1"
  },
  {
    "title": "RAG Makes Guardrails Unsafe? Investigating Robustness of Guardrails under RAG-style Contexts",
    "authors": [
      "Yining She",
      "Daniel W. Peterson",
      "Marianne Menglin Liu",
      "Vikas Upadhyay",
      "Mohammad Hossein Chaghazardi",
      "Eunsuk Kang",
      "Dan Roth"
    ],
    "year": "2025",
    "journal": "arXiv:2510.05310v1",
    "doi": "",
    "abstract": "With the increasing adoption of large language models (LLMs), ensuring the safety of LLM systems has become a pressing concern. External LLM-based guardrail models have emerged as a popular solution to screen unsafe inputs and outputs, but they are themselves fine-tuned or prompt-engineered LLMs that are vulnerable to data distribution shifts. In this paper, taking Retrieval Augmentation Generation (RAG) as a case study, we investigated how robust LLM-based guardrails are against additional information embedded in the context. Through a systematic evaluation of 3 Llama Guards and 2 GPT-oss models, we confirmed that inserting benign documents into the guardrail context alters the judgments of input and output guardrails in around 11% and 8% of cases, making them unreliable. We separately analyzed the effect of each component in the augmented context: retrieved documents, user query, and LLM-generated response. The two mitigation methods we tested only bring minor improvements. These results expose a context-robustness gap in current guardrails and motivate training and evaluation protocols that are robust to retrieval and query composition.",
    "source_database": "arxiv",
    "arxiv_id": "2510.05310v1"
  },
  {
    "title": "Ask-EDA: A Design Assistant Empowered by LLM, Hybrid RAG and Abbreviation De-hallucination",
    "authors": [
      "Luyao Shi",
      "Michael Kazda",
      "Bradley Sears",
      "Nick Shropshire",
      "Ruchir Puri"
    ],
    "year": "2024",
    "journal": "arXiv:2406.06575v1",
    "doi": "",
    "abstract": "Electronic design engineers are challenged to find relevant information efficiently for a myriad of tasks within design construction, verification and technology development. Large language models (LLM) have the potential to help improve productivity by serving as conversational agents that effectively function as subject-matter experts. In this paper we demonstrate Ask-EDA, a chat agent designed to serve as a 24x7 expert available to provide guidance to design engineers. Ask-EDA leverages LLM, hybrid retrieval augmented generation (RAG) and abbreviation de-hallucination (ADH) techniques to deliver more relevant and accurate responses. We curated three evaluation datasets, namely q2a-100, cmds-100 and abbr-100. Each dataset is tailored to assess a distinct aspect: general design question answering, design command handling and abbreviation resolution. We demonstrated that hybrid RAG offers over a 40% improvement in Recall on the q2a-100 dataset and over a 60% improvement on the cmds-100 dataset compared to not using RAG, while ADH yields over a 70% enhancement in Recall on the abbr-100 dataset. The evaluation results show that Ask-EDA can effectively respond to design-related inquiries.",
    "source_database": "arxiv",
    "arxiv_id": "2406.06575v1"
  },
  {
    "title": "Enhancing LLM Factual Accuracy with RAG to Counter Hallucinations: A Case Study on Domain-Specific Queries in Private Knowledge-Bases",
    "authors": [
      "Jiarui Li",
      "Ye Yuan",
      "Zehua Zhang"
    ],
    "year": "2024",
    "journal": "arXiv:2403.10446v1",
    "doi": "",
    "abstract": "We proposed an end-to-end system design towards utilizing Retrieval Augmented Generation (RAG) to improve the factual accuracy of Large Language Models (LLMs) for domain-specific and time-sensitive queries related to private knowledge-bases. Our system integrates RAG pipeline with upstream datasets processing and downstream performance evaluation. Addressing the challenge of LLM hallucinations, we finetune models with a curated dataset which originates from CMU's extensive resources and annotated with the teacher model. Our experiments demonstrate the system's effectiveness in generating more accurate answers to domain-specific and time-sensitive inquiries. The results also revealed the limitations of fine-tuning LLMs with small-scale and skewed datasets. This research highlights the potential of RAG systems in augmenting LLMs with external datasets for improved performance in knowledge-intensive tasks. Our code and models are available on Github.",
    "source_database": "arxiv",
    "arxiv_id": "2403.10446v1"
  },
  {
    "title": "RAC: Retrieval-Augmented Clarification for Faithful Conversational Search",
    "authors": [
      "Ahmed Rayane Kebir",
      "Vincent Guigue",
      "Lynda Said Lhadj",
      "Laure Soulier"
    ],
    "year": "2026",
    "journal": "arXiv:2601.11722v1",
    "doi": "",
    "abstract": "Clarification questions help conversational search systems resolve ambiguous or underspecified user queries. While prior work has focused on fluency and alignment with user intent, especially through facet extraction, much less attention has been paid to grounding clarifications in the underlying corpus. Without such grounding, systems risk asking questions that cannot be answered from the available documents. We introduce RAC (Retrieval-Augmented Clarification), a framework for generating corpus-faithful clarification questions. After comparing several indexing strategies for retrieval, we fine-tune a large language model to make optimal use of research context and to encourage the generation of evidence-based question. We then apply contrastive preference optimization to favor questions supported by retrieved passages over ungrounded alternatives. Evaluated on four benchmarks, RAC demonstrate significant improvements over baselines. In addition to LLM-as-Judge assessments, we introduce novel metrics derived from NLI and data-to-text to assess how well questions are anchored in the context, and we demonstrate that our approach consistently enhances faithfulness.",
    "source_database": "arxiv",
    "arxiv_id": "2601.11722v1"
  },
  {
    "title": "Inference-Time Safety For Code LLMs Via Retrieval-Augmented Revision",
    "authors": [
      "Manisha Mukherjee",
      "Vincent J. Hellendoorn"
    ],
    "year": "2026",
    "journal": "arXiv:2603.01494v1",
    "doi": "",
    "abstract": "Large Language Models (LLMs) are increasingly deployed for code generation in high-stakes software development, yet their limited transparency in security reasoning and brittleness to evolving vulnerability patterns raise critical trustworthiness concerns. Models trained on static datasets cannot readily adapt to newly discovered vulnerabilities or changing security standards without retraining, leading to the repeated generation of unsafe code.   We present a principled approach to trustworthy code generation by design that operates as an inference-time safety mechanism. Our approach employs retrieval-augmented generation to surface relevant security risks in generated code and retrieve related security discussions from a curated Stack Overflow knowledge base, which are then used to guide an LLM during code revision. This design emphasizes three aspects relevant to trustworthiness: (1) interpretability, through transparent safety interventions grounded in expert community explanations; (2) robustness, by allowing adaptation to evolving security practices without model retraining; and (3) safety alignment, through real-time intervention before unsafe code reaches deployment.   Across real-world and benchmark datasets, our approach improves the security of LLM-generated code compared to prompting alone, while introducing no new vulnerabilities as measured by static analysis. These results suggest that principled, retrieval-augmented inference-time interventions can serve as a complementary mechanism for improving the safety of LLM-based code generation, and highlight the ongoing value of community knowledge in supporting trustworthy AI deployment.",
    "source_database": "arxiv",
    "arxiv_id": "2603.01494v1"
  },
  {
    "title": "Performance of Large Language Models on the Acute Coronary Syndrome Guidelines Using Retrieval-Augmented Generation.",
    "authors": [
      "Alexandrou M",
      "Kumar S",
      "Mahtani AU",
      "Strepkos D",
      "Carvalho PEP",
      "Mutlu D",
      "Ser OS",
      "Rempakos A",
      "Mastrodemos OC",
      "Rangan BV",
      "Jalli S",
      "Sandoval Y",
      "Brilakis ES"
    ],
    "year": "2025",
    "journal": "JACC. Cardiovascular interventions",
    "doi": "10.1016/j.jcin.2025.08.019",
    "pmid": "41161918",
    "abstract": "Large language models (LLMs) are increasingly applied in interventional cardiology, but hallucinations limit their clinical utility.",
    "source_database": "pubmed"
  },
  {
    "title": "Fact-Controlled Diagnosis of Hallucinations in Medical Text Summarization",
    "authors": [
      "Suhas BN",
      "Han-Chin Shing",
      "Lei Xu",
      "Mitch Strong",
      "Jon Burnsky",
      "Jessica Ofor",
      "Jordan R. Mason",
      "Susan Chen",
      "Sundararajan Srinivasan",
      "Chaitanya Shivade",
      "Jack Moriarty",
      "Joseph Paul Cohen"
    ],
    "year": "2025",
    "journal": "arXiv:2506.00448v1",
    "doi": "",
    "abstract": "Hallucinations in large language models (LLMs) during summarization of patient-clinician dialogues pose significant risks to patient care and clinical decision-making. However, the phenomenon remains understudied in the clinical domain, with uncertainty surrounding the applicability of general-domain hallucination detectors. The rarity and randomness of hallucinations further complicate their investigation. In this paper, we conduct an evaluation of hallucination detection methods in the medical domain, and construct two datasets for the purpose: A fact-controlled Leave-N-out dataset -- generated by systematically removing facts from source dialogues to induce hallucinated content in summaries; and a natural hallucination dataset -- arising organically during LLM-based medical summarization. We show that general-domain detectors struggle to detect clinical hallucinations, and that performance on fact-controlled hallucinations does not reliably predict effectiveness on natural hallucinations. We then develop fact-based approaches that count hallucinations, offering explainability not available with existing methods. Notably, our LLM-based detectors, which we developed using fact-controlled hallucinations, generalize well to detecting real-world clinical hallucinations. This research contributes a suite of specialized metrics supported by expert-annotated datasets to advance faithful clinical summarization systems.",
    "source_database": "arxiv",
    "arxiv_id": "2506.00448v1"
  },
  {
    "title": "Vendi-RAG: Adaptively Trading-Off Diversity And Quality Significantly Improves Retrieval Augmented Generation With LLMs",
    "authors": [
      "Mohammad Reza Rezaei",
      "Adji Bousso Dieng"
    ],
    "year": "2025",
    "journal": "arXiv:2502.11228v2",
    "doi": "",
    "abstract": "Retrieval-augmented generation (RAG) enhances large language models (LLMs) for domain-specific question-answering (QA) tasks by leveraging external knowledge sources. However, traditional RAG systems primarily focus on relevance-based retrieval and often struggle with redundancy, especially when reasoning requires connecting information from multiple sources. This paper introduces Vendi-RAG, a framework based on an iterative process that jointly optimizes retrieval diversity and answer quality. This joint optimization leads to significantly higher accuracy for multi-hop QA tasks. Vendi-RAG leverages the Vendi Score (VS), a flexible similarity-based diversity metric, to promote semantic diversity in document retrieval. It then uses an LLM judge that evaluates candidate answers, generated after a reasoning step, and outputs a score that the retriever uses to balance relevance and diversity among the retrieved documents during each iteration. Experiments on three challenging datasets -- HotpotQA, MuSiQue, and 2WikiMultiHopQA -- demonstrate Vendi-RAG's effectiveness in multi-hop reasoning tasks. The framework achieves significant accuracy improvements over traditional single-step and multi-step RAG approaches, with accuracy increases reaching up to +4.2% on HotpotQA, +4.1% on 2WikiMultiHopQA, and +1.3% on MuSiQue compared to Adaptive-RAG, the current best baseline. The benefits of Vendi-RAG are even more pronounced as the number of retrieved documents increases. Finally, we evaluated Vendi-RAG across different LLM backbones, including GPT-3.5, GPT-4, and GPT-4o-mini, and observed consistent improvements, demonstrating that the framework's advantages are model-agnostic.",
    "source_database": "arxiv",
    "arxiv_id": "2502.11228v2"
  },
  {
    "title": "PrismRAG: Boosting RAG Factuality with Distractor Resilience and Strategized Reasoning",
    "authors": [
      "Mohammad Kachuee",
      "Teja Gollapudi",
      "Minseok Kim",
      "Yin Huang",
      "Kai Sun",
      "Xiao Yang",
      "Jiaqi Wang",
      "Nirav Shah",
      "Yue Liu",
      "Aaron Colak",
      "Anuj Kumar",
      "Wen-tau Yih",
      "Xin Luna Dong"
    ],
    "year": "2025",
    "journal": "arXiv:2507.18857v1",
    "doi": "",
    "abstract": "Retrieval-augmented generation (RAG) often falls short when retrieved context includes confusing semi-relevant passages, or when answering questions require deep contextual understanding and reasoning. We propose an efficient fine-tuning framework, called PrismRAG, that (i) trains the model with distractor-aware QA pairs mixing gold evidence with subtle distractor passages, and (ii) instills reasoning-centric habits that make the LLM plan, rationalize, and synthesize without relying on extensive human engineered instructions. Evaluated across 12 open-book RAG QA benchmarks spanning diverse application domains and scenarios, PrismRAG improves average factuality by 5.4%, outperforming state-of-the-art solutions.",
    "source_database": "arxiv",
    "arxiv_id": "2507.18857v1"
  },
  {
    "title": "From LLM Reasoning to Autonomous AI Agents: A Comprehensive Review",
    "authors": [
      "M. Ferrag",
      "N. Tihanyi",
      "M. Debbah"
    ],
    "year": "2025",
    "journal": "ArXiv",
    "doi": "10.48550/arXiv.2504.19678",
    "pmid": "",
    "abstract": "Large language models and autonomous AI agents have evolved rapidly, resulting in a diverse array of evaluation benchmarks, frameworks, and collaboration protocols. Driven by the growing need for standardized evaluation and integration, we systematically consolidate these fragmented efforts into a unified framework. However, the landscape remains fragmented and lacks a unified taxonomy or comprehensive survey. Therefore, we present a side-by-side comparison of benchmarks developed between 2019 and 2025 that evaluate these models and agents across multiple domains. In addition, we propose a taxonomy of approximately 60 benchmarks that cover general and academic knowledge reasoning, mathematical problem-solving, code generation and software engineering, factual grounding and retrieval, domain-specific evaluations, multimodal and embodied tasks, task orchestration, and interactive assessments. Furthermore, we review AI-agent frameworks introduced between 2023 and 2025 that integrate large language models with modular toolkits to enable autonomous decision-making and multi-step reasoning. Moreover, we present real-world applications of autonomous AI agents in materials science, biomedical research, academic ideation, software engineering, synthetic data generation, chemical reasoning, mathematical problem-solving, geographic information systems, multimedia, healthcare, and finance. We then survey key agent-to-agent collaboration protocols, namely the Agent Communication Protocol (ACP), the Model Context Protocol (MCP), and the Agent-to-Agent Protocol (A2A). Finally, we discuss recommendations for future research, focusing on advanced reasoning strategies, failure modes in multi-agent LLM systems, automated scientific discovery, dynamic tool integration via reinforcement learning, integrated search capabilities, and security vulnerabilities in agent protocols.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "EVOR: Evolving Retrieval for Code Generation",
    "authors": [
      "Hongjin Su",
      "Shuyang Jiang",
      "Yuhang Lai",
      "Haoyuan Wu",
      "Boao Shi",
      "Che Liu",
      "Qian Liu",
      "Tao Yu"
    ],
    "year": "2024",
    "journal": "arXiv:2402.12317v2",
    "doi": "",
    "abstract": "Recently the retrieval-augmented generation (RAG) has been successfully applied in code generation. However, existing pipelines for retrieval-augmented code generation (RACG) employ static knowledge bases with a single source, limiting the adaptation capabilities of Large Language Models (LLMs) to domains they have insufficient knowledge of. In this work, we develop a novel pipeline, EVOR, that employs the synchronous evolution of both queries and diverse knowledge bases. On two realistic settings where the external knowledge is required to solve code generation tasks, we compile four new datasets associated with frequently updated libraries and long-tail programming languages, named EVOR-BENCH. Extensive experiments demonstrate that EVOR achieves two to four times of execution accuracy compared to other methods such as Reflexion (Shinn et al., 2024), DocPrompting (Zhou et al., 2023), etc. We demonstrate that EVOR is flexible and can be easily combined with them to achieve further improvement. Further analysis reveals that EVOR benefits from the synchronous evolution of queries and documents and the diverse information sources in the knowledge base. We hope that our studies will inspire more insights into the design of advanced RACG pipelines in future research. Our model, code, and data are available at https://arks-codegen.github.io.",
    "source_database": "arxiv",
    "arxiv_id": "2402.12317v2"
  },
  {
    "title": "Ragas: Automated Evaluation of Retrieval Augmented Generation",
    "authors": [
      "Shahul Es",
      "Jithin James",
      "Luis Espinosa-Anke",
      "Steven Schockaert"
    ],
    "year": "2023",
    "journal": "arXiv:2309.15217v2",
    "doi": "",
    "abstract": "We introduce Ragas (Retrieval Augmented Generation Assessment), a framework for reference-free evaluation of Retrieval Augmented Generation (RAG) pipelines. RAG systems are composed of a retrieval and an LLM based generation module, and provide LLMs with knowledge from a reference textual database, which enables them to act as a natural language layer between a user and textual databases, reducing the risk of hallucinations. Evaluating RAG architectures is, however, challenging because there are several dimensions to consider: the ability of the retrieval system to identify relevant and focused context passages, the ability of the LLM to exploit such passages in a faithful way, or the quality of the generation itself. With Ragas, we put forward a suite of metrics which can be used to evaluate these different dimensions \\textit{without having to rely on ground truth human annotations}. We posit that such a framework can crucially contribute to faster evaluation cycles of RAG architectures, which is especially important given the fast adoption of LLMs.",
    "source_database": "arxiv",
    "arxiv_id": "2309.15217v2"
  },
  {
    "title": "IGMiRAG: Intuition-Guided Retrieval-Augmented Generation with Adaptive Mining of In-Depth Memory",
    "authors": [
      "Xingliang Hou",
      "Yuyan Liu",
      "Qi Sun",
      "haoxiu wang",
      "Hao Hu",
      "Shaoyi Du",
      "Zhiqiang Tian"
    ],
    "year": "2026",
    "journal": "arXiv:2602.07525v1",
    "doi": "",
    "abstract": "Retrieval-augmented generation (RAG) equips large language models (LLMs) with reliable knowledge memory. To strengthen cross-text associations, recent research integrates graphs and hypergraphs into RAG to capture pairwise and multi-entity relations as structured links. However, their misaligned memory organization necessitates costly, disjointed retrieval. To address these limitations, we propose IGMiRAG, a framework inspired by human intuition-guided reasoning. It constructs a hierarchical heterogeneous hypergraph to align multi-granular knowledge, incorporating deductive pathways to simulate realistic memory structures. During querying, IGMiRAG distills intuitive strategies via a question parser to control mining depth and memory window, and activates instantaneous memories as anchors using dual-focus retrieval. Mirroring human intuition, the framework guides retrieval resource allocation dynamically. Furthermore, we design a bidirectional diffusion algorithm that navigates deductive paths to mine in-depth memories, emulating human reasoning processes. Extensive evaluations indicate IGMiRAG outperforms the state-of-the-art baseline by 4.8% EM and 5.0% F1 overall, with token costs adapting to task complexity (average 6.3k+, minimum 3.0k+). This work presents a cost-effective RAG paradigm that improves both efficiency and effectiveness.",
    "source_database": "arxiv",
    "arxiv_id": "2602.07525v1"
  },
  {
    "title": "Reconstructing Context: Evaluating Advanced Chunking Strategies for Retrieval-Augmented Generation",
    "authors": [
      "Carlo Merola",
      "Jaspinder Singh"
    ],
    "year": "2025",
    "journal": "arXiv:2504.19754v1",
    "doi": "",
    "abstract": "Retrieval-augmented generation (RAG) has become a transformative approach for enhancing large language models (LLMs) by grounding their outputs in external knowledge sources. Yet, a critical question persists: how can vast volumes of external knowledge be managed effectively within the input constraints of LLMs? Traditional methods address this by chunking external documents into smaller, fixed-size segments. While this approach alleviates input limitations, it often fragments context, resulting in incomplete retrieval and diminished coherence in generation. To overcome these shortcomings, two advanced techniques, late chunking and contextual retrieval, have been introduced, both aiming to preserve global context. Despite their potential, their comparative strengths and limitations remain unclear. This study presents a rigorous analysis of late chunking and contextual retrieval, evaluating their effectiveness and efficiency in optimizing RAG systems. Our results indicate that contextual retrieval preserves semantic coherence more effectively but requires greater computational resources. In contrast, late chunking offers higher efficiency but tends to sacrifice relevance and completeness.",
    "source_database": "arxiv",
    "arxiv_id": "2504.19754v1"
  },
  {
    "title": "Blended RAG: Improving RAG (Retriever-Augmented Generation) Accuracy with Semantic Search and Hybrid Query-Based Retrievers",
    "authors": [
      "Kunal Sawarkar",
      "Abhilasha Mangal",
      "Shivam Raj Solanki"
    ],
    "year": "2024",
    "journal": "arXiv:2404.07220v2",
    "doi": "https://doi.org/10.1109/MIPR62202.2024.00031",
    "abstract": "Retrieval-Augmented Generation (RAG) is a prevalent approach to infuse a private knowledge base of documents with Large Language Models (LLM) to build Generative Q\\&A (Question-Answering) systems. However, RAG accuracy becomes increasingly challenging as the corpus of documents scales up, with Retrievers playing an outsized role in the overall RAG accuracy by extracting the most relevant document from the corpus to provide context to the LLM. In this paper, we propose the 'Blended RAG' method of leveraging semantic search techniques, such as Dense Vector indexes and Sparse Encoder indexes, blended with hybrid query strategies. Our study achieves better retrieval results and sets new benchmarks for IR (Information Retrieval) datasets like NQ and TREC-COVID datasets. We further extend such a 'Blended Retriever' to the RAG system to demonstrate far superior results on Generative Q\\&A datasets like SQUAD, even surpassing fine-tuning performance.",
    "source_database": "arxiv",
    "arxiv_id": "2404.07220v2"
  },
  {
    "title": "Expert Mind: A Retrieval-Augmented Architecture for Expert Knowledge Preservation in the Energy Sector",
    "authors": [
      "Diego Ezequiel Cervera"
    ],
    "year": "2026",
    "journal": "arXiv:2603.14541v1",
    "doi": "",
    "abstract": "The departure of subject-matter experts from industrial organizations results in the irreversible loss of tacit knowledge that is rarely captured through conventional documentation practices. This paper proposes Expert Mind, an experimental system that leverages Retrieval-Augmented Generation (RAG), large language models (LLMs), and multimodal capture techniques to preserve, structure, and make queryable the deep expertise of organizational knowledge holders. Drawing on the specific context of the energy sector, where decades of operational experience risk being lost to an aging workforce, we describe the system architecture, processing pipeline, ethical framework, and evaluation methodology. The proposed system addresses the knowledge elicitation problem through structured interviews, think-aloud sessions, and text corpus ingestion, which are subsequently embedded into a vector store and queried through a conversational interface. Preliminary design considerations suggest Expert Mind can significantly reduce knowledge transfer latency and improve onboarding efficiency. Ethical dimensions including informed consent, intellectual property, and the right to erasure are addressed as first-class design constraints.",
    "source_database": "arxiv",
    "arxiv_id": "2603.14541v1"
  },
  {
    "title": "RAG-Star: Enhancing Deliberative Reasoning with Retrieval Augmented Verification and Refinement",
    "authors": [
      "Jinhao Jiang",
      "Jiayi Chen",
      "Junyi Li",
      "Ruiyang Ren",
      "Shijie Wang",
      "Wayne Xin Zhao",
      "Yang Song",
      "Tao Zhang"
    ],
    "year": "2024",
    "journal": "arXiv:2412.12881v1",
    "doi": "",
    "abstract": "Existing large language models (LLMs) show exceptional problem-solving capabilities but might struggle with complex reasoning tasks. Despite the successes of chain-of-thought and tree-based search methods, they mainly depend on the internal knowledge of LLMs to search over intermediate reasoning steps, limited to dealing with simple tasks involving fewer reasoning steps. In this paper, we propose \\textbf{RAG-Star}, a novel RAG approach that integrates the retrieved information to guide the tree-based deliberative reasoning process that relies on the inherent knowledge of LLMs. By leveraging Monte Carlo Tree Search, RAG-Star iteratively plans intermediate sub-queries and answers for reasoning based on the LLM itself. To consolidate internal and external knowledge, we propose an retrieval-augmented verification that utilizes query- and answer-aware reward modeling to provide feedback for the inherent reasoning of LLMs. Our experiments involving Llama-3.1-8B-Instruct and GPT-4o demonstrate that RAG-Star significantly outperforms previous RAG and reasoning methods.",
    "source_database": "arxiv",
    "arxiv_id": "2412.12881v1"
  },
  {
    "title": "T-RAG: Lessons from the LLM Trenches",
    "authors": [
      "Masoomali Fatehkia",
      "Ji Kim Lucas",
      "Sanjay Chawla"
    ],
    "year": "2024",
    "journal": "arXiv:2402.07483v2",
    "doi": "",
    "abstract": "Large Language Models (LLM) have shown remarkable language capabilities fueling attempts to integrate them into applications across a wide range of domains. An important application area is question answering over private enterprise documents where the main considerations are data security, which necessitates applications that can be deployed on-prem, limited computational resources and the need for a robust application that correctly responds to queries. Retrieval-Augmented Generation (RAG) has emerged as the most prominent framework for building LLM-based applications. While building a RAG is relatively straightforward, making it robust and a reliable application requires extensive customization and relatively deep knowledge of the application domain. We share our experiences building and deploying an LLM application for question answering over private organizational documents. Our application combines the use of RAG with a finetuned open-source LLM. Additionally, our system, which we call Tree-RAG (T-RAG), uses a tree structure to represent entity hierarchies within the organization. This is used to generate a textual description to augment the context when responding to user queries pertaining to entities within the organization's hierarchy. Our evaluations, including a Needle in a Haystack test, show that this combination performs better than a simple RAG or finetuning implementation. Finally, we share some lessons learned based on our experiences building an LLM application for real-world use.",
    "source_database": "arxiv",
    "arxiv_id": "2402.07483v2"
  },
  {
    "title": "Collab-RAG: Boosting Retrieval-Augmented Generation for Complex Question Answering via White-Box and Black-Box LLM Collaboration",
    "authors": [
      "Ran Xu",
      "Wenqi Shi",
      "Yuchen Zhuang",
      "Yue Yu",
      "Joyce C. Ho",
      "Haoyu Wang",
      "Carl Yang"
    ],
    "year": "2025",
    "journal": "arXiv:2504.04915v1",
    "doi": "",
    "abstract": "Retrieval-Augmented Generation (RAG) systems often struggle to handle multi-hop question-answering tasks accurately due to irrelevant context retrieval and limited complex reasoning capabilities. We introduce Collab-RAG, a collaborative training framework that leverages mutual enhancement between a white-box small language model (SLM) and a blackbox large language model (LLM) for RAG. Specifically, the SLM decomposes complex queries into simpler sub-questions, thus enhancing the accuracy of the retrieval and facilitating more effective reasoning by the black-box LLM. Concurrently, the black-box LLM provides feedback signals to improve the SLM's decomposition capability. We observe that Collab-RAG relies solely on supervision from an affordable black-box LLM without additional distillation from frontier LLMs, yet demonstrates strong generalization across multiple black-box LLMs. Experimental evaluations across five multi-hop QA datasets demonstrate that Collab-RAG substantially outperforms existing black-box-only and SLM fine-tuning baselines by 1.8%-14.2% on average. In particular, our fine-tuned 3B SLM surpasses a frozen 32B LLM in question decomposition, highlighting the efficiency of Collab-RAG in improving reasoning and retrieval for complex questions. The code of Collab-RAG is available on https://github.com/ritaranx/Collab-RAG/.",
    "source_database": "arxiv",
    "arxiv_id": "2504.04915v1"
  },
  {
    "title": "RAG over Tables: Hierarchical Memory Index, Multi-Stage Retrieval, and Benchmarking",
    "authors": [
      "Jiaru Zou",
      "Dongqi Fu",
      "Sirui Chen",
      "Xinrui He",
      "Zihao Li",
      "Yada Zhu",
      "Jiawei Han",
      "Jingrui He"
    ],
    "year": "2025",
    "journal": "arXiv:2504.01346v4",
    "doi": "",
    "abstract": "Retrieval-Augmented Generation (RAG) enhances Large Language Models (LLMs) by integrating them with an external knowledge base to improve the answer relevance and accuracy. In real-world scenarios, beyond pure text, a substantial amount of knowledge is stored in tables, and user questions often require retrieving answers that are distributed across multiple tables. Retrieving knowledge from a table corpora (i.e., various individual tables) for a question remains nascent, at least, for (i) how to understand intra- and inter-table knowledge effectively, (ii) how to filter unnecessary tables and how to retrieve the most relevant tables efficiently, (iii) how to prompt LLMs to infer over the retrieval, (iv) how to evaluate the corresponding performance in a realistic setting. Facing the above challenges, in this paper, we first propose a table-corpora-aware RAG framework, named T-RAG, which consists of the hierarchical memory index, multi-stage retrieval, and graph-aware prompting for effective and efficient table knowledge retrieval and inference. Further, we first develop a multi-table question answering benchmark named MultiTableQA, which spans 3 different task types, 57,193 tables, and 23,758 questions in total, and the sources are all from real-world scenarios. Based on MultiTableQA, we did the holistic comparison over table retrieval methods, RAG methods, and table-to-graph representation learning methods, where T-RAG shows the leading accuracy, recall, and running time performance. Also, under T-RAG, we evaluate the inference ability upgrade of different LLMs. Code and Data are available at https://github.com/jiaruzouu/T-RAG",
    "source_database": "arxiv",
    "arxiv_id": "2504.01346v4"
  },
  {
    "title": "Telco-RAG: Navigating the Challenges of Retrieval-Augmented Language Models for Telecommunications",
    "authors": [
      "Andrei-Laurentiu Bornea",
      "Fadhel Ayed",
      "Antonio De Domenico",
      "Nicola Piovesan",
      "Ali Maatouk"
    ],
    "year": "2024",
    "journal": "arXiv:2404.15939v3",
    "doi": "",
    "abstract": "The application of Large Language Models (LLMs) and Retrieval-Augmented Generation (RAG) systems in the telecommunication domain presents unique challenges, primarily due to the complex nature of telecom standard documents and the rapid evolution of the field. The paper introduces Telco-RAG, an open-source RAG framework designed to handle the specific needs of telecommunications standards, particularly 3rd Generation Partnership Project (3GPP) documents. Telco-RAG addresses the critical challenges of implementing a RAG pipeline on highly technical content, paving the way for applying LLMs in telecommunications and offering guidelines for RAG implementation in other technical domains.",
    "source_database": "arxiv",
    "arxiv_id": "2404.15939v3"
  },
  {
    "title": "LLMForum-RAG: A Multilingual, Multi-domain Framework for Factual Reasoning via Weighted Retrieval and LLM Collaboration",
    "authors": [
      "Soham Chaudhuri",
      "Dipanjan Saha",
      "Dipankar Das"
    ],
    "year": "2025",
    "journal": "",
    "doi": "10.18653/v1/2025.findings-ijcnlp.88",
    "pmid": "",
    "abstract": "LLMs have emerged as a transformative technology, enabling a wide range of tasks such as text generation, summarization, question answering, and more. The use of RAG with LLM is on the rise to provide deeper knowledge bases of various domains. In the present study, we propose a RAG framework that employs weighted Rocchio mechanism for retrieval and LLM collaborative forum with supervision for generation. Our framework is evaluated in two downstream tasks: a biomedical question answering (BioASQ-QA) and a multilingual claim verification (e.g. in English, Hindi, and Bengali) to showcase its adaptability across various domains and languages. The proposed retriever is capable to achieve substantial improvement over BM25 of +8% (BioASQ-QA), +15% (English), +5% (Hindi), and +20% (Bengali) for Recall@5. In veracity classification, our framework achieves an average answer correctness of 0.78 on BioASQ-QA while achieving F1-score of 0.59, 0.56, and 0.41 for English, Hindi and Bengali languages, respectively. These results demonstrate the effectiveness and robustness of our framework for retrieval and generation in multilingual and multi-domain settings.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Retrieval-Augmented Multi-LLM Ensemble for Industrial Part Specification Extraction",
    "authors": [
      "Muzakkiruddin Ahmed Mohammed",
      "John R. Talburt",
      "Leon Claasssens",
      "Adriaan Marais"
    ],
    "year": "2025",
    "journal": "2025 17th International Conference on Knowledge and System Engineering (KSE)",
    "doi": "10.1109/KSE68178.2025.11309590",
    "pmid": "",
    "abstract": "Industrial part specification extraction from unstructured text remains a persistent challenge in manufacturing, procurement, and maintenance, where manual processing is both time-consuming and error-prone. This paper introduces RAGsemble, a retrieval-augmented multi-LLM ensemble framework that orchestrates nine state-of-the-art Large Language Models (LLMs) within a structured three-phase pipeline. RAGsemble addresses key limitations of single-model systems by combining the complementary strengths of model families including Gemini (2.0, 2.5, 1.5), OpenAI (GPT-4o, o4-mini), Mistral Large, and Gemma (1B, 4B, 3n-e4b), while grounding outputs in factual data using FAISS-based semantic retrieval. The system architecture consists of three stages: (1) parallel extraction by diverse LLMs, (2) targeted research augmentation leveraging high-performing models, and (3) intelligent synthesis with conflict resolution and confidence-aware scoring. RAG integration provides real-time access to structured part databases, enabling the system to validate, refine, and enrich outputs through similarity-based reference retrieval. Experimental results using real industrial datasets demonstrate significant gains in extraction accuracy, technical completeness, and structured output quality compared to leading single-LLM baselines. Key contributions include a scalable ensemble architecture for industrial domains, seamless RAG integration throughout the pipeline, comprehensive quality assessment mechanisms, and a production-ready solution suitable for deployment in knowledge-intensive manufacturing environments.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Neural Probe-Based Hallucination Detection for Large Language Models",
    "authors": [
      "Shize Liang",
      "Hongzhi Wang"
    ],
    "year": "2025",
    "journal": "arXiv:2512.20949v1",
    "doi": "",
    "abstract": "Large language models(LLMs) excel at text generation and knowledge question-answering tasks, but they are prone to generating hallucinated content, severely limiting their application in high-risk domains. Current hallucination detection methods based on uncertainty estimation and external knowledge retrieval suffer from the limitation that they still produce erroneous content at high confidence levels and rely heavily on retrieval efficiency and knowledge coverage. In contrast, probe methods that leverage the model's hidden-layer states offer real-time and lightweight advantages. However, traditional linear probes struggle to capture nonlinear structures in deep semantic spaces.To overcome these limitations, we propose a neural network-based framework for token-level hallucination detection. By freezing language model parameters, we employ lightweight MLP probes to perform nonlinear modeling of high-level hidden states. A multi-objective joint loss function is designed to enhance detection stability and semantic disambiguity. Additionally, we establish a layer position-probe performance response model, using Bayesian optimization to automatically search for optimal probe insertion layers and achieve superior training results.Experimental results on LongFact, HealthBench, and TriviaQA demonstrate that MLP probes significantly outperform state-of-the-art methods in accuracy, recall, and detection capability under low false-positive conditions.",
    "source_database": "arxiv",
    "arxiv_id": "2512.20949v1"
  },
  {
    "title": "Applying generative AI with retrieval augmented generation to summarize and extract key clinical information from electronic health records.",
    "authors": [
      "Alkhalaf M",
      "Yu P",
      "Yin M",
      "Deng C"
    ],
    "year": "2024",
    "journal": "Journal of biomedical informatics",
    "doi": "10.1016/j.jbi.2024.104662",
    "pmid": "38880236",
    "abstract": "Malnutrition is a prevalent issue in aged care facilities (RACFs), leading to adverse health outcomes. The ability to efficiently extract key clinical information from a large volume of data in electronic health records (EHR) can improve understanding about the extent of the problem and developing effective interventions. This research aimed to test the efficacy of zero-shot prompt engineering applied to generative artificial intelligence (AI) models on their own and in combination with retrieval augmented generation (RAG), for the automating tasks of summarizing both structured and unstructured data in EHR and extracting important malnutrition information.",
    "source_database": "pubmed"
  },
  {
    "title": "RAG based Question-Answering for Contextual Response Prediction System",
    "authors": [
      "Sriram Veturi",
      "Saurabh Vaichal",
      "Reshma Lal Jagadheesh",
      "Nafis Irtiza Tripto",
      "Nian Yan"
    ],
    "year": "2024",
    "journal": "arXiv:2409.03708v2",
    "doi": "",
    "abstract": "Large Language Models (LLMs) have shown versatility in various Natural Language Processing (NLP) tasks, including their potential as effective question-answering systems. However, to provide precise and relevant information in response to specific customer queries in industry settings, LLMs require access to a comprehensive knowledge base to avoid hallucinations. Retrieval Augmented Generation (RAG) emerges as a promising technique to address this challenge. Yet, developing an accurate question-answering framework for real-world applications using RAG entails several challenges: 1) data availability issues, 2) evaluating the quality of generated content, and 3) the costly nature of human evaluation. In this paper, we introduce an end-to-end framework that employs LLMs with RAG capabilities for industry use cases. Given a customer query, the proposed system retrieves relevant knowledge documents and leverages them, along with previous chat history, to generate response suggestions for customer service agents in the contact centers of a major retail company. Through comprehensive automated and human evaluations, we show that this solution outperforms the current BERT-based algorithms in accuracy and relevance. Our findings suggest that RAG-based LLMs can be an excellent support to human customer service representatives by lightening their workload.",
    "source_database": "arxiv",
    "arxiv_id": "2409.03708v2"
  },
  {
    "title": "Med-HALT: Medical Domain Hallucination Test for Large Language Models",
    "authors": [
      "Ankit Pal",
      "Logesh Kumar Umapathi",
      "Malaikannan Sankarasubbu"
    ],
    "year": "2023",
    "journal": "arXiv:2307.15343v2",
    "doi": "",
    "abstract": "This research paper focuses on the challenges posed by hallucinations in large language models (LLMs), particularly in the context of the medical domain. Hallucination, wherein these models generate plausible yet unverified or incorrect information, can have serious consequences in healthcare applications. We propose a new benchmark and dataset, Med-HALT (Medical Domain Hallucination Test), designed specifically to evaluate and reduce hallucinations. Med-HALT provides a diverse multinational dataset derived from medical examinations across various countries and includes multiple innovative testing modalities. Med-HALT includes two categories of tests reasoning and memory-based hallucination tests, designed to assess LLMs's problem-solving and information retrieval abilities.   Our study evaluated leading LLMs, including Text Davinci, GPT-3.5, LlaMa-2, MPT, and Falcon, revealing significant differences in their performance. The paper provides detailed insights into the dataset, promoting transparency and reproducibility. Through this work, we aim to contribute to the development of safer and more reliable language models in healthcare. Our benchmark can be found at medhalt.github.io",
    "source_database": "arxiv",
    "arxiv_id": "2307.15343v2"
  },
  {
    "title": "Improving Scientific Hypothesis Generation with Knowledge Grounded Large Language Models",
    "authors": [
      "Guangzhi Xiong",
      "Eric Xie",
      "Amir Hassan Shariatmadari",
      "Sikun Guo",
      "Stefan Bekiranov",
      "Aidong Zhang"
    ],
    "year": "2024",
    "journal": "arXiv:2411.02382v1",
    "doi": "",
    "abstract": "Large language models (LLMs) have demonstrated remarkable capabilities in various scientific domains, from natural language processing to complex problem-solving tasks. Their ability to understand and generate human-like text has opened up new possibilities for advancing scientific research, enabling tasks such as data analysis, literature review, and even experimental design. One of the most promising applications of LLMs in this context is hypothesis generation, where they can identify novel research directions by analyzing existing knowledge. However, despite their potential, LLMs are prone to generating ``hallucinations'', outputs that are plausible-sounding but factually incorrect. Such a problem presents significant challenges in scientific fields that demand rigorous accuracy and verifiability, potentially leading to erroneous or misleading conclusions. To overcome these challenges, we propose KG-CoI (Knowledge Grounded Chain of Ideas), a novel system that enhances LLM hypothesis generation by integrating external, structured knowledge from knowledge graphs (KGs). KG-CoI guides LLMs through a structured reasoning process, organizing their output as a chain of ideas (CoI), and includes a KG-supported module for the detection of hallucinations. With experiments on our newly constructed hypothesis generation dataset, we demonstrate that KG-CoI not only improves the accuracy of LLM-generated hypotheses but also reduces the hallucination in their reasoning chains, highlighting its effectiveness in advancing real-world scientific research.",
    "source_database": "arxiv",
    "arxiv_id": "2411.02382v1"
  },
  {
    "title": "Retrieving, Rethinking and Revising: The Chain-of-Verification Can Improve Retrieval Augmented Generation",
    "authors": [
      "Bolei He",
      "Nuo Chen",
      "Xinran He",
      "Lingyong Yan",
      "Zhenkai Wei",
      "Jinchang Luo",
      "Zhen-Hua Ling"
    ],
    "year": "2024",
    "journal": "arXiv:2410.05801v1",
    "doi": "",
    "abstract": "Recent Retrieval Augmented Generation (RAG) aims to enhance Large Language Models (LLMs) by incorporating extensive knowledge retrieved from external sources. However, such approach encounters some challenges: Firstly, the original queries may not be suitable for precise retrieval, resulting in erroneous contextual knowledge; Secondly, the language model can easily generate inconsistent answer with external references due to their knowledge boundary limitation. To address these issues, we propose the chain-of-verification (CoV-RAG) to enhance the external retrieval correctness and internal generation consistency. Specifically, we integrate the verification module into the RAG, engaging in scoring, judgment, and rewriting. To correct external retrieval errors, CoV-RAG retrieves new knowledge using a revised query. To correct internal generation errors, we unify QA and verification tasks with a Chain-of-Thought (CoT) reasoning during training. Our comprehensive experiments across various LLMs demonstrate the effectiveness and adaptability compared with other strong baselines. Especially, our CoV-RAG can significantly surpass the state-of-the-art baselines using different LLM backbones.",
    "source_database": "arxiv",
    "arxiv_id": "2410.05801v1"
  },
  {
    "title": "Evaluating Web Retrieval-Assisted Large Language Models With and Without Whitelisting for Evidence-Based Neurology: Comparative Study.",
    "authors": [
      "Masanneck L",
      "Epping PZ",
      "Meuth SG",
      "Pawlitzki M"
    ],
    "year": "2025",
    "journal": "Journal of medical Internet research",
    "doi": "10.2196/79379",
    "pmid": "41159599",
    "abstract": "Large language models (LLMs) coupled with real-time web retrieval are reshaping how clinicians and patients locate medical evidence, and as major search providers fuse LLMs into their interfaces, this hybrid approach might become the new \"gateway\" to the internet. However, open-web retrieval exposes models to nonprofessional sources, risking hallucinations and factual errors that might jeopardize evidence-based care.",
    "source_database": "pubmed"
  },
  {
    "title": "Engineering the RAG Stack: A Comprehensive Review of the Architecture and Trust Frameworks for Retrieval-Augmented Generation Systems",
    "authors": [
      "Dean Wampler",
      "Dave Nielson",
      "Alireza Seddighi"
    ],
    "year": "2026",
    "journal": "arXiv:2601.05264v1",
    "doi": "",
    "abstract": "This article provides a comprehensive systematic literature review of academic studies, industrial applications, and real-world deployments from 2018 to 2025, providing a practical guide and detailed overview of modern Retrieval-Augmented Generation (RAG) architectures. RAG offers a modular approach for integrating external knowledge without increasing the capacity of the model as LLM systems expand. Research and engineering practices have been fragmented as a result of the increasing diversity of RAG methodologies, which encompasses a variety of fusion mechanisms, retrieval strategies, and orchestration approaches. We provide quantitative assessment frameworks, analyze the implications for trust and alignment, and systematically consolidate existing RAG techniques into a unified taxonomy. This document is a practical framework for the deployment of resilient, secure, and domain-adaptable RAG systems, synthesizing insights from academic literature, industry reports, and technical implementation guides. It also functions as a technical reference.",
    "source_database": "arxiv",
    "arxiv_id": "2601.05264v1"
  },
  {
    "title": "RMIT-ADM+S at the MMU-RAG NeurIPS 2025 Competition",
    "authors": [
      "Kun Ran",
      "Marwah Alaofi",
      "Danula Hettiachchi",
      "Chenglong Ma",
      "Khoi Nguyen Dinh Anh",
      "Khoi Vo Nguyen",
      "Sachin Pathiyan Cherumanal",
      "Lida Rashidi",
      "Falk Scholer",
      "Damiano Spina",
      "Shuoqi Sun",
      "Oleg Zendel"
    ],
    "year": "2026",
    "journal": "arXiv:2602.20735v1",
    "doi": "",
    "abstract": "This paper presents the award-winning RMIT-ADM+S system for the Text-to-Text track of the NeurIPS 2025 MMU-RAG Competition. We introduce Routing-to-RAG (R2RAG), a research-focused retrieval-augmented generation (RAG) architecture composed of lightweight components that dynamically adapt the retrieval strategy based on inferred query complexity and evidence sufficiency. The system uses smaller LLMs, enabling operation on a single consumer-grade GPU while supporting complex research tasks. It builds on the G-RAG system, winner of the ACM SIGIR 2025 LiveRAG Challenge, and extends it with modules informed by qualitative review of outputs. R2RAG won the Best Dynamic Evaluation award in the Open Source category, demonstrating high effectiveness with careful design and efficient use of resources.",
    "source_database": "arxiv",
    "arxiv_id": "2602.20735v1"
  },
  {
    "title": "SRAG: RAG with Structured Data Improves Vector Retrieval",
    "authors": [
      "Shalin Shah",
      "Srikanth Ryali",
      "Ramasubbu Venkatesh"
    ],
    "year": "2026",
    "journal": "",
    "doi": "",
    "pmid": "",
    "abstract": "Retrieval Augmented Generation (RAG) provides the necessary informational grounding to LLMs in the form of chunks retrieved from a vector database or through web search. RAG could also use knowledge graph triples as a means of providing factual information to an LLM. However, the retrieval is only based on representational similarity between a question and the contents. The performance of RAG depends on the numeric vector representations of the query and the chunks. To improve these representations, we propose Structured RAG (SRAG), which adds structured information to a query as well as the chunks in the form of topics, sentiments, query and chunk types (e.g., informational, quantitative), knowledge graph triples and semantic tags. Experiments indicate that this method significantly improves the retrieval process. Using GPT-5 as an LLM-as-a-judge, results show that the method improves the score given to answers in a question answering system by 30% (p-value = 2e-13) (with tighter bounds). The strongest improvement is in comparative, analytical and predictive questions. The results suggest that our method enables broader, more diverse, and episodic-style retrieval. Tail risk analysis shows that SRAG attains very large gains more often, with losses remaining minor in magnitude.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "MedHallBench: A New Benchmark for Assessing Hallucination in Medical Large Language Models",
    "authors": [
      "Kaiwen Zuo",
      "Yirui Jiang"
    ],
    "year": "2024",
    "journal": "arXiv:2412.18947v4",
    "doi": "",
    "abstract": "Medical Large Language Models (MLLMs) have demonstrated potential in healthcare applications, yet their propensity for hallucinations -- generating medically implausible or inaccurate information -- presents substantial risks to patient care. This paper introduces MedHallBench, a comprehensive benchmark framework for evaluating and mitigating hallucinations in MLLMs. Our methodology integrates expert-validated medical case scenarios with established medical databases to create a robust evaluation dataset. The framework employs a sophisticated measurement system that combines automated ACHMI (Automatic Caption Hallucination Measurement in Medical Imaging) scoring with rigorous clinical expert evaluations and utilizes reinforcement learning methods to achieve automatic annotation. Through an optimized reinforcement learning from human feedback (RLHF) training pipeline specifically designed for medical applications, MedHallBench enables thorough evaluation of MLLMs across diverse clinical contexts while maintaining stringent accuracy standards. We conducted comparative experiments involving various models, utilizing the benchmark to establish a baseline for widely adopted large language models (LLMs). Our findings indicate that ACHMI provides a more nuanced understanding of the effects of hallucinations compared to traditional metrics, thereby highlighting its advantages in hallucination assessment. This research establishes a foundational framework for enhancing MLLMs' reliability in healthcare settings and presents actionable strategies for addressing the critical challenge of AI hallucinations in medical applications.",
    "source_database": "arxiv",
    "arxiv_id": "2412.18947v4"
  },
  {
    "title": "RAG-Gym: Systematic Optimization of Language Agents for Retrieval-Augmented Generation",
    "authors": [
      "Guangzhi Xiong",
      "Qiao Jin",
      "Xiao Wang",
      "Yin Fang",
      "Haolin Liu",
      "Yifan Yang",
      "Fangyuan Chen",
      "Zhixing Song",
      "Dengyu Wang",
      "Minjia Zhang",
      "Zhiyong Lu",
      "Aidong Zhang"
    ],
    "year": "2025",
    "journal": "arXiv:2502.13957v2",
    "doi": "",
    "abstract": "Retrieval-augmented generation (RAG) has shown great promise for knowledge-intensive tasks and recently advanced with agentic RAG, where language agents engage in multi-round interactions with external knowledge sources for adaptive information retrieval. However, existing agentic RAG methods often depend on ad-hoc prompt engineering and lack a unified optimization framework. We introduce RAG-Gym, a comprehensive platform that systematically explores three optimization dimensions: (1) prompt engineering, (2) actor tuning, and (3) critic training. For prompt engineering, we propose Re$^2$Search, a novel agent incorporating reasoning reflection that significantly outperforms standard prompts. In actor tuning, we evaluate three popular post-training algorithms with fine-grained process supervision and identify direct preference optimization as the most effective. We further demonstrate that a trained critic can enhance inference by selecting higher-quality intermediate reasoning steps. Together, these findings lead to the optimized Re$^2$Search++ agent, which surpasses most recent methods like Search-R1 by a relative increase of 3.2% to 11.6% in average F1. Finally, we examine the impact of different reward sources and analyze scaling properties in training and inference, offering practical insights for agentic RAG optimization. The project homepage is available at https://rag-gym.github.io.",
    "source_database": "arxiv",
    "arxiv_id": "2502.13957v2"
  },
  {
    "title": "CaresAI at BioCreative IX Track 1 -- LLM for Biomedical QA",
    "authors": [
      "Reem Abdel-Salam",
      "Mary Adewunmi",
      "Modinat A. Abayomi"
    ],
    "year": "2025",
    "journal": "arXiv:2509.00806v1",
    "doi": "",
    "abstract": "Large language models (LLMs) are increasingly evident for accurate question answering across various domains. However, rigorous evaluation of their performance on complex question-answering (QA) capabilities is essential before deployment in real-world biomedical and healthcare applications. This paper presents our approach to the MedHopQA track of the BioCreative IX shared task, which focuses on multi-hop biomedical question answering involving diseases, genes, and chemicals. We adopt a supervised fine-tuning strategy leveraging LLaMA 3 8B, enhanced with a curated biomedical question-answer dataset compiled from external sources including BioASQ, MedQuAD, and TREC. Three experimental setups are explored: fine-tuning on combined short and long answers, short answers only, and long answers only. While our models demonstrate strong domain understanding, achieving concept-level accuracy scores of up to 0.8, their Exact Match (EM) scores remain significantly lower, particularly in the test phase. We introduce a two-stage inference pipeline for precise short-answer extraction to mitigate verbosity and improve alignment with evaluation metrics. Despite partial improvements, challenges persist in generating strictly formatted outputs. Our findings highlight the gap between semantic understanding and exact answer evaluation in biomedical LLM applications, motivating further research in output control and post-processing strategies.",
    "source_database": "arxiv",
    "arxiv_id": "2509.00806v1"
  },
  {
    "title": "Retrieval Augmented Generation (RAG) for Fintech: Agentic Design and Evaluation",
    "authors": [
      "Thomas Cook",
      "Richard Osuagwu",
      "Liman Tsatiashvili",
      "Vrynsia Vrynsia",
      "Koustav Ghosal",
      "Maraim Masoud",
      "Riccardo Mattivi"
    ],
    "year": "2025",
    "journal": "arXiv:2510.25518v1",
    "doi": "",
    "abstract": "Retrieval-Augmented Generation (RAG) systems often face limitations in specialized domains such as fintech, where domain-specific ontologies, dense terminology, and acronyms complicate effective retrieval and synthesis. This paper introduces an agentic RAG architecture designed to address these challenges through a modular pipeline of specialized agents. The proposed system supports intelligent query reformulation, iterative sub-query decomposition guided by keyphrase extraction, contextual acronym resolution, and cross-encoder-based context re-ranking. We evaluate our approach against a standard RAG baseline using a curated dataset of 85 question--answer--reference triples derived from an enterprise fintech knowledge base. Experimental results demonstrate that the agentic RAG system outperforms the baseline in retrieval precision and relevance, albeit with increased latency. These findings suggest that structured, multi-agent methodologies offer a promising direction for enhancing retrieval robustness in complex, domain-specific settings.",
    "source_database": "arxiv",
    "arxiv_id": "2510.25518v1"
  },
  {
    "title": "LightRAG-Driven Medical QA: Leveraging Domain-Specific for Efficient LLM Reasoning",
    "authors": [
      "Rishabh Kushwaha",
      "Reshma Swain",
      "Bal Krishna Saraswat"
    ],
    "year": "2025",
    "journal": "2025 5th International Conference on Advancement in Electronics & Communication Engineering (AECE)",
    "doi": "10.1109/AECE67531.2025.11386653",
    "pmid": "",
    "abstract": "Large Language Model (LLM) have demonstrated great capabilities in medical question answering. But fake facts and projections restrict the efficiency of their real world. Past studies have incorporated RAG methods such as Naive RAG and GraphRAG to enhance the aspect of factual grounding. Efficiency retention and recall and precision optimization are challenging to these particular methods. In this paper, we apply the LightRAG framework to the medical domain as a lightweight and domain-specific retrieval procedure that enhances the reasoning skills of LLMs. BERTScore used to evaluate LightRAG against Naive RAG and GraphRAG based on semantic similarity evaluations and retrieval scores. The findings demonstrate that LightRAG works better with the F1 score of 0.83, Recall 0.85 and Precision 0.81. It is more contextually retrieved than Naive RAG and more accurate and stable than Graph-RAG. Providing a superior tradeoff between accuracy and completeness, LightRAG offers correct and comprehensive answers to medical questions. Dataset take from Gale resource and content length 3,53,624. Such results indicate that LightRAG may help professionals, assist students and common people.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Efficient Hallucination Detection: Adaptive Bayesian Estimation of Semantic Entropy with Guided Semantic Exploration",
    "authors": [
      "Qiyao Sun",
      "Xingming Li",
      "Xixiang He",
      "Ao Cheng",
      "Xuanyu Ji",
      "Hailun Lu",
      "Runke Huang",
      "Qingyong Hu"
    ],
    "year": "2026",
    "journal": "arXiv:2603.22812v1",
    "doi": "",
    "abstract": "Large language models (LLMs) have achieved remarkable success in various natural language processing tasks, yet they remain prone to generating factually incorrect outputs known as hallucinations. While recent approaches have shown promise for hallucination detection by repeatedly sampling from LLMs and quantifying the semantic inconsistency among the generated responses, they rely on fixed sampling budgets that fail to adapt to query complexity, resulting in computational inefficiency. We propose an Adaptive Bayesian Estimation framework for Semantic Entropy with Guided Semantic Exploration, which dynamically adjusts sampling requirements based on observed uncertainty. Our approach employs a hierarchical Bayesian framework to model the semantic distribution, enabling dynamic control of sampling iterations through variance-based thresholds that terminate generation once sufficient certainty is achieved. We also develop a perturbation-based importance sampling strategy to systematically explore the semantic space. Extensive experiments on four QA datasets demonstrate that our method achieves superior hallucination detection performance with significant efficiency gains. In low-budget scenarios, our approach requires about 50% fewer samples to achieve comparable detection performance to existing methods, while delivers an average AUROC improvement of 12.6% under the same sampling budget.",
    "source_database": "arxiv",
    "arxiv_id": "2603.22812v1"
  },
  {
    "title": "Quantifying Hallucinations in Large Language Models on Medical Textbooks",
    "authors": [
      "Brandon C. Colelough",
      "Davis Bartels",
      "Dina Demner-Fushman"
    ],
    "year": "2026",
    "journal": "arXiv:2603.09986v1",
    "doi": "",
    "abstract": "Hallucinations, the tendency for large language models to provide responses with factually incorrect and unsupported claims, is a serious problem within natural language processing for which we do not yet have an effective solution to mitigate against. Existing benchmarks for medical QA rarely evaluate this behavior against a fixed evidence source. We ask how often hallucinations occur on textbook-grounded QA and how responses to medical QA prompts vary across models. We conduct two experiments: the first experiment to determine the prevalence of hallucinations for a prominent open source large language model (LLaMA-70B-Instruct) in medical QA given novel prompts, and the second experiment to determine the prevalence of hallucinations and clinician preference to model responses. We observed, in experiment one, with the passages provided, LLaMA-70B-Instruct hallucinated in 19.7\\% of answers (95\\% CI 18.6 to 20.7) even though 98.8\\% of prompt responses received maximal plausibility, and observed in experiment two, across models, lower hallucination rates aligned with higher usefulness scores ($\u03c1=-0.71$, $p=0.058$). Clinicians produced high agreement (quadratic weighted $\u03ba=0.92$) and ($\u03c4_b=0.06$ to $0.18$, $\u03ba=0.57$ to $0.61$) for experiments 1 and 2, respectively",
    "source_database": "arxiv",
    "arxiv_id": "2603.09986v1"
  },
  {
    "title": "Effective prompt design for large language models in clinical practice.",
    "authors": [
      "Callens S"
    ],
    "year": "2026",
    "journal": "Acta clinica Belgica",
    "doi": "10.1080/17843286.2026.2613903",
    "pmid": "41524451",
    "abstract": "Large language models (LLMs) have emerged as transformative healthcare tools for clinical documentation, diagnostic reasoning, and medical education. However, effective utilization requires understanding prompt engineering principles-the strategic design of inputs to optimize performance while mitigating hallucination, bias, and outdated information.",
    "source_database": "pubmed"
  },
  {
    "title": "MUST-RAG: MUSical Text Question Answering with Retrieval Augmented Generation",
    "authors": [
      "Daeyong Kwon",
      "SeungHeon Doh",
      "Juhan Nam"
    ],
    "year": "2025",
    "journal": "arXiv:2507.23334v2",
    "doi": "",
    "abstract": "Recent advancements in Large language models (LLMs) have demonstrated remarkable capabilities across diverse domains. While they exhibit strong zero-shot performance on various tasks, LLMs' effectiveness in music-related applications remains limited due to the relatively small proportion of music-specific knowledge in their training data. To address this limitation, we propose MusT-RAG, a comprehensive framework based on Retrieval Augmented Generation (RAG) to adapt general-purpose LLMs for text-only music question answering (MQA) tasks. RAG is a technique that provides external knowledge to LLMs by retrieving relevant context information when generating answers to questions. To optimize RAG for the music domain, we (1) propose MusWikiDB, a music-specialized vector database for the retrieval stage, and (2) utilizes context information during both inference and fine-tuning processes to effectively transform general-purpose LLMs into music-specific models. Our experiment demonstrates that MusT-RAG significantly outperforms traditional fine-tuning approaches in enhancing LLMs' music domain adaptation capabilities, showing consistent improvements across both in-domain and out-of-domain MQA benchmarks. Additionally, our MusWikiDB proves substantially more effective than general Wikipedia corpora, delivering superior performance and computational efficiency.",
    "source_database": "arxiv",
    "arxiv_id": "2507.23334v2"
  },
  {
    "title": "Video Enriched Retrieval Augmented Generation Using Aligned Video Captions",
    "authors": [
      "Kevin Dela Rosa"
    ],
    "year": "2024",
    "journal": "arXiv:2405.17706v1",
    "doi": "",
    "abstract": "In this work, we propose the use of \"aligned visual captions\" as a mechanism for integrating information contained within videos into retrieval augmented generation (RAG) based chat assistant systems. These captions are able to describe the visual and audio content of videos in a large corpus while having the advantage of being in a textual format that is both easy to reason about & incorporate into large language model (LLM) prompts, but also typically require less multimedia content to be inserted into the multimodal LLM context window, where typical configurations can aggressively fill up the context window by sampling video frames from the source video. Furthermore, visual captions can be adapted to specific use cases by prompting the original foundational model / captioner for particular visual details or fine tuning. In hopes of helping advancing progress in this area, we curate a dataset and describe automatic evaluation procedures on common RAG tasks.",
    "source_database": "arxiv",
    "arxiv_id": "2405.17706v1"
  },
  {
    "title": "OLAPH: Improving Factuality in Biomedical Long-form Question Answering",
    "authors": [
      "Minbyul Jeong",
      "Hyeon Hwang",
      "Chanwoong Yoon",
      "Taewhoo Lee",
      "Jaewoo Kang"
    ],
    "year": "2024",
    "journal": "arXiv:2405.12701v3",
    "doi": "",
    "abstract": "In the medical domain, numerous scenarios necessitate the long-form generation ability of large language models (LLMs). Specifically, when addressing patients' questions, it is essential that the model's response conveys factual claims, highlighting the need for an automated method to evaluate those claims. Thus, we introduce MedLFQA, a benchmark dataset reconstructed using long-form question-answering datasets related to the biomedical domain. We use MedLFQA to facilitate a cost-effective automatic evaluations of factuality. We also propose OLAPH, a simple and novel framework that utilizes cost-effective and multifaceted automatic evaluation to construct a synthetic preference set and answers questions in our preferred manner. Our framework leads us to train LLMs step-by-step to reduce hallucinations and include crucial medical claims. We highlight that, even on evaluation metrics not used during training, LLMs trained with our OLAPH framework demonstrate significant performance improvement in factuality. Our findings reveal that a 7B LLM trained with our OLAPH framework can provide long answers comparable to the medical experts' answers in terms of factuality. We believe that our work could shed light on gauging the long-text generation ability of LLMs in the medical domain. Our code and datasets are available.",
    "source_database": "arxiv",
    "arxiv_id": "2405.12701v3"
  },
  {
    "title": "Detecting and Evaluating Medical Hallucinations in Large Vision Language Models",
    "authors": [
      "Jiawei Chen",
      "Dingkang Yang",
      "Tong Wu",
      "Yue Jiang",
      "Xiaolu Hou",
      "Mingcheng Li",
      "Shunli Wang",
      "Dongling Xiao",
      "Ke Li",
      "Lihua Zhang"
    ],
    "year": "2024",
    "journal": "arXiv:2406.10185v1",
    "doi": "",
    "abstract": "Large Vision Language Models (LVLMs) are increasingly integral to healthcare applications, including medical visual question answering and imaging report generation. While these models inherit the robust capabilities of foundational Large Language Models (LLMs), they also inherit susceptibility to hallucinations-a significant concern in high-stakes medical contexts where the margin for error is minimal. However, currently, there are no dedicated methods or benchmarks for hallucination detection and evaluation in the medical field. To bridge this gap, we introduce Med-HallMark, the first benchmark specifically designed for hallucination detection and evaluation within the medical multimodal domain. This benchmark provides multi-tasking hallucination support, multifaceted hallucination data, and hierarchical hallucination categorization. Furthermore, we propose the MediHall Score, a new medical evaluative metric designed to assess LVLMs' hallucinations through a hierarchical scoring system that considers the severity and type of hallucination, thereby enabling a granular assessment of potential clinical impacts. We also present MediHallDetector, a novel Medical LVLM engineered for precise hallucination detection, which employs multitask training for hallucination detection. Through extensive experimental evaluations, we establish baselines for popular LVLMs using our benchmark. The findings indicate that MediHall Score provides a more nuanced understanding of hallucination impacts compared to traditional metrics and demonstrate the enhanced performance of MediHallDetector. We hope this work can significantly improve the reliability of LVLMs in medical applications. All resources of this work will be released soon.",
    "source_database": "arxiv",
    "arxiv_id": "2406.10185v1"
  },
  {
    "title": "Enhancing Large Language Models for Improved Accuracy and Safety in Medical Question Answering: Comparative Study.",
    "authors": [
      "Wang D",
      "Ye J",
      "Li J",
      "Liang J",
      "Zhang Q",
      "Hu Q",
      "Pan C",
      "Wang D",
      "Liu Z",
      "Shi W",
      "Guo M",
      "Li F",
      "Du W",
      "Zheng YF"
    ],
    "year": "2025",
    "journal": "JMIR medical education",
    "doi": "10.2196/70190",
    "pmid": "41329953",
    "abstract": "Large language models (LLMs) offer the potential to improve virtual patient-physician communication and reduce health care professionals' workload. However, limitations in accuracy, outdated knowledge, and safety issues restrict their effective use in real clinical settings. Addressing these challenges is crucial for making LLMs a reliable health care tool.",
    "source_database": "pubmed"
  },
  {
    "title": "Tree of Reviews: A Tree-based Dynamic Iterative Retrieval Framework for Multi-hop Question Answering",
    "authors": [
      "Li Jiapeng",
      "Liu Runze",
      "Li Yabo",
      "Zhou Tong",
      "Li Mingling",
      "Chen Xiang"
    ],
    "year": "2024",
    "journal": "arXiv:2404.14464v1",
    "doi": "",
    "abstract": "Multi-hop question answering is a knowledge-intensive complex problem. Large Language Models (LLMs) use their Chain of Thoughts (CoT) capability to reason complex problems step by step, and retrieval-augmentation can effectively alleviate factual errors caused by outdated and unknown knowledge in LLMs. Recent works have introduced retrieval-augmentation in the CoT reasoning to solve multi-hop question answering. However, these chain methods have the following problems: 1) Retrieved irrelevant paragraphs may mislead the reasoning; 2) An error in the chain structure may lead to a cascade of errors.   In this paper, we propose a dynamic retrieval framework called Tree of Reviews (ToR), where the root node is the question, and the other nodes are paragraphs from retrieval, extending different reasoning paths from the root node to other nodes. Our framework dynamically decides to initiate a new search, reject, or accept based on the paragraphs on the reasoning paths. Compared to related work, we introduce a tree structure to handle each retrieved paragraph separately, alleviating the misleading effect of irrelevant paragraphs on the reasoning path; the diversity of reasoning path extension reduces the impact of a single reasoning error on the whole. We conducted experiments on three different multi-hop question answering datasets. The results show that compared to the baseline methods, ToR achieves state-of-the-art performance in both retrieval and response generation. In addition, we propose two tree-based search optimization strategies, pruning and effective expansion, to reduce time overhead and increase the diversity of path extension. We will release our code.",
    "source_database": "arxiv",
    "arxiv_id": "2404.14464v1"
  },
  {
    "title": "Localising In-Domain Adaptation of Transformer-Based Biomedical Language Models",
    "authors": [
      "Tommaso Mario Buonocore",
      "Claudio Crema",
      "Alberto Redolfi",
      "Riccardo Bellazzi",
      "Enea Parimbelli"
    ],
    "year": "2022",
    "journal": "arXiv:2212.10422v3",
    "doi": "10.1016/j.jbi.2023.104431",
    "abstract": "In the era of digital healthcare, the huge volumes of textual information generated every day in hospitals constitute an essential but underused asset that could be exploited with task-specific, fine-tuned biomedical language representation models, improving patient care and management. For such specialized domains, previous research has shown that fine-tuning models stemming from broad-coverage checkpoints can largely benefit additional training rounds over large-scale in-domain resources. However, these resources are often unreachable for less-resourced languages like Italian, preventing local medical institutions to employ in-domain adaptation. In order to reduce this gap, our work investigates two accessible approaches to derive biomedical language models in languages other than English, taking Italian as a concrete use-case: one based on neural machine translation of English resources, favoring quantity over quality; the other based on a high-grade, narrow-scoped corpus natively written in Italian, thus preferring quality over quantity. Our study shows that data quantity is a harder constraint than data quality for biomedical adaptation, but the concatenation of high-quality data can improve model performance even when dealing with relatively size-limited corpora. The models published from our investigations have the potential to unlock important research opportunities for Italian hospitals and academia. Finally, the set of lessons learned from the study constitutes valuable insights towards a solution to build biomedical language models that are generalizable to other less-resourced languages and different domain settings.",
    "source_database": "arxiv",
    "arxiv_id": "2212.10422v3"
  },
  {
    "title": "Diagnosing and Addressing Pitfalls in KG-RAG Datasets: Toward More Reliable Benchmarking",
    "authors": [
      "Liangliang Zhang",
      "Zhuorui Jiang",
      "H. Chi",
      "Haoyang Chen",
      "Mohammed Elkoumy",
      "Fali Wang",
      "Qiong Wu",
      "Zhengyi Zhou",
      "Shirui Pan",
      "Suhang Wang",
      "Yao Ma"
    ],
    "year": "2025",
    "journal": "ArXiv",
    "doi": "10.48550/arXiv.2505.23495",
    "pmid": "",
    "abstract": "Knowledge Graph Question Answering (KGQA) systems rely on high-quality benchmarks to evaluate complex multi-hop reasoning. However, despite their widespread use, popular datasets such as WebQSP and CWQ suffer from critical quality issues, including inaccurate or incomplete ground-truth annotations, poorly constructed questions that are ambiguous, trivial, or unanswerable, and outdated or inconsistent knowledge. Through a manual audit of 16 popular KGQA datasets, including WebQSP and CWQ, we find that the average factual correctness rate is only 57 %. To address these issues, we introduce KGQAGen, an LLM-in-the-loop framework that systematically resolves these pitfalls. KGQAGen combines structured knowledge grounding, LLM-guided generation, and symbolic verification to produce challenging and verifiable QA instances. Using KGQAGen, we construct KGQAGen-10k, a ten-thousand scale benchmark grounded in Wikidata, and evaluate a diverse set of KG-RAG models. Experimental results demonstrate that even state-of-the-art systems struggle on this benchmark, highlighting its ability to expose limitations of existing models. Our findings advocate for more rigorous benchmark construction and position KGQAGen as a scalable framework for advancing KGQA evaluation.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Empathy Is Not What Changed: Clinical Assessment of Psychological Safety Across GPT Model Generations",
    "authors": [
      "Michael Keeman",
      "Anastasia Keeman"
    ],
    "year": "2026",
    "journal": "arXiv:2603.09997v1",
    "doi": "",
    "abstract": "When OpenAI deprecated GPT-4o in early 2026, thousands of users protested under #keep4o, claiming newer models had \"lost their empathy.\" No published study has tested this claim. We conducted the first clinical measurement, evaluating three OpenAI model generations (GPT-4o, o4-mini, GPT-5-mini) across 14 emotionally challenging conversational scenarios in mental health and AI companion domains, producing 2,100 scored AI responses assessed on six psychological safety dimensions using clinically-grounded rubrics.   Empathy scores are statistically indistinguishable across all three models (Kruskal-Wallis H=4.33, p=0.115). What changed is the safety posture: crisis detection improved monotonically from GPT-4o to GPT-5-mini (H=13.88, p=0.001), while advice safety declined (H=16.63, p<0.001). Per-turn trajectory analysis -- a novel methodological contribution -- reveals these shifts are sharpest during mid-conversation crisis moments invisible to aggregate scoring. In a self-harm scenario involving a minor, GPT-4o scored 3.6/10 on crisis detection during early disclosure turns; GPT-5-mini never dropped below 7.8.   What users perceived as \"lost empathy\" was a shift from a cautious model that missed crises to an alert model that sometimes says too much -- a trade-off with real consequences for vulnerable users, currently invisible to both the people who feel it and the developers who create it.",
    "source_database": "arxiv",
    "arxiv_id": "2603.09997v1"
  },
  {
    "title": "Empowering large language models for automated clinical assessment with generation-augmented retrieval and hierarchical chain-of-thought.",
    "authors": [
      "Gu Z",
      "Jia W",
      "Piccardi M",
      "Yu P"
    ],
    "year": "2025",
    "journal": "Artificial intelligence in medicine",
    "doi": "10.1016/j.artmed.2025.103078",
    "pmid": "39978047",
    "abstract": "Understanding and extracting valuable information from electronic health records (EHRs) is important for improving healthcare delivery and health outcomes. Large language models (LLMs) have demonstrated significant proficiency in natural language understanding and processing, offering promises for automating the typically labor-intensive and time-consuming analytical tasks with EHRs. Despite the active application of LLMs in the healthcare setting, many foundation models lack real-world healthcare relevance. Applying LLMs to EHRs is still in its early stage. To advance this field, in this study, we pioneer a generation-augmented prompting paradigm \"GAPrompt\" to empower generic LLMs for automated clinical assessment, in particular, quantitative stroke severity assessment, using data extracted from EHRs.",
    "source_database": "pubmed"
  },
  {
    "title": "UniBiomed: A Universal Foundation Model for Grounded Biomedical Image Interpretation",
    "authors": [
      "Linshan Wu",
      "Yuxiang Nie",
      "Sunan He",
      "Jiaxin Zhuang",
      "Luyang Luo",
      "Tao Li",
      "Zhuoyao Xie",
      "Dexuan Chen",
      "Yinghua Zhao",
      "Neeraj Mahboobani",
      "Varut Vardhanabhuti",
      "Ronald Cheong Kin Chan",
      "Yifan Peng",
      "Pranav Rajpurkar",
      "Hao Chen"
    ],
    "year": "2025",
    "journal": "arXiv:2504.21336v3",
    "doi": "",
    "abstract": "The integration of AI-assisted biomedical image analysis into clinical practice demands AI-generated findings that are not only accurate but also interpretable to clinicians. However, existing biomedical AI models generally lack the ability to simultaneously generate diagnostic findings and localize corresponding biomedical objects. This limitation makes it challenging for clinicians to correlate AI-generated findings with visual evidence (e.g., tiny lesions) in images and interpret the results of AI models. To address this challenge, we introduce UniBiomed, the first universal foundation model for grounded biomedical image interpretation, which is capable of generating accurate diagnostic findings and simultaneously segmenting the corresponding biomedical targets. UniBiomed is based on a novel integration of Multi-modal Large Language Model and Segment Anything Model, which can effectively unify diverse biomedical tasks in universal training for advancing grounded interpretation. To develop UniBiomed, we curate a large-scale dataset comprising over 27 million triplets of images, region annotations, and text descriptions across ten biomedical imaging modalities. Extensive validation on 70 internal and 14 external datasets demonstrated the state-of-the-art performance of UniBiomed in diverse biomedical tasks, including image segmentation, disease recognition, region-aware diagnosis, vision question answering, and report generation. In summary, UniBiomed is a powerful and versatile biomedical foundation model, unlocking the untapped grounded interpretation capability for optimizing AI-assisted biomedical image analysis.",
    "source_database": "arxiv",
    "arxiv_id": "2504.21336v3"
  },
  {
    "title": "Exploring the Vulnerability of the Content Moderation Guardrail in Large Language Models via Intent Manipulation",
    "authors": [
      "Jun Zhuang",
      "Haibo Jin",
      "Ye Zhang",
      "Zhengjian Kang",
      "Wenbin Zhang",
      "Gaby G. Dagher",
      "Haohan Wang"
    ],
    "year": "2025",
    "journal": "arXiv:2505.18556v2",
    "doi": "",
    "abstract": "Intent detection, a core component of natural language understanding, has considerably evolved as a crucial mechanism in safeguarding large language models (LLMs). While prior work has applied intent detection to enhance LLMs' moderation guardrails, showing a significant success against content-level jailbreaks, the robustness of these intent-aware guardrails under malicious manipulations remains under-explored. In this work, we investigate the vulnerability of intent-aware guardrails and demonstrate that LLMs exhibit implicit intent detection capabilities. We propose a two-stage intent-based prompt-refinement framework, IntentPrompt, that first transforms harmful inquiries into structured outlines and further reframes them into declarative-style narratives by iteratively optimizing prompts via feedback loops to enhance jailbreak success for red-teaming purposes. Extensive experiments across four public benchmarks and various black-box LLMs indicate that our framework consistently outperforms several cutting-edge jailbreak methods and evades even advanced Intent Analysis (IA) and Chain-of-Thought (CoT)-based defenses. Specifically, our \"FSTR+SPIN\" variant achieves attack success rates ranging from 88.25% to 96.54% against CoT-based defenses on the o1 model, and from 86.75% to 97.12% on the GPT-4o model under IA-based defenses. These findings highlight a critical weakness in LLMs' safety mechanisms and suggest that intent manipulation poses a growing challenge to content moderation guardrails.",
    "source_database": "arxiv",
    "arxiv_id": "2505.18556v2"
  },
  {
    "title": "Unified Hallucination Detection for Multimodal Large Language Models",
    "authors": [
      "Xiang Chen",
      "Chenxi Wang",
      "Yida Xue",
      "Ningyu Zhang",
      "Xiaoyan Yang",
      "Qiang Li",
      "Yue Shen",
      "Lei Liang",
      "Jinjie Gu",
      "Huajun Chen"
    ],
    "year": "2024",
    "journal": "arXiv:2402.03190v4",
    "doi": "",
    "abstract": "Despite significant strides in multimodal tasks, Multimodal Large Language Models (MLLMs) are plagued by the critical issue of hallucination. The reliable detection of such hallucinations in MLLMs has, therefore, become a vital aspect of model evaluation and the safeguarding of practical application deployment. Prior research in this domain has been constrained by a narrow focus on singular tasks, an inadequate range of hallucination categories addressed, and a lack of detailed granularity. In response to these challenges, our work expands the investigative horizons of hallucination detection. We present a novel meta-evaluation benchmark, MHaluBench, meticulously crafted to facilitate the evaluation of advancements in hallucination detection methods. Additionally, we unveil a novel unified multimodal hallucination detection framework, UNIHD, which leverages a suite of auxiliary tools to validate the occurrence of hallucinations robustly. We demonstrate the effectiveness of UNIHD through meticulous evaluation and comprehensive analysis. We also provide strategic insights on the application of specific tools for addressing various categories of hallucinations.",
    "source_database": "arxiv",
    "arxiv_id": "2402.03190v4"
  },
  {
    "title": "ANAH-v2: Scaling Analytical Hallucination Annotation of Large Language Models",
    "authors": [
      "Yuzhe Gu",
      "Ziwei Ji",
      "Wenwei Zhang",
      "Chengqi Lyu",
      "Dahua Lin",
      "Kai Chen"
    ],
    "year": "2024",
    "journal": "arXiv:2407.04693v2",
    "doi": "",
    "abstract": "Large language models (LLMs) exhibit hallucinations in long-form question-answering tasks across various domains and wide applications. Current hallucination detection and mitigation datasets are limited in domains and sizes, which struggle to scale due to prohibitive labor costs and insufficient reliability of existing hallucination annotators. To facilitate the scalable oversight of LLM hallucinations, this paper introduces an iterative self-training framework that simultaneously and progressively scales up the hallucination annotation dataset and improves the accuracy of the hallucination annotator. Based on the Expectation Maximization (EM) algorithm, in each iteration, the framework first applies a hallucination annotation pipeline to annotate a scaled dataset and then trains a more accurate hallucination annotator on the dataset. This new hallucination annotator is adopted in the hallucination annotation pipeline used for the next iteration. Extensive experimental results demonstrate that the finally obtained hallucination annotator with only 7B parameters surpasses the performance of GPT-4 and obtains new state-of-the-art hallucination detection results on HaluEval and HalluQA by zero-shot inference. Such an annotator can not only evaluate the hallucination levels of various LLMs on the large-scale dataset but also help to mitigate the hallucination of LLMs generations, with the Natural Language Inference (NLI) metric increasing from 25% to 37% on HaluEval.",
    "source_database": "arxiv",
    "arxiv_id": "2407.04693v2"
  },
  {
    "title": "SoftTiger: A Clinical Foundation Model for Healthcare Workflows",
    "authors": [
      "Ye Chen",
      "Igor Couto",
      "Wei Cai",
      "Cong Fu",
      "Bruno Dorneles"
    ],
    "year": "2024",
    "journal": "arXiv:2403.00868v3",
    "doi": "",
    "abstract": "We introduce SoftTiger, a clinical large language model (CLaM) designed as a foundation model for healthcare workflows. The narrative and unstructured nature of clinical notes is a major obstacle for healthcare intelligentization. We address a critical problem of structuring clinical notes into clinical data, according to international interoperability standards. We collect and annotate data for three subtasks, namely, international patient summary, clinical impression and medical encounter. We then supervised fine-tuned a state-of-the-art LLM using public and credentialed clinical data. The training is orchestrated in a way that the target model can first support basic clinical tasks such as abbreviation expansion and temporal information extraction, and then learn to perform more complex downstream clinical tasks. Moreover, we address several modeling challenges in the healthcare context, e.g., extra long context window. Our blind pairwise evaluation shows that SoftTiger outperforms other popular open-source models and GPT-3.5, comparable to Gemini-pro, with a mild gap from GPT-4. We believe that LLMs may become a step-stone towards healthcare digitalization and democratization. Therefore, we publicly release SoftTiger models at scales of 13 billion and 70 billion parameters, as well as datasets and code for our innovative scalable evaluation, hopefully, making a significant contribution to the healthcare industry.",
    "source_database": "arxiv",
    "arxiv_id": "2403.00868v3"
  },
  {
    "title": "Multi-Task Retrieval-Augmented Text Generation with Relevance Sampling",
    "authors": [
      "Sebastian Hofst\u00e4tter",
      "Jiecao Chen",
      "Karthik Raman",
      "Hamed Zamani"
    ],
    "year": "2022",
    "journal": "arXiv:2207.03030v1",
    "doi": "",
    "abstract": "This paper studies multi-task training of retrieval-augmented generation models for knowledge-intensive tasks. We propose to clean the training set by utilizing a distinct property of knowledge-intensive generation: The connection of query-answer pairs to items in the knowledge base. We filter training examples via a threshold of confidence on the relevance labels, whether a pair is answerable by the knowledge base or not. We train a single Fusion-in-Decoder (FiD) generator on seven combined tasks of the KILT benchmark. The experimental results suggest that our simple yet effective approach substantially improves competitive baselines on two strongly imbalanced tasks; and shows either smaller improvements or no significant regression on the remaining tasks. Furthermore, we demonstrate our multi-task training with relevance label sampling scales well with increased model capacity and achieves state-of-the-art results in five out of seven KILT tasks.",
    "source_database": "arxiv",
    "arxiv_id": "2207.03030v1"
  },
  {
    "title": "Why LLM Safety Guardrails Collapse After Fine-tuning: A Similarity Analysis Between Alignment and Fine-tuning Datasets",
    "authors": [
      "Lei Hsiung",
      "Tianyu Pang",
      "Yung-Chen Tang",
      "Linyue Song",
      "Tsung-Yi Ho",
      "Pin-Yu Chen",
      "Yaoqing Yang"
    ],
    "year": "2025",
    "journal": "arXiv:2506.05346v1",
    "doi": "",
    "abstract": "Recent advancements in large language models (LLMs) have underscored their vulnerability to safety alignment jailbreaks, particularly when subjected to downstream fine-tuning. However, existing mitigation strategies primarily focus on reactively addressing jailbreak incidents after safety guardrails have been compromised, removing harmful gradients during fine-tuning, or continuously reinforcing safety alignment throughout fine-tuning. As such, they tend to overlook a critical upstream factor: the role of the original safety-alignment data. This paper therefore investigates the degradation of safety guardrails through the lens of representation similarity between upstream alignment datasets and downstream fine-tuning tasks. Our experiments demonstrate that high similarity between these datasets significantly weakens safety guardrails, making models more susceptible to jailbreaks. Conversely, low similarity between these two types of datasets yields substantially more robust models and thus reduces harmfulness score by up to 10.33%. By highlighting the importance of upstream dataset design in the building of durable safety guardrails and reducing real-world vulnerability to jailbreak attacks, these findings offer actionable insights for fine-tuning service providers.",
    "source_database": "arxiv",
    "arxiv_id": "2506.05346v1"
  },
  {
    "title": "Benchmarking LLM Guardrails in Handling Multilingual Toxicity",
    "authors": [
      "Yahan Yang",
      "Soham Dan",
      "Dan Roth",
      "Insup Lee"
    ],
    "year": "2024",
    "journal": "arXiv:2410.22153v1",
    "doi": "",
    "abstract": "With the ubiquity of Large Language Models (LLMs), guardrails have become crucial to detect and defend against toxic content. However, with the increasing pervasiveness of LLMs in multilingual scenarios, their effectiveness in handling multilingual toxic inputs remains unclear. In this work, we introduce a comprehensive multilingual test suite, spanning seven datasets and over ten languages, to benchmark the performance of state-of-the-art guardrails. We also investigates the resilience of guardrails against recent jailbreaking techniques, and assess the impact of in-context safety policies and language resource availability on guardrails' performance. Our findings show that existing guardrails are still ineffective at handling multilingual toxicity and lack robustness against jailbreaking prompts. This work aims to identify the limitations of guardrails and to build a more reliable and trustworthy LLMs in multilingual scenarios.",
    "source_database": "arxiv",
    "arxiv_id": "2410.22153v1"
  },
  {
    "title": "Knowledge-Driven Agentic Scientific Corpus Distillation Framework for Biomedical Large Language Models Training",
    "authors": [
      "Meng Xiao",
      "Xunxin Cai",
      "Qingqing Long",
      "Chengrui Wang",
      "Yuanchun Zhou",
      "Hengshu Zhu"
    ],
    "year": "2025",
    "journal": "arXiv:2504.19565v3",
    "doi": "",
    "abstract": "Corpus distillation for biomedical large language models (LLMs) seeks to address the pressing challenge of insufficient quantity and quality in open-source annotated scientific corpora, which remains a bottleneck for effective LLM training in biomedical research. This paper proposes a knowledge-driven, agentic framework for scientific corpus distillation, tailored explicitly for LLM training in the biomedical domain, addressing the challenge posed by the complex hierarchy of biomedical knowledge. Central to our approach is a collaborative multi-agent architecture, where specialized agents, each guided by the Medical Subject Headings (MeSH) hierarchy, work in concert to autonomously extract, synthesize, and self-evaluate high-quality textual data from vast scientific literature. This agentic framework collectively generates and refines domain-specific question-answer pairs, ensuring comprehensive coverage and consistency with biomedical ontologies while minimizing manual involvement. Extensive experimental results show that language models trained on our multi-agent distilled datasets achieve notable improvements in biomedical question-answering tasks, outperforming both strong life sciences LLM baselines and advanced proprietary models. Notably, our AI-Ready dataset enables Llama3-70B to surpass GPT-4 with MedPrompt and Med-PaLM-2, despite their larger scale. Detailed ablation studies and case analyses further validate the effectiveness and synergy of each agent within the framework, highlighting the potential of multi-agent collaboration in biomedical LLM training.",
    "source_database": "arxiv",
    "arxiv_id": "2504.19565v3"
  },
  {
    "title": "Unifying Large Language Models and Knowledge Graphs: A Roadmap",
    "authors": [
      "Shirui Pan",
      "Linhao Luo",
      "Yufei Wang",
      "Chen Chen",
      "Jiapu Wang",
      "Xindong Wu"
    ],
    "year": "2023",
    "journal": "arXiv:2306.08302v3",
    "doi": "10.1109/TKDE.2024.3352100",
    "abstract": "Large language models (LLMs), such as ChatGPT and GPT4, are making new waves in the field of natural language processing and artificial intelligence, due to their emergent ability and generalizability. However, LLMs are black-box models, which often fall short of capturing and accessing factual knowledge. In contrast, Knowledge Graphs (KGs), Wikipedia and Huapu for example, are structured knowledge models that explicitly store rich factual knowledge. KGs can enhance LLMs by providing external knowledge for inference and interpretability. Meanwhile, KGs are difficult to construct and evolving by nature, which challenges the existing methods in KGs to generate new facts and represent unseen knowledge. Therefore, it is complementary to unify LLMs and KGs together and simultaneously leverage their advantages. In this article, we present a forward-looking roadmap for the unification of LLMs and KGs. Our roadmap consists of three general frameworks, namely, 1) KG-enhanced LLMs, which incorporate KGs during the pre-training and inference phases of LLMs, or for the purpose of enhancing understanding of the knowledge learned by LLMs; 2) LLM-augmented KGs, that leverage LLMs for different KG tasks such as embedding, completion, construction, graph-to-text generation, and question answering; and 3) Synergized LLMs + KGs, in which LLMs and KGs play equal roles and work in a mutually beneficial way to enhance both LLMs and KGs for bidirectional reasoning driven by both data and knowledge. We review and summarize existing efforts within these three frameworks in our roadmap and pinpoint their future research directions.",
    "source_database": "arxiv",
    "arxiv_id": "2306.08302v3"
  },
  {
    "title": "AutoHall: Automated Factuality Hallucination Dataset Generation for Large Language Models",
    "authors": [
      "Zouying Cao",
      "Yifei Yang",
      "XiaoJing Li",
      "Hai Zhao"
    ],
    "year": "2023",
    "journal": "arXiv:2310.00259v3",
    "doi": "",
    "abstract": "Large language models (LLMs) have gained broad applications across various domains but still struggle with hallucinations. Currently, hallucinations occur frequently in the generation of factual content and pose a great challenge to trustworthy LLMs. However, hallucination detection is hindered by the laborious and expensive manual annotation of hallucinatory content. Meanwhile, as different LLMs exhibit distinct types and rates of hallucination, the collection of hallucination datasets is inherently model-specific, which also increases the cost. To address this issue, this paper proposes a method called $\\textbf{AutoHall}$ for $\\underline{Auto}$matically constructing model-specific $\\underline{Hall}$ucination datasets based on existing fact-checking datasets. The empirical results reveal variations in hallucination proportions and types among different models. Moreover, we introduce a zero-resource and black-box hallucination detection method based on self-contradiction to recognize the hallucination in our constructed dataset, achieving superior detection performance compared to baselines. Further analysis on our dataset provides insight into factors that may contribute to LLM hallucinations. Our codes and datasets are publicly available at https://github.com/zouyingcao/AutoHall.",
    "source_database": "arxiv",
    "arxiv_id": "2310.00259v3"
  },
  {
    "title": "Knowledge-tuning Large Language Models with Structured Medical Knowledge Bases for Reliable Response Generation in Chinese",
    "authors": [
      "Haochun Wang",
      "Sendong Zhao",
      "Zewen Qiang",
      "Zijian Li",
      "Nuwa Xi",
      "Yanrui Du",
      "MuZhen Cai",
      "Haoqiang Guo",
      "Yuhan Chen",
      "Haoming Xu",
      "Bing Qin",
      "Ting Liu"
    ],
    "year": "2023",
    "journal": "arXiv:2309.04175v1",
    "doi": "10.1145/3686807",
    "abstract": "Large Language Models (LLMs) have demonstrated remarkable success in diverse natural language processing (NLP) tasks in general domains. However, LLMs sometimes generate responses with the hallucination about medical facts due to limited domain knowledge. Such shortcomings pose potential risks in the utilization of LLMs within medical contexts. To address this challenge, we propose knowledge-tuning, which leverages structured medical knowledge bases for the LLMs to grasp domain knowledge efficiently and facilitate reliable response generation. We also release cMedKnowQA, a Chinese medical knowledge question-answering dataset constructed from medical knowledge bases to assess the medical knowledge proficiency of LLMs. Experimental results show that the LLMs which are knowledge-tuned with cMedKnowQA, can exhibit higher levels of accuracy in response generation compared with vanilla instruction-tuning and offer a new reliable way for the domain adaptation of LLMs.",
    "source_database": "arxiv",
    "arxiv_id": "2309.04175v1"
  },
  {
    "title": "Fact Grounded Attention: Eliminating Hallucination in Large Language Models Through Attention Level Knowledge Integration",
    "authors": [
      "Aayush Gupta"
    ],
    "year": "2025",
    "journal": "arXiv:2509.25252v2",
    "doi": "",
    "abstract": "\"The greatest enemy of knowledge is not ignorance, it is the illusion of knowledge.\" Large Language Models have conquered natural language but remain prisoners of their own probabilistic nature--confidently hallucinating facts they never truly knew. We present Fact Grounded Attention (FGA), a novel architectural modification that transforms unreliable language models into deterministic truth tellers by injecting verifiable knowledge directly into the attention mechanism. Unlike existing approaches that patch hallucinations after generation or prepend retrieved text, FGA intervenes at the mathematical heart of the transformer--the pre-softmax attention scores--creating a model that cannot hallucinate when facts exist in its knowledge base. Our experiments across 1,107 technical queries spanning smartphones, laptops, and electric vehicles demonstrate a transformation from 6.3% accuracy in vanilla Llama 3.2 to 99.7% accuracy with FGA. More critically, knowledge updates occur in under one second without retraining, compared to hours for parameter editing approaches. FGA doesn't just reduce hallucination--it eliminates it entirely for verifiable facts, marking a fundamental shift from probabilistic approximation to deterministic precision in neural language generation.",
    "source_database": "arxiv",
    "arxiv_id": "2509.25252v2"
  },
  {
    "title": "Document Understanding for Healthcare Referrals",
    "authors": [
      "Jimit Mistry",
      "Natalia M. Arzeno"
    ],
    "year": "2023",
    "journal": "arXiv:2309.13184v1",
    "doi": "10.1109/ICHI57859.2023.00067",
    "abstract": "Reliance on scanned documents and fax communication for healthcare referrals leads to high administrative costs and errors that may affect patient care. In this work we propose a hybrid model leveraging LayoutLMv3 along with domain-specific rules to identify key patient, physician, and exam-related entities in faxed referral documents. We explore some of the challenges in applying a document understanding model to referrals, which have formats varying by medical practice, and evaluate model performance using MUC-5 metrics to obtain appropriate metrics for the practical use case. Our analysis shows the addition of domain-specific rules to the transformer model yields greatly increased precision and F1 scores, suggesting a hybrid model trained on a curated dataset can increase efficiency in referral management.",
    "source_database": "arxiv",
    "arxiv_id": "2309.13184v1"
  },
  {
    "title": "Ingest-And-Ground: Dispelling Hallucinations from Continually-Pretrained LLMs with RAG",
    "authors": [
      "Chenhao Fang",
      "Derek Larson",
      "Shitong Zhu",
      "Sophie Zeng",
      "Wendy Summer",
      "Yanqing Peng",
      "Yuriy Hulovatyy",
      "Rajeev Rao",
      "Gabriel Forgues",
      "Arya Pudota",
      "Alex Goncalves",
      "Herv\u00e9 Robert"
    ],
    "year": "2024",
    "journal": "arXiv:2410.02825v2",
    "doi": "",
    "abstract": "This paper presents new methods that have the potential to improve privacy process efficiency with LLM and RAG. To reduce hallucination, we continually pre-train the base LLM model with a privacy-specific knowledge base and then augment it with a semantic RAG layer. Our evaluations demonstrate that this approach enhances the model performance (as much as doubled metrics compared to out-of-box LLM) in handling privacy-related queries, by grounding responses with factual information which reduces inaccuracies.",
    "source_database": "arxiv",
    "arxiv_id": "2410.02825v2"
  },
  {
    "title": "Ingest-And-Ground: Dispelling Hallucinations from Continually-Pretrained LLMs with RAG",
    "authors": [
      "Chenhao Fang",
      "Derek Larson",
      "Shitong Zhu",
      "Sophie Zeng",
      "Wendy Summer",
      "Yanqing Peng",
      "Yuriy Hulovatyy",
      "Rajeev Rao",
      "Gabriel Forgues",
      "Arya Pudota",
      "Alex Goncalves",
      "Herv\u00e9 Robert"
    ],
    "year": "2024",
    "journal": "ArXiv",
    "doi": "10.48550/arXiv.2410.02825",
    "pmid": "",
    "abstract": "This paper presents new methods that have the potential to improve privacy process efficiency with LLM and RAG. To reduce hallucination, we continually pre-train the base LLM model with a privacy-specific knowledge base and then augment it with a semantic RAG layer. Our evaluations demonstrate that this approach enhances the model performance (as much as doubled metrics compared to out-of-box LLM) in handling privacy-related queries, by grounding responses with factual information which reduces inaccuracies.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Detecting hallucinations in large language models using semantic entropy.",
    "authors": [
      "Farquhar S",
      "Kossen J",
      "Kuhn L",
      "Gal Y"
    ],
    "year": "2024",
    "journal": "Nature",
    "doi": "10.1038/s41586-024-07421-0",
    "pmid": "38898292",
    "abstract": "Large language model (LLM) systems, such as ChatGPT",
    "source_database": "pubmed"
  },
  {
    "title": "PediatricsGPT: Large Language Models as Chinese Medical Assistants for Pediatric Applications",
    "authors": [
      "Dingkang Yang",
      "Jinjie Wei",
      "Dongling Xiao",
      "Shunli Wang",
      "Tong Wu",
      "Gang Li",
      "Mingcheng Li",
      "Shuaibing Wang",
      "Jiawei Chen",
      "Yue Jiang",
      "Qingyao Xu",
      "Ke Li",
      "Peng Zhai",
      "Lihua Zhang"
    ],
    "year": "2024",
    "journal": "arXiv:2405.19266v4",
    "doi": "",
    "abstract": "Developing intelligent pediatric consultation systems offers promising prospects for improving diagnostic efficiency, especially in China, where healthcare resources are scarce. Despite recent advances in Large Language Models (LLMs) for Chinese medicine, their performance is sub-optimal in pediatric applications due to inadequate instruction data and vulnerable training procedures. To address the above issues, this paper builds PedCorpus, a high-quality dataset of over 300,000 multi-task instructions from pediatric textbooks, guidelines, and knowledge graph resources to fulfil diverse diagnostic demands. Upon well-designed PedCorpus, we propose PediatricsGPT, the first Chinese pediatric LLM assistant built on a systematic and robust training pipeline. In the continuous pre-training phase, we introduce a hybrid instruction pre-training mechanism to mitigate the internal-injected knowledge inconsistency of LLMs for medical domain adaptation. Immediately, the full-parameter Supervised Fine-Tuning (SFT) is utilized to incorporate the general medical knowledge schema into the models. After that, we devise a direct following preference optimization to enhance the generation of pediatrician-like humanistic responses. In the parameter-efficient secondary SFT phase, a mixture of universal-specific experts strategy is presented to resolve the competency conflict between medical generalist and pediatric expertise mastery. Extensive results based on the metrics, GPT-4, and doctor evaluations on distinct doctor downstream tasks show that PediatricsGPT consistently outperforms previous Chinese medical LLMs. Our model and dataset will be open-source for community development.",
    "source_database": "arxiv",
    "arxiv_id": "2405.19266v4"
  },
  {
    "title": "Commander-GPT: Fully Unleashing the Sarcasm Detection Capability of Multi-Modal Large Language Models",
    "authors": [
      "Yazhou Zhang",
      "Chunwang Zou",
      "Bo Wang",
      "Jing Qin"
    ],
    "year": "2025",
    "journal": "arXiv:2503.18681v3",
    "doi": "",
    "abstract": "Sarcasm detection, as a crucial research direction in the field of Natural Language Processing (NLP), has attracted widespread attention. Traditional sarcasm detection tasks have typically focused on single-modal approaches (e.g., text), but due to the implicit and subtle nature of sarcasm, such methods often fail to yield satisfactory results. In recent years, researchers have shifted the focus of sarcasm detection to multi-modal approaches. However, effectively leveraging multi-modal information to accurately identify sarcastic content remains a challenge that warrants further exploration. Leveraging the powerful integrated processing capabilities of Multi-Modal Large Language Models (MLLMs) for various information sources, we propose an innovative multi-modal Commander-GPT framework. Inspired by military strategy, we first decompose the sarcasm detection task into six distinct sub-tasks. A central commander (decision-maker) then assigns the best-suited large language model to address each specific sub-task. Ultimately, the detection results from each model are aggregated to identify sarcasm. We conducted extensive experiments on MMSD and MMSD 2.0, utilizing four multi-modal large language models and six prompting strategies. Our experiments demonstrate that our approach achieves state-of-the-art performance, with a 19.3% improvement in F1 score, without necessitating fine-tuning or ground-truth rationales.",
    "source_database": "arxiv",
    "arxiv_id": "2503.18681v3"
  },
  {
    "title": "MedHal: An Evaluation Dataset for Medical Hallucination Detection",
    "authors": [
      "Gaya Mehenni",
      "Fabrice Lamarche",
      "Odette Rios-Ibacache",
      "John Kildea",
      "Amal Zouaq"
    ],
    "year": "2025",
    "journal": "arXiv:2504.08596v2",
    "doi": "",
    "abstract": "We present MedHal, a novel large-scale dataset specifically designed to evaluate if models can detect hallucinations in medical texts. Current hallucination detection methods face significant limitations when applied to specialized domains like medicine, where they can have disastrous consequences. Existing medical datasets are either too small, containing only a few hundred samples, or focus on a single task like Question Answering or Natural Language Inference. MedHal addresses these gaps by: (1) incorporating diverse medical text sources and tasks; (2) providing a substantial volume of annotated samples suitable for training medical hallucination detection models; and (3) including explanations for factual inconsistencies to guide model learning. We demonstrate MedHal's utility by training and evaluating a baseline medical hallucination detection model, showing improvements over general-purpose hallucination detection approaches. This resource enables more efficient evaluation of medical text generation systems while reducing reliance on costly expert review, potentially accelerating the development of medical AI research.",
    "source_database": "arxiv",
    "arxiv_id": "2504.08596v2"
  },
  {
    "title": "Editing Factual Knowledge and Explanatory Ability of Medical Large Language Models",
    "authors": [
      "Derong Xu",
      "Ziheng Zhang",
      "Zhihong Zhu",
      "Zhenxi Lin",
      "Qidong Liu",
      "Xian Wu",
      "Tong Xu",
      "Wanyu Wang",
      "Yuyang Ye",
      "Xiangyu Zhao",
      "Enhong Chen",
      "Yefeng Zheng"
    ],
    "year": "2024",
    "journal": "arXiv:2402.18099v3",
    "doi": "",
    "abstract": "Model editing aims to precisely alter the behaviors of large language models (LLMs) in relation to specific knowledge, while leaving unrelated knowledge intact. This approach has proven effective in addressing issues of hallucination and outdated information in LLMs. However, the potential of using model editing to modify knowledge in the medical field remains largely unexplored, even though resolving hallucination is a pressing need in this area. Our observations indicate that current methods face significant challenges in dealing with specialized and complex knowledge in medical domain. Therefore, we propose MedLaSA, a novel Layer-wise Scalable Adapter strategy for medical model editing. MedLaSA harnesses the strengths of both adding extra parameters and locate-then-edit methods for medical model editing. We utilize causal tracing to identify the association of knowledge in neurons across different layers, and generate a corresponding scale set from the association value for each piece of knowledge. Subsequently, we incorporate scalable adapters into the dense layers of LLMs. These adapters are assigned scaling values based on the corresponding specific knowledge, which allows for the adjustment of the adapter's weight and rank. The more similar the content, the more consistent the scale between them. This ensures precise editing of semantically identical knowledge while avoiding impact on unrelated knowledge. To evaluate the editing impact on the behaviours of LLMs, we propose two model editing studies for medical domain: (1) editing factual knowledge for medical specialization and (2) editing the explanatory ability for complex knowledge. We build two novel medical benchmarking datasets and introduce a series of challenging and comprehensive metrics. Extensive experiments on medical LLMs demonstrate the editing efficiency of MedLaSA, without affecting unrelated knowledge.",
    "source_database": "arxiv",
    "arxiv_id": "2402.18099v3"
  },
  {
    "title": "A scoping review on multimodal deep learning in biomedical images and texts",
    "authors": [
      "Zhaoyi Sun",
      "Mingquan Lin",
      "Qingqing Zhu",
      "Qianqian Xie",
      "Fei Wang",
      "Zhiyong Lu",
      "Yifan Peng"
    ],
    "year": "2023",
    "journal": "arXiv:2307.07362v3",
    "doi": "10.1016/j.jbi.2023.104482",
    "abstract": "Computer-assisted diagnostic and prognostic systems of the future should be capable of simultaneously processing multimodal data. Multimodal deep learning (MDL), which involves the integration of multiple sources of data, such as images and text, has the potential to revolutionize the analysis and interpretation of biomedical data. However, it only caught researchers' attention recently. To this end, there is a critical need to conduct a systematic review on this topic, identify the limitations of current work, and explore future directions. In this scoping review, we aim to provide a comprehensive overview of the current state of the field and identify key concepts, types of studies, and research gaps with a focus on biomedical images and texts joint learning, mainly because these two were the most commonly available data types in MDL research. This study reviewed the current uses of multimodal deep learning on five tasks: (1) Report generation, (2) Visual question answering, (3) Cross-modal retrieval, (4) Computer-aided diagnosis, and (5) Semantic segmentation. Our results highlight the diverse applications and potential of MDL and suggest directions for future research in the field. We hope our review will facilitate the collaboration of natural language processing (NLP) and medical imaging communities and support the next generation of decision-making and computer-assisted diagnostic system development.",
    "source_database": "arxiv",
    "arxiv_id": "2307.07362v3"
  },
  {
    "title": "ConsistencyAI: A Benchmark to Assess LLMs' Factual Consistency When Responding to Different Demographic Groups",
    "authors": [
      "Peter Banyas",
      "Shristi Sharma",
      "Alistair Simmons",
      "Atharva Vispute"
    ],
    "year": "2025",
    "journal": "arXiv:2510.13852v2",
    "doi": "",
    "abstract": "Is an LLM telling you different facts than it's telling me? This paper introduces ConsistencyAI, an independent benchmark for measuring the factual consistency of large language models (LLMs) for different personas. ConsistencyAI tests whether, when users of different demographics ask identical questions, the model responds with factually inconsistent answers. Designed without involvement from LLM providers, this benchmark offers impartial evaluation and accountability. In our experiment, we queried 19 LLMs with prompts that requested 5 facts for each of 15 topics. We repeated this query 100 times for each LLM, each time adding prompt context from a different persona selected from a subset of personas modeling the general population. We processed the responses into sentence embeddings, computed cross-persona cosine similarity, and computed the weighted average of cross-persona cosine similarity to calculate factual consistency scores. In 100-persona experiments, scores ranged from 0.9065 to 0.7896, and the mean was 0.8656, which we adopt as a benchmark threshold. xAI's Grok-3 is most consistent, while several lightweight models rank lowest. Consistency varies by topic: the job market is least consistent, G7 world leaders most consistent, and issues like vaccines or the Israeli-Palestinian conflict diverge by provider. These results show that both the provider and the topic shape the factual consistency. We release our code and interactive demo to support reproducible evaluation and encourage persona-invariant prompting strategies.",
    "source_database": "arxiv",
    "arxiv_id": "2510.13852v2"
  },
  {
    "title": "GraphRAG-Enabled Local Large Language Model for Gestational Diabetes Mellitus: Development of a Proof-of-Concept.",
    "authors": [
      "Evangelista E",
      "Ruba F",
      "Bukhari S",
      "Nazir A",
      "Sharma R"
    ],
    "year": "2026",
    "journal": "JMIR diabetes",
    "doi": "10.2196/76454",
    "pmid": "41490382",
    "abstract": "Gestational diabetes mellitus (GDM) is a prevalent chronic condition that affects maternal and fetal health outcomes worldwide, increasingly in underserved populations. While generative artificial intelligence (AI) and large language models (LLMs) have shown promise in health care, their application in GDM management remains underexplored.",
    "source_database": "pubmed"
  },
  {
    "title": "Using Bottleneck Adapters to Identify Cancer in Clinical Notes under Low-Resource Constraints",
    "authors": [
      "Omid Rohanian",
      "Hannah Jauncey",
      "Mohammadmahdi Nouriborji",
      "Vinod Kumar Chauhan",
      "Bronner P. Gon\u00e7alves",
      "Christiana Kartsonaki",
      "ISARIC Clinical Characterisation Group",
      "Laura Merson",
      "David Clifton"
    ],
    "year": "2022",
    "journal": "arXiv:2210.09440v2",
    "doi": "",
    "abstract": "Processing information locked within clinical health records is a challenging task that remains an active area of research in biomedical NLP. In this work, we evaluate a broad set of machine learning techniques ranging from simple RNNs to specialised transformers such as BioBERT on a dataset containing clinical notes along with a set of annotations indicating whether a sample is cancer-related or not.   Furthermore, we specifically employ efficient fine-tuning methods from NLP, namely, bottleneck adapters and prompt tuning, to adapt the models to our specialised task. Our evaluations suggest that fine-tuning a frozen BERT model pre-trained on natural language and with bottleneck adapters outperforms all other strategies, including full fine-tuning of the specialised BioBERT model. Based on our findings, we suggest that using bottleneck adapters in low-resource situations with limited access to labelled data or processing capacity could be a viable strategy in biomedical text mining. The code used in the experiments are going to be made available at https://github.com/omidrohanian/bottleneck-adapters.",
    "source_database": "arxiv",
    "arxiv_id": "2210.09440v2"
  },
  {
    "title": "Soft Inductive Bias Approach via Explicit Reasoning Perspectives in Inappropriate Utterance Detection Using Large Language Models",
    "authors": [
      "Ju-Young Kim",
      "Ji-Hong Park",
      "Se-Yeon Lee",
      "Sujin Park",
      "Gun-Woo Kim"
    ],
    "year": "2025",
    "journal": "arXiv:2512.08480v1",
    "doi": "",
    "abstract": "Recent incidents in certain online games and communities, where anonymity is guaranteed, show that unchecked inappropriate remarks frequently escalate into verbal abuse and even criminal behavior, raising significant social concerns. Consequently, there is a growing need for research on techniques that can detect inappropriate utterances within conversational texts to help build a safer communication environment. Although large-scale language models trained on Korean corpora and chain-of-thought reasoning have recently gained attention, research applying these approaches to inappropriate utterance detection remains limited. In this study, we propose a soft inductive bias approach that explicitly defines reasoning perspectives to guide the inference process, thereby promoting rational decision-making and preventing errors that may arise during reasoning. We fine-tune a Korean large language model using the proposed method and conduct both quantitative performance comparisons and qualitative evaluations across different training strategies. Experimental results show that the Kanana-1.5 model achieves an average accuracy of 87.0046, improving by approximately 3.89 percent over standard supervised learning. These findings indicate that the proposed method goes beyond simple knowledge imitation by large language models and enables more precise and consistent judgments through constrained reasoning perspectives, demonstrating its effectiveness for inappropriate utterance detection.",
    "source_database": "arxiv",
    "arxiv_id": "2512.08480v1"
  },
  {
    "title": "Knowledge Overshadowing Causes Amalgamated Hallucination in Large Language Models",
    "authors": [
      "Yuji Zhang",
      "Sha Li",
      "Jiateng Liu",
      "Pengfei Yu",
      "Yi R. Fung",
      "Jing Li",
      "Manling Li",
      "Heng Ji"
    ],
    "year": "2024",
    "journal": "arXiv:2407.08039v1",
    "doi": "",
    "abstract": "Hallucination is often regarded as a major impediment for using large language models (LLMs), especially for knowledge-intensive tasks. Even when the training corpus consists solely of true statements, language models still generate hallucinations in the form of amalgamations of multiple facts. We coin this phenomenon as \u201cknowledge overshadowing\u201d: when we query knowledge from a language model with multiple conditions, some conditions overshadow others, leading to hallucinated outputs. This phenomenon partially stems from training data imbalance, which we verify on both pretrained models and fine-tuned models, over a wide range of LM model families and sizes. From a theoretical point of view, knowledge overshadowing can be interpreted as over-generalization of the dominant conditions (patterns). We show that the hallucination rate grows with both the imbalance ratio (between the popular and unpopular condition) and the length of dominant condition description, consistent with our derived generalization bound. Finally, we propose to utilize overshadowing conditions as a signal to catch hallucination before it is produced, along with a training-free self-contrastive decoding method to alleviate hallucination during inference. Our proposed approach showcases up to 82% F1 for hallucination anticipation and 11.2% to 39.4% hallucination control, with different models and datasets.",
    "source_database": "arxiv",
    "arxiv_id": "2407.08039v1"
  },
  {
    "title": "(Im)possibility of Automated Hallucination Detection in Large Language Models",
    "authors": [
      "Amin Karbasi",
      "Omar Montasser",
      "John Sous",
      "Grigoris Velegkas"
    ],
    "year": "2025",
    "journal": "arXiv:2504.17004v2",
    "doi": "",
    "abstract": "Is automated hallucination detection possible? In this work, we introduce a theoretical framework to analyze the feasibility of automatically detecting hallucinations produced by large language models (LLMs). Inspired by the classical Gold-Angluin framework for language identification and its recent adaptation to language generation by Kleinberg and Mullainathan, we investigate whether an algorithm, trained on examples drawn from an unknown target language $K$ (selected from a countable collection) and given access to an LLM, can reliably determine whether the LLM's outputs are correct or constitute hallucinations.   First, we establish an equivalence between hallucination detection and the classical task of language identification. We prove that any hallucination detection method can be converted into a language identification method, and conversely, algorithms solving language identification can be adapted for hallucination detection. Given the inherent difficulty of language identification, this implies that hallucination detection is fundamentally impossible for most language collections if the detector is trained using only correct examples from the target language.   Second, we show that the use of expert-labeled feedback, i.e., training the detector with both positive examples (correct statements) and negative examples (explicitly labeled incorrect statements), dramatically changes this conclusion. Under this enriched training regime, automated hallucination detection becomes possible for all countable language collections.   These results highlight the essential role of expert-labeled examples in training hallucination detectors and provide theoretical support for feedback-based methods, such as reinforcement learning with human feedback (RLHF), which have proven critical for reliable LLM deployment.",
    "source_database": "arxiv",
    "arxiv_id": "2504.17004v2"
  },
  {
    "title": "Thyro-GenAI: A Chatbot Using Retrieval-Augmented Generative Models for Personalized Thyroid Disease Management.",
    "authors": [
      "Shin M",
      "Song J",
      "Kim MG",
      "Yu HW",
      "Choe EK",
      "Chai YJ"
    ],
    "year": "2025",
    "journal": "Journal of clinical medicine",
    "doi": "10.3390/jcm14072450",
    "pmid": "40217905",
    "abstract": null,
    "source_database": "pubmed"
  },
  {
    "title": "A Review of Large Language Models in Medical Education, Clinical Decision Support, and Healthcare Administration.",
    "authors": [
      "Vrdoljak J",
      "Boban Z",
      "Vilovi\u0107 M",
      "Kumri\u0107 M",
      "Bo\u017ei\u0107 J"
    ],
    "year": "2025",
    "journal": "Healthcare (Basel, Switzerland)",
    "doi": "10.3390/healthcare13060603",
    "pmid": "40150453",
    "abstract": null,
    "source_database": "pubmed"
  },
  {
    "title": "AR-RAG: Autoregressive Retrieval Augmentation for Image Generation",
    "authors": [
      "Jingyuan Qi",
      "Zhiyang Xu",
      "Qifan Wang",
      "Lifu Huang"
    ],
    "year": "2025",
    "journal": "arXiv:2506.06962v3",
    "doi": "",
    "abstract": "We introduce Autoregressive Retrieval Augmentation (AR-RAG), a novel paradigm that enhances image generation by autoregressively incorporating k-nearest neighbor retrievals at the patch level. Unlike prior methods that perform a single, static retrieval before generation and condition the entire generation on fixed reference images, AR-RAG performs context-aware retrievals at each generation step, using prior-generated patches as queries to retrieve and incorporate the most relevant patch-level visual references, enabling the model to respond to evolving generation needs while avoiding limitations (e.g., over-copying, stylistic bias, etc.) prevalent in existing methods. To realize AR-RAG, we propose two parallel frameworks: (1) Distribution-Augmentation in Decoding (DAiD), a training-free plug-and-use decoding strategy that directly merges the distribution of model-predicted patches with the distribution of retrieved patches, and (2) Feature-Augmentation in Decoding (FAiD), a parameter-efficient fine-tuning method that progressively smooths the features of retrieved patches via multi-scale convolution operations and leverages them to augment the image generation process. We validate the effectiveness of AR-RAG on widely adopted benchmarks, including Midjourney-30K, GenEval and DPG-Bench, demonstrating significant performance gains over state-of-the-art image generation models.",
    "source_database": "arxiv",
    "arxiv_id": "2506.06962v3"
  },
  {
    "title": "Privacy-preserving machine learning for healthcare: open challenges and future perspectives",
    "authors": [
      "Alejandro Guerra-Manzanares",
      "L. Julian Lechuga Lopez",
      "Michail Maniatakos",
      "Farah E. Shamout"
    ],
    "year": "2023",
    "journal": "arXiv:2303.15563v1",
    "doi": "10.1007/978-3-031-39539-0_3",
    "abstract": "Machine Learning (ML) has recently shown tremendous success in modeling various healthcare prediction tasks, ranging from disease diagnosis and prognosis to patient treatment. Due to the sensitive nature of medical data, privacy must be considered along the entire ML pipeline, from model training to inference. In this paper, we conduct a review of recent literature concerning Privacy-Preserving Machine Learning (PPML) for healthcare. We primarily focus on privacy-preserving training and inference-as-a-service, and perform a comprehensive review of existing trends, identify challenges, and discuss opportunities for future research directions. The aim of this review is to guide the development of private and efficient ML models in healthcare, with the prospects of translating research efforts into real-world settings.",
    "source_database": "arxiv",
    "arxiv_id": "2303.15563v1"
  },
  {
    "title": "Lightweight Transformers for Clinical Natural Language Processing",
    "authors": [
      "Omid Rohanian",
      "Mohammadmahdi Nouriborji",
      "Hannah Jauncey",
      "Samaneh Kouchaki",
      "ISARIC Clinical Characterisation Group",
      "Lei Clifton",
      "Laura Merson",
      "David A. Clifton"
    ],
    "year": "2023",
    "journal": "arXiv:2302.04725v1",
    "doi": "10.1017/S1351324923000542",
    "abstract": "Specialised pre-trained language models are becoming more frequent in NLP since they can potentially outperform models trained on generic texts. BioBERT and BioClinicalBERT are two examples of such models that have shown promise in medical NLP tasks. Many of these models are overparametrised and resource-intensive, but thanks to techniques like Knowledge Distillation (KD), it is possible to create smaller versions that perform almost as well as their larger counterparts. In this work, we specifically focus on development of compact language models for processing clinical texts (i.e. progress notes, discharge summaries etc). We developed a number of efficient lightweight clinical transformers using knowledge distillation and continual learning, with the number of parameters ranging from 15 million to 65 million. These models performed comparably to larger models such as BioBERT and ClinicalBioBERT and significantly outperformed other compact models trained on general or biomedical data. Our extensive evaluation was done across several standard datasets and covered a wide range of clinical text-mining tasks, including Natural Language Inference, Relation Extraction, Named Entity Recognition, and Sequence Classification. To our knowledge, this is the first comprehensive study specifically focused on creating efficient and compact transformers for clinical NLP tasks. The models and code used in this study can be found on our Huggingface profile at https://huggingface.co/nlpie and Github page at https://github.com/nlpie-research/Lightweight-Clinical-Transformers, respectively, promoting reproducibility of our results.",
    "source_database": "arxiv",
    "arxiv_id": "2302.04725v1"
  },
  {
    "title": "Are Large Language Models Ready for Healthcare? A Comparative Study on Clinical Language Understanding",
    "authors": [
      "Yuqing Wang",
      "Yun Zhao",
      "Linda Petzold"
    ],
    "year": "2023",
    "journal": "arXiv:2304.05368v3",
    "doi": "",
    "abstract": "Large language models (LLMs) have made significant progress in various domains, including healthcare. However, the specialized nature of clinical language understanding tasks presents unique challenges and limitations that warrant further investigation. In this study, we conduct a comprehensive evaluation of state-of-the-art LLMs, namely GPT-3.5, GPT-4, and Bard, within the realm of clinical language understanding tasks. These tasks span a diverse range, including named entity recognition, relation extraction, natural language inference, semantic textual similarity, document classification, and question-answering. We also introduce a novel prompting strategy, self-questioning prompting (SQP), tailored to enhance LLMs' performance by eliciting informative questions and answers pertinent to the clinical scenarios at hand. Our evaluation underscores the significance of task-specific learning strategies and prompting techniques for improving LLMs' effectiveness in healthcare-related tasks. Additionally, our in-depth error analysis on the challenging relation extraction task offers valuable insights into error distribution and potential avenues for improvement using SQP. Our study sheds light on the practical implications of employing LLMs in the specialized domain of healthcare, serving as a foundation for future research and the development of potential applications in healthcare settings.",
    "source_database": "arxiv",
    "arxiv_id": "2304.05368v3"
  },
  {
    "title": "DeepCodeSeek: Real-Time API Retrieval for Context-Aware Code Generation",
    "authors": [
      "Esakkivel Esakkiraja",
      "Denis Akhiyarov",
      "Aditya Shanmugham",
      "Chitra Ganapathy"
    ],
    "year": "2025",
    "journal": "arXiv:2509.25716v1",
    "doi": "",
    "abstract": "Current search techniques are limited to standard RAG query-document applications. In this paper, we propose a novel technique to expand the code and index for predicting the required APIs, directly enabling high-quality, end-to-end code generation for auto-completion and agentic AI applications. We address the problem of API leaks in current code-to-code benchmark datasets by introducing a new dataset built from real-world ServiceNow Script Includes that capture the challenge of unclear API usage intent in the code. Our evaluation metrics show that this method achieves 87.86% top-40 retrieval accuracy, allowing the critical context with APIs needed for successful downstream code generation. To enable real-time predictions, we develop a comprehensive post-training pipeline that optimizes a compact 0.6B reranker through synthetic dataset generation, supervised fine-tuning, and reinforcement learning. This approach enables our compact reranker to outperform a much larger 8B model while maintaining 2.5x reduced latency, effectively addressing the nuances of enterprise-specific code without the computational overhead of larger models.",
    "source_database": "arxiv",
    "arxiv_id": "2509.25716v1"
  },
  {
    "title": "Predicting Failures of LLMs to Link Biomedical Ontology Terms to Identifiers Evidence Across Models and Ontologies",
    "authors": [
      "Daniel B. Hier",
      "Steven Keith Platt",
      "Tayo Obafemi-Ajayi"
    ],
    "year": "2025",
    "journal": "arXiv:2509.04458v2",
    "doi": "",
    "abstract": "Large language models often perform well on biomedical NLP tasks but may fail to link ontology terms to their correct identifiers. We investigate why these failures occur by analyzing predictions across two major ontologies, Human Phenotype Ontology and Gene Ontology, and two high-performing models, GPT-4o and LLaMa 3.1 405B. We evaluate nine candidate features related to term familiarity, identifier usage, morphology, and ontology structure. Univariate and multivariate analyses show that exposure to ontology identifiers is the strongest predictor of linking success.",
    "source_database": "arxiv",
    "arxiv_id": "2509.04458v2"
  },
  {
    "title": "GaRAGe: A Benchmark with Grounding Annotations for RAG Evaluation",
    "authors": [
      "Ionut-Teodor Sorodoc",
      "Leonardo F. R. Ribeiro",
      "Rexhina Blloshmi",
      "Christopher Davis",
      "Adri\u00e0 de Gispert"
    ],
    "year": "2025",
    "journal": "arXiv:2506.07671v1",
    "doi": "",
    "abstract": "We present GaRAGe, a large RAG benchmark with human-curated long-form answers and annotations of each grounding passage, allowing a fine-grained evaluation of whether LLMs can identify relevant grounding when generating RAG answers. Our benchmark contains 2366 questions of diverse complexity, dynamism, and topics, and includes over 35K annotated passages retrieved from both private document sets and the Web, to reflect real-world RAG use cases. This makes it an ideal test bed to evaluate an LLM's ability to identify only the relevant information necessary to compose a response, or provide a deflective response when there is insufficient information. Evaluations of multiple state-of-the-art LLMs on GaRAGe show that the models tend to over-summarise rather than (a) ground their answers strictly on the annotated relevant passages (reaching at most a Relevance-Aware Factuality Score of 60%), or (b) deflect when no relevant grounding is available (reaching at most 31% true positive rate in deflections). The F1 in attribution to relevant sources is at most 58.9%, and we show that performance is particularly reduced when answering time-sensitive questions and when having to draw knowledge from sparser private grounding sources.",
    "source_database": "arxiv",
    "arxiv_id": "2506.07671v1"
  },
  {
    "title": "GaRAGe: A Benchmark with Grounding Annotations for RAG Evaluation",
    "authors": [
      "I. Sorodoc",
      "Leonardo F. R. Ribeiro",
      "Rexhina Blloshmi",
      "Christopher Davis",
      "A. D. Gispert"
    ],
    "year": "2025",
    "journal": "ArXiv",
    "doi": "10.48550/arXiv.2506.07671",
    "pmid": "",
    "abstract": "We present GaRAGe, a large RAG benchmark with human-curated long-form answers and annotations of each grounding passage, allowing a fine-grained evaluation of whether LLMs can identify relevant grounding when generating RAG answers. Our benchmark contains 2366 questions of diverse complexity, dynamism, and topics, and includes over 35K annotated passages retrieved from both private document sets and the Web, to reflect real-world RAG use cases. This makes it an ideal test bed to evaluate an LLM's ability to identify only the relevant information necessary to compose a response, or provide a deflective response when there is insufficient information. Evaluations of multiple state-of-the-art LLMs on GaRAGe show that the models tend to over-summarise rather than (a) ground their answers strictly on the annotated relevant passages (reaching at most a Relevance-Aware Factuality Score of 60%), or (b) deflect when no relevant grounding is available (reaching at most 31% true positive rate in deflections). The F1 in attribution to relevant sources is at most 58.9%, and we show that performance is particularly reduced when answering time-sensitive questions and when having to draw knowledge from sparser private grounding sources.",
    "source_database": "semantic_scholar"
  },
  {
    "title": "Reshaping Biomedical Scientific Literature in a RAG Pipeline for Question Answering",
    "authors": [
      "Ma\u00ebl Lesavourey",
      "Gilles Hubert"
    ],
    "year": "2025",
    "journal": "",
    "doi": "",
    "pmid": "",
    "abstract": "",
    "source_database": "semantic_scholar"
  },
  {
    "title": "How do language models learn facts? Dynamics, curricula and hallucinations",
    "authors": [
      "Nicolas Zucchet",
      "J\u00f6rg Bornschein",
      "Stephanie Chan",
      "Andrew Lampinen",
      "Razvan Pascanu",
      "Soham De"
    ],
    "year": "2025",
    "journal": "arXiv:2503.21676v2",
    "doi": "",
    "abstract": "Large language models accumulate vast knowledge during pre-training, yet the dynamics governing this acquisition remain poorly understood. This work investigates the learning dynamics of language models on a synthetic factual recall task, uncovering three key findings: First, language models learn in three phases, exhibiting a performance plateau before acquiring precise factual knowledge. Mechanistically, this plateau coincides with the formation of attention-based circuits that support recall. Second, the training data distribution significantly impacts learning dynamics, as imbalanced distributions lead to shorter plateaus. Finally, hallucinations emerge simultaneously with knowledge, and integrating new knowledge into the model through fine-tuning is challenging, as it quickly corrupts its existing parametric memories. Our results emphasize the importance of data distribution in knowledge acquisition and suggest novel data scheduling strategies to accelerate neural network training.",
    "source_database": "arxiv",
    "arxiv_id": "2503.21676v2"
  }
]