[
  {
    "title": "ChatTogoVar: a TogoVar-based retrieval-augmented generation system for precise genomic variant interpretation.",
    "authors": [
      "Mitsuhashi N",
      "Fujiwara T",
      "Yamaguchi A"
    ],
    "year": "2026",
    "journal": "Human genome variation",
    "doi": "10.1038/s41439-026-00344-4",
    "pmid": "41956998",
    "abstract": "Large language models (LLMs) have recently been adopted to assist in the interpretation of human genomic variants. However, general-purpose LLMs can produce incorrect outputs (commonly termed 'hallucinations'), particularly on specialized queries, raising concerns about their reliability for variant interpretation. Here, to mitigate this risk, we developed ChatTogoVar, a retrieval-augmented generation system that queries TogoVar, a variant database that integrates information, such as allele frequency and clinical significance, and incorporates the retrieved results into prompts. We constructed a benchmark of 150 questions sampled from a predefined pool of 1500 template-variant combinations (50 templates × 30 variants). For large-scale assessment, we used the full 1500-question pool for automated LLM-based scoring. ChatTogoVar achieved the highest score for 135/150 questions, outperforming both a general-purpose LLM and an existing specialized system. Furthermore, automatic evaluation of all 1500 questions by an LLM confirmed the same trend. These results suggest that integrating a reliable variant database with an LLM can improve the accuracy of variant interpretation and that ChatTogoVar may serve as a practical tool to support genomic medicine and personalized healthcare.",
    "source_database": "pubmed"
  },
  {
    "title": "Promises and challenges of applying large language models in the healthcare domain.",
    "authors": [
      "Wang Q",
      "Gong Z",
      "Lai Z",
      "Bu L",
      "Dahlweid FM",
      "Sun H"
    ],
    "year": "2026",
    "journal": "Frontiers in digital health",
    "doi": "10.3389/fdgth.2026.1772274",
    "pmid": "41924178",
    "abstract": "Large language models are rapidly moving from theoretical concepts to active clinical pilots. Current approaches diverge between general-purpose models, which adapt to healthcare via prompt engineering, and domain-specific models, which prioritize deep alignment with medical knowledge graphs to ensure safety. Despite reported benefits in documentation efficiency and diagnostic reasoning, significant challenges remain regarding hallucination, privacy, and the validity of evaluation metrics. This Mini Review synthesizes current evidence, contrasts these two modeling paradigms, highlights key controversies, and maps out future development routes including retrieval-augmented generation and agentic architectures.",
    "source_database": "pubmed"
  },
  {
    "title": "Harnessing Large Language Models in Neonatal Intraventricular Hemorrhage: Exploring Retrieval Augmented Generation Methodology for Prognostic Variable Discovery.",
    "authors": [
      "Arora T",
      "Beam K"
    ],
    "year": "2026",
    "journal": "American journal of perinatology",
    "doi": "10.1055/a-2838-5446",
    "pmid": "41871598",
    "abstract": "The objective of this study is to evaluate whether large language models (LLMs) can autonomously synthesize existing literature and accurately extract prognostic variables for neonatal intraventricular hemorrhage (IVH) and its outcomes while assessing their capability for clinical feature ranking and risk stratification.This pilot study employed a systematic literature review combined with retrieval-augmented generation (RAG) methodology. GPT 4 (OpenAI) and Claude Sonnet (4.0, Anthropic) were prompted to identify peer-reviewed studies utilizing machine learning and deep learning to predict IVH outcomes in preterm neonates. Data extraction was prompted to follow TRIPOD artificial intelligence (AI) guidelines, capturing study design, population characteristics, predictor variables, and outcome measures. Semi-automated RAG extraction was performed with manual validation to mitigate hallucination risk.LLMs initially identified 39 studies, with 28 meeting some or all the validation criteria after excluding references that were hallucinated. From these, 14 distinct prognostic predictors were extracted across four outcome domains: mortality, progression, complications, and resolution. Universal high-impact predictors included gestational age (13 mentions; 41%), birth weight (8 mentions, 25%), and Apgar scores (11 mentions, 34%). Variables were categorized into three clinical tiers based on frequency, outcome breadth, and modifiability. A preliminary risk stratification model demonstrated high-risk neonates (<28 weeks, <1,000g, Apgar <3) with estimated progression risk >70%, and mortality >50%, while low-risk neonates (>32 weeks, >1,500 g, Apgar > 5) showed favorable trajectories.This study demonstrates that LLMs can synthesize medical literature and extract clinically relevant prognostic variables for neonatal IVH outcomes. 
However, LLM outputs were susceptible to hallucinations and incomplete data synthesis, underscoring the need for rigorous clinical oversight and human validation to ensure reliability. The identified universal predictors provide a foundation for developing AI-assisted clinical decision support tools. Notable research gaps include the complete absence of resolution prediction studies and limited investigation of complication predictors, highlighting opportunities for future investigation in precision neonatology. · LLMs can synthesize medical literature. · LLMs can assist in creating a prognostic ontology. · Human oversight is critical when using LLMs for healthcare.",
    "source_database": "pubmed"
  },
  {
    "title": "Integrating Fine-Tuning and Retrieval-Augmented Generation for Healthcare AI Systems: A Scoping Review.",
    "authors": [
      "Collaco BG",
      "Srinivasagam P",
      "Gomez-Cabello CA",
      "Haider SA",
      "Genovese A",
      "Wood NG",
      "Bagaria S",
      "Lifson MA",
      "Forte AJ"
    ],
    "year": "2026",
    "journal": "Bioengineering (Basel, Switzerland)",
    "doi": "10.3390/bioengineering13020225",
    "pmid": "41749764",
    "abstract": "(1) Background: Large language models (LLMs) show promise in healthcare but are constrained by hallucinations, static knowledge, and limited domain specificity. Fine-tuning (FT) and retrieval-augmented generation (RAG) offer complementary solutions, with FT embedding domain reasoning and RAG enabling dynamic, up-to-date knowledge access. Hybrid FT + RAG frameworks have been proposed to improve factual accuracy and clinical reliability. This scoping review synthesizes current evidence on such hybrids in healthcare AI. (2) Methods: The search across PubMed, IEEE Xplore, Google Scholar, and Embase identified studies implementing explicit FT + RAG hybrids in healthcare or biomedical tasks. Eligible studies reported empirical evaluations of LLM performance or behavior. Data were extracted on base models, FT strategies, RAG architectures, applications, and performance outcomes. (3) Results: Seven studies met inclusion criteria. FT + RAG systems consistently outperformed FT-only or RAG-only approaches across QA, clinical summarization, report generation, and decision support tasks. Parameter-efficient FT methods (e.g., LoRA) were common, while RAG implementations varied (dense, hybrid, hierarchical, multimodal, federated). Reported benefits included improved accuracy, reduced hallucination, and greater clinician preference and feasibility in protected settings. (4) Conclusions: FT + RAG frameworks represent a promising direction for clinically grounded healthcare AI, combining domain-specific reasoning with transparent, up-to-date retrieval. Future work should prioritize standardized evaluation, workflow integration, and governance to enable safe deployment.",
    "source_database": "pubmed"
  },
  {
    "title": "Effective prompt design for large language models in clinical practice.",
    "authors": [
      "Callens S"
    ],
    "year": "2026",
    "journal": "Acta clinica Belgica",
    "doi": "10.1080/17843286.2026.2613903",
    "pmid": "41524451",
    "abstract": "Large language models (LLMs) have emerged as transformative healthcare tools for clinical documentation, diagnostic reasoning, and medical education. However, effective utilization requires understanding prompt engineering principles-the strategic design of inputs to optimize performance while mitigating hallucination, bias, and outdated information.",
    "source_database": "pubmed"
  },
  {
    "title": "GraphRAG-Enabled Local Large Language Model for Gestational Diabetes Mellitus: Development of a Proof-of-Concept.",
    "authors": [
      "Evangelista E",
      "Ruba F",
      "Bukhari S",
      "Nazir A",
      "Sharma R"
    ],
    "year": "2026",
    "journal": "JMIR diabetes",
    "doi": "10.2196/76454",
    "pmid": "41490382",
    "abstract": "Gestational diabetes mellitus (GDM) is a prevalent chronic condition that affects maternal and fetal health outcomes worldwide, increasingly in underserved populations. While generative artificial intelligence (AI) and large language models (LLMs) have shown promise in health care, their application in GDM management remains underexplored.",
    "source_database": "pubmed"
  },
  {
    "title": "Refine Medical Diagnosis Using Generation Augmented Retrieval and Clinical Practice Guidelines.",
    "authors": [
      "Li W",
      "Zhang H",
      "Zhang H",
      "Li Z",
      "Dong Z",
      "Chen Y",
      "Bidargaddi N",
      "Liu H"
    ],
    "year": "2025",
    "journal": "IEEE journal of biomedical and health informatics",
    "doi": "10.1109/JBHI.2025.3641931",
    "pmid": "41364573",
    "abstract": "Current medical language models, adapted from large language models, typically predict ICD code-based diagnosis from electronic health records (EHRs) because these labels are readily available. However, ICD codes do not capture the nuanced, context-rich reasoning clinicians use for diagnosis. Clinicians synthesize diverse patient data and reference clinical practice guidelines (CPGs) to make evidence-based decisions. This misalignment limits the clinical utility of existing models. We introduce GARMLE-G, a Generation-Augmented Retrieval framework that grounds medical language model outputs in authoritative CPGs. Unlike conventional Retrieval-Augmented Generation based approaches, GARMLE-G enables hallucination-free outputs by directly retrieving authoritative guideline content without relying on model-generated text. It (1) integrates LLM predictions with EHR data to create semantically rich queries, (2) retrieves relevant CPG knowledge snippets via embedding similarity, and (3) fuses guideline content with model output to generate clinically aligned recommendations. A prototype system for hypertension and coronary heart disease diagnosis was developed and evaluated on multiple metrics, demonstrating superior retrieval precision, semantic relevance, and clinical guideline adherence compared to RAG-based baselines, while maintaining a lightweight architecture suitable for localized healthcare deployment. This work provides a scalable, low-cost, and hallucination-free method for grounding medical language models in evidence-based clinical practice, with strong potential for broader clinical deployment.",
    "source_database": "pubmed"
  },
  {
    "title": "Enhancing Large Language Models for Improved Accuracy and Safety in Medical Question Answering: Comparative Study.",
    "authors": [
      "Wang D",
      "Ye J",
      "Li J",
      "Liang J",
      "Zhang Q",
      "Hu Q",
      "Pan C",
      "Wang D",
      "Liu Z",
      "Shi W",
      "Guo M",
      "Li F",
      "Du W",
      "Zheng YF"
    ],
    "year": "2025",
    "journal": "JMIR medical education",
    "doi": "10.2196/70190",
    "pmid": "41329953",
    "abstract": "Large language models (LLMs) offer the potential to improve virtual patient-physician communication and reduce health care professionals' workload. However, limitations in accuracy, outdated knowledge, and safety issues restrict their effective use in real clinical settings. Addressing these challenges is crucial for making LLMs a reliable health care tool.",
    "source_database": "pubmed"
  },
  {
    "title": "The Development and Evaluation of a Retrieval-Augmented Generation Large Language Model Virtual Assistant for Postoperative Instructions.",
    "authors": [
      "Haider SA",
      "Prabha S",
      "Gomez Cabello CA",
      "Genovese A",
      "Collaco B",
      "Wood N",
      "London J",
      "Bagaria S",
      "Tao C",
      "Forte AJ"
    ],
    "year": "2025",
    "journal": "Bioengineering (Basel, Switzerland)",
    "doi": "10.3390/bioengineering12111219",
    "pmid": "41301175",
    "abstract": "During postoperative recovery, patients and their caregivers often lack crucial information, leading to numerous repetitive inquiries that burden healthcare providers. Traditional discharge materials, including paper handouts and patient portals, are often static, overwhelming, or underutilized, leading to patient overwhelm and contributing to unnecessary ER visits and overall healthcare overutilization. Conversational chatbots offer a solution, but Natural Language Processing (NLP) systems are often inflexible and limited in understanding, while powerful Large Language Models (LLMs) are prone to generating \"hallucinations\".",
    "source_database": "pubmed"
  },
  {
    "title": "Retrieval-augmented generation for interpreting clinical laboratory regulations using large language models.",
    "authors": [
      "Nanua S",
      "Steward R",
      "Neely B",
      "Datto M",
      "Youens K"
    ],
    "year": "2025",
    "journal": "Journal of pathology informatics",
    "doi": "10.1016/j.jpi.2025.100520",
    "pmid": "41244595",
    "abstract": "Large language models (LLMs) have demonstrated strong performance on general knowledge tasks, but they have important limitations as standalone tools for question answering in specialized domains where accuracy and consistency are critical. Retrieval-augmented generation (RAG) is a strategy in which LLM outputs are grounded in dynamically retrieved source documents, offering advantages in accuracy, explainability, and maintainability. We developed and evaluated a custom RAG system called Raven, designed to answer laboratory regulatory questions using the part of the Code of Federal Regulations (CFR) pertaining to laboratory (42 CFR Part 493) as an authoritative source. Raven employed a vector search pipeline and a LLM to generate grounded responses via a chatbot-style interface. The system was tested using 103 synthetic laboratory regulatory questions, 88 of which were explicitly addressed in the CFR. Compared to answers generated manually by a board-certified pathologist, Raven's responses were judged to be totally complete and correct in 92.0% of those 88 cases, with little irrelevant content and a low potential for regulatory or medical error. Performance declined significantly on questions not addressed in the CFR, confirming the system's grounding in the source documents. Most suboptimal responses were attributable to faulty source document retrieval rather than model hallucination or misinterpretation. These findings demonstrate that a basic RAG system can produce useful, accurate, and verifiable answers to complex regulatory questions. With appropriate safeguards and with thoughtful integration into user workflows, tools like Raven may serve as valuable decision-support systems in laboratory medicine and other knowledge-intensive healthcare domains.",
    "source_database": "pubmed"
  },
  {
    "title": "Performance of Large Language Models on the Acute Coronary Syndrome Guidelines Using Retrieval-Augmented Generation.",
    "authors": [
      "Alexandrou M",
      "Kumar S",
      "Mahtani AU",
      "Strepkos D",
      "Carvalho PEP",
      "Mutlu D",
      "Ser OS",
      "Rempakos A",
      "Mastrodemos OC",
      "Rangan BV",
      "Jalli S",
      "Sandoval Y",
      "Brilakis ES"
    ],
    "year": "2025",
    "journal": "JACC. Cardiovascular interventions",
    "doi": "10.1016/j.jcin.2025.08.019",
    "pmid": "41161918",
    "abstract": "Large language models (LLMs) are increasingly applied in interventional cardiology, but hallucinations limit their clinical utility.",
    "source_database": "pubmed"
  },
  {
    "title": "Evaluating Web Retrieval-Assisted Large Language Models With and Without Whitelisting for Evidence-Based Neurology: Comparative Study.",
    "authors": [
      "Masanneck L",
      "Epping PZ",
      "Meuth SG",
      "Pawlitzki M"
    ],
    "year": "2025",
    "journal": "Journal of medical Internet research",
    "doi": "10.2196/79379",
    "pmid": "41159599",
    "abstract": "Large language models (LLMs) coupled with real-time web retrieval are reshaping how clinicians and patients locate medical evidence, and as major search providers fuse LLMs into their interfaces, this hybrid approach might become the new \"gateway\" to the internet. However, open-web retrieval exposes models to nonprofessional sources, risking hallucinations and factual errors that might jeopardize evidence-based care.",
    "source_database": "pubmed"
  },
  {
    "title": "Development and Evaluation of a Retrieval-Augmented Generation Chatbot for Orthopedic and Trauma Surgery Patient Education: Mixed-Methods Study.",
    "authors": [
      "Baur D",
      "Ansorg J",
      "Heyde CE",
      "Voelker A"
    ],
    "year": "2025",
    "journal": "JMIR AI",
    "doi": "10.2196/75262",
    "pmid": "41134117",
    "abstract": "Large language models are increasingly applied in health care for documentation, patient education, and clinical decision support. However, their factual reliability can be compromised by hallucinations and a lack of source traceability. Retrieval-augmented generation (RAG) enhances response accuracy by combining generative models with document retrieval mechanisms. While promising in medical contexts, RAG-based systems remain underexplored in orthopedic and trauma surgery patient education, particularly in non-English settings.",
    "source_database": "pubmed"
  },
  {
    "title": "MEGA-RAG: a retrieval-augmented generation framework with multi-evidence guided answer refinement for mitigating hallucinations of LLMs in public health.",
    "authors": [
      "Xu S",
      "Yan Z",
      "Dai C",
      "Wu F"
    ],
    "year": "2025",
    "journal": "Frontiers in public health",
    "doi": "10.3389/fpubh.2025.1635381",
    "pmid": "41132171",
    "abstract": "The increasing adoption of large language models (LLMs) in public health has raised significant concerns about hallucinations-factually inaccurate or misleading outputs that can compromise clinical communication and policy decisions.",
    "source_database": "pubmed"
  },
  {
    "title": "Exploring Patient Perspectives, Engagement, and Output Quality in Doctor-Supervised Use of Artificial Intelligence During Informed Consent Consultation With ChatGPT and Retrieval Augmented Generation (RAG): Quantitative Exploratory Study.",
    "authors": [
      "Donner S",
      "Knauer P",
      "Kienzle A",
      "Dinneen J",
      "Burger J",
      "Perka C",
      "Donner S"
    ],
    "year": "2025",
    "journal": "Journal of medical Internet research",
    "doi": "10.2196/73717",
    "pmid": "41124695",
    "abstract": "Comprehensive preoperative education is essential for optimizing outcomes and ensuring informed consent in patients undergoing total hip arthroplasty (THA). Emerging artificial intelligence (AI) tools, such as ChatGPT, offer scalable support for patient education, but their clinical application requires rigorous evaluation to ensure accuracy, safety, and trust.",
    "source_database": "pubmed"
  },
  {
    "title": "Enhancing Large Language Models with Domain-specific Retrieval Augment Generation: A Case Study on Long-form Consumer Health Question Answering in Ophthalmology.",
    "authors": [
      "Gilson A",
      "Ai X",
      "Arunachalam T",
      "Chen Z",
      "Cheong KX",
      "Dave A",
      "Duic C",
      "Kibe M",
      "Kaminaka A",
      "Prasad M",
      "Siddig F",
      "Singer M",
      "Wong W",
      "Jin Q",
      "Keenan TDL",
      "Hu X",
      "Chew EY",
      "Lu Z",
      "Xu H",
      "Adelman RA",
      "Tham YC",
      "Chen Q"
    ],
    "year": "2024",
    "journal": "ArXiv",
    "doi": "",
    "pmid": "41031070",
    "abstract": "Despite the potential of Large Language Models (LLMs) in medicine, they may generate responses lacking supporting evidence or based on hallucinated evidence. While Retrieval Augment Generation (RAG) is popular to address this issue, few studies implemented and evaluated RAG in downstream domain-specific applications. We developed a RAG pipeline with ~70,000 ophthalmology-specific documents that retrieve relevant documents to augment LLMs during inference time. In a case study on long-form consumer health questions, we systematically evaluated the responses - including over 500 references - of LLMs with and without RAG on 100 questions with 10 healthcare professionals. The evaluation focuses on factuality of evidence, selection and ranking of evidence, attribution of evidence, and answer accuracy and completeness. LLMs without RAG provided 252 references in total. Of which, 45.3% hallucinated, 34.1% consisted of minor errors, and 20.6% were correct. In contrast, LLMs with RAG significantly improved accuracy (54.5% being correct) and reduced error rates (18.8% with minor hallucinations and 26.7% with errors). 62.5% of the top 10 documents retrieved by RAG were selected as the top references in the LLM response, with an average ranking of 4.9. The use of RAG also improved evidence attribution (increasing from 1.85 to 2.49 on a 5-point scale, P<0.001), albeit with slight decreases in accuracy (from 3.52 to 3.23, P=0.03) and completeness (from 3.47 to 3.27, P=0.17). The results demonstrate that LLMs frequently exhibited hallucinated and erroneous evidence in the responses, raising concerns for downstream applications in the medical domain. RAG substantially reduced the proportion of such evidence but encountered challenges. 
In contrast to existing studies, the results highlight that (1) LLMs may not select top-ranked documents by RAG, which results in hallucinated evidence remaining, (2) LLMs may miss top-ranked documents by RAG, and (3) irrelevant documents by RAG downgrade response accuracy and completeness, especially in challenging tasks such as long-form question answering. In conclusion, in long-form medical question answering, the RAG approach demonstrated improved effectiveness over non-RAG approach. Nevertheless, there are still challenges in evidence retrieval, selection, and attribution, highlighting the need for further development in domain-specific LLM and RAG techniques.",
    "source_database": "pubmed"
  },
  {
    "title": "Large language models in clinical nutrition: an overview of its applications, capabilities, limitations, and potential future prospects.",
    "authors": [
      "Belkhouribchia J",
      "Pen JJ"
    ],
    "year": "2025",
    "journal": "Frontiers in nutrition",
    "doi": "10.3389/fnut.2025.1635682",
    "pmid": "40851903",
    "abstract": "The integration of large language models (LLMs) into clinical nutrition marks a transformative advancement, offering promising solutions for enhancing patient care, personalizing dietary recommendations, and supporting evidence-based clinical decision-making. Trained on extensive text corpora and powered by transformer-based architectures, LLMs demonstrate remarkable capabilities in natural language understanding and generation. This review provides an overview of their current and potential applications in clinical nutrition, focusing on key technologies including prompt engineering, fine-tuning, retrieval-augmented generation, and multimodal integration. These enhancements increase domain relevance, factual accuracy, and contextual responsiveness, enabling LLMs to deliver more reliable outputs in nutrition-related tasks. Recent studies have shown LLMs' utility in dietary planning, nutritional education, obesity management, and malnutrition risk assessment. Despite these advances, challenges remain. Limitations in reasoning, factual accuracy, and domain specificity, along with risks of bias and hallucination, underscore the need for rigorous validation and human oversight. Furthermore, ethical considerations, environmental costs, and infrastructural integration must be addressed before widespread adoption. Future directions include combining LLMs with predictive analytics, integrating them with electronic health records and wearables, and adapting them for multilingual, culturally sensitive dietary guidance. LLMs also hold potential as research and educational tools, assisting in literature synthesis and patient engagement. Their transformative promise depends on cross-disciplinary collaboration, responsible deployment, and clinician training. 
Ultimately, while LLMs are not a replacement for healthcare professionals, they offer powerful augmentation tools for delivering scalable, personalized, and data-driven nutritional care in an increasingly complex healthcare environment.",
    "source_database": "pubmed"
  },
  {
    "title": "Large language models for clinical decision support in gastroenterology and hepatology.",
    "authors": [
      "Wiest IC",
      "Bhat M",
      "Clusmann J",
      "Schneider CV",
      "Jiang X",
      "Kather JN"
    ],
    "year": "2025",
    "journal": "Nature reviews. Gastroenterology & hepatology",
    "doi": "10.1038/s41575-025-01108-1",
    "pmid": "40846793",
    "abstract": "Clinical decision making in gastroenterology and hepatology has become increasingly complex and challenging for physicians. This growing complexity can be addressed by computational tools that support clinical decisions. Although numerous clinical decision support systems (CDSS) have emerged, they have faced difficulties with real-world performance and generalizability, resulting in limited clinical adoption. Generative artificial intelligence (AI), particularly large language models (LLMs), are introducing new possibilities for CDSS by offering more flexible and adaptable support that better reflects complex clinical scenarios. LLMs can process unstructured text, including patient data and medical guidelines, and integrate various information sources with high accuracy, especially when augmented with retrieval-augmented generation. Thus, LLMs can provide dynamic, context-specific support by generating personalized treatment recommendations, identifying potential complications based on patient history, and enabling natural language interactions with health-care providers. However, important challenges persist, particularly regarding biases, hallucinations, interoperability barriers, and proper training of health-care providers. We examine the parallel evolution of the complexity in clinical management in gastroenterology and hepatology, and the technical developments leading to current generative AI models. We discuss how these advances are converging to create effective CDSS, providing a conceptual basis for further development and clinical adoption of these systems.",
    "source_database": "pubmed"
  },
  {
    "title": "Context-Aware Retrieval-Augmented Generation for Artificial Intelligence in Urology.",
    "authors": [
      "Sriram A",
      "N M",
      "Sundan B",
      "Krishnamoorthy S"
    ],
    "year": "2025",
    "journal": "Cureus",
    "doi": "10.7759/cureus.88167",
    "pmid": "40821282",
    "abstract": "Background Artificial intelligence (AI) is increasingly being used in healthcare, particularly for interpreting complex medical queries. However, conventional AI models often generate inaccurate or irrelevant responses that are commonly termed hallucinations, which may compromise patient safety. To address this, our study introduces a modified retrieval-augmented generation (RAG) framework tailored for the urology domain to enhance contextual relevance and accuracy in AI-generated responses. Methodology We developed a context-aware RAG system integrating PubMedBERT embeddings for encoding and retrieving urological literature stored in a Pinecone vector database. The system uses named entity recognition for domain-specific query filtering and incorporates dynamic memory to retain contextual flow during interactions. Response generation is powered by the LLaMA3-8B model via LangChain. A custom dataset of urology-related queries was used for evaluation, with a large language model-based scoring using the Deepseek-R1 model. Results The proposed framework demonstrated a significant reduction in hallucinations, with responses being more contextually relevant and evidence-based. Compared to baseline models, our system achieved an 89% performance improvement in generating medically appropriate answers. Integration of memory modules and named entity filtering further improved precision and reliability. Conclusions Our RAG-enhanced system shows strong potential for clinical use by producing trustworthy, context-aware responses in urology. It addresses key challenges in medical AI, including hallucination mitigation and domain relevance. Future work will focus on reducing inference latency and improving automated validation without manual oversight.",
    "source_database": "pubmed"
  },
  {
    "title": "Retrieval-augmented generation elevates local LLM quality in radiology contrast media consultation.",
    "authors": [
      "Wada A",
      "Tanaka Y",
      "Nishizawa M",
      "Yamamoto A",
      "Akashi T",
      "Hagiwara A",
      "Hayakawa Y",
      "Kikuta J",
      "Shimoji K",
      "Sano K",
      "Kamagata K",
      "Nakanishi A",
      "Aoki S"
    ],
    "year": "2025",
    "journal": "NPJ digital medicine",
    "doi": "10.1038/s41746-025-01802-z",
    "pmid": "40604147",
    "abstract": "Large language models (LLMs) demonstrate significant potential in healthcare applications, but clinical deployment is limited by privacy concerns and insufficient medical domain training. This study investigated whether retrieval-augmented generation (RAG) can improve locally deployable LLM for radiology contrast media consultation. In 100 synthetic iodinated contrast media consultations we compared Llama 3.2-11B (baseline and RAG) with three cloud-based models-GPT-4o mini, Gemini 2.0 Flash and Claude 3.5 Haiku. A blinded radiologist ranked the five replies per case, and three LLM-based judges scored accuracy, safety, structure, tone, applicability and latency. Under controlled conditions, RAG eliminated hallucinations (0% vs 8%; χ²₍Yates₎ = 6.38, p = 0.012) and improved mean rank by 1.3 (Z = -4.82, p < 0.001), though performance gaps with cloud models persist. The RAG-enhanced model remained faster (2.6 s vs 4.9-7.3 s) while the LLM-based judges preferred it over GPT-4o mini, though the radiologist ranked GPT-4o mini higher. RAG thus provides meaningful improvements for local clinical LLMs while maintaining the privacy benefits of on-premise deployment.",
    "source_database": "pubmed"
  },
  {
    "title": "Retrieval augmented generation for large language models in healthcare: A systematic review.",
    "authors": [
      "Amugongo LM",
      "Mascheroni P",
      "Brooks S",
      "Doering S",
      "Seidel J"
    ],
    "year": "2025",
    "journal": "PLOS digital health",
    "doi": "10.1371/journal.pdig.0000877",
    "pmid": "40498738",
    "abstract": "Large Language Models (LLMs) have demonstrated promising capabilities to solve complex tasks in critical sectors such as healthcare. However, LLMs are limited by their training data which is often outdated, the tendency to generate inaccurate (\"hallucinated\") content and a lack of transparency in the content they generate. To address these limitations, retrieval augmented generation (RAG) grounds the responses of LLMs by exposing them to external knowledge sources. However, in the healthcare domain there is currently a lack of systematic understanding of which datasets, RAG methodologies and evaluation frameworks are available. This review aims to bridge this gap by assessing RAG-based approaches employed by LLMs in healthcare, focusing on the different steps of retrieval, augmentation and generation. Additionally, we identify the limitations, strengths and gaps in the existing literature. Our synthesis shows that 78.9% of studies used English datasets and 21.1% of the datasets are in Chinese. We find that a range of techniques are employed RAG-based LLMs in healthcare, including Naive RAG, Advanced RAG, and Modular RAG. Surprisingly, proprietary models such as GPT-3.5/4 are the most used for RAG applications in healthcare. We find that there is a lack of standardised evaluation frameworks for RAG-based applications. In addition, the majority of the studies do not assess or address ethical considerations related to RAG in healthcare. It is important to account for ethical challenges that are inherent when AI systems are implemented in the clinical setting. Lastly, we highlight the need for further research and development to ensure responsible and effective adoption of RAG in the medical domain.",
    "source_database": "pubmed"
  },
  {
    "title": "Enhancing medical AI with retrieval-augmented generation: A mini narrative review.",
    "authors": [
      "Gargari OK",
      "Habibi G"
    ],
    "year": "2025",
    "journal": "Digital health",
    "doi": "10.1177/20552076251337177",
    "pmid": "40343063",
    "abstract": "Retrieval-augmented generation (RAG) is a powerful technique in artificial intelligence (AI) and machine learning that enhances the capabilities of large language models (LLMs) by integrating external data sources, allowing for more accurate, contextually relevant responses. In medical applications, RAG has the potential to improve diagnostic accuracy, clinical decision support, and patient care. This narrative review explores the application of RAG across various medical domains, including guideline interpretation, diagnostic assistance, clinical trial eligibility screening, clinical information retrieval, and information extraction from scientific literature. Studies highlight the benefits of RAG in providing accurate, up-to-date information, improving clinical outcomes, and streamlining processes. Notable applications include GPT-4 models enhanced with RAG to interpret hepatologic guidelines, assist in differential diagnosis, and aid in clinical trial screening. Furthermore, RAG-based systems have demonstrated superior performance over traditional methods in tasks such as patient diagnosis, clinical decision-making, and medical information extraction. Despite its advantages, challenges remain, particularly in model evaluation, cost-efficiency, and reducing AI hallucinations. This review emphasizes the potential of RAG in advancing medical AI applications and advocates for further optimization of retrieval mechanisms, embedding models, and collaboration between AI researchers and healthcare professionals to maximize RAG's impact on medical practice.",
    "source_database": "pubmed"
  },
  {
    "title": "Leveraging long context in retrieval augmented language models for medical question answering.",
    "authors": [
      "Zhang G",
      "Xu Z",
      "Jin Q",
      "Chen F",
      "Fang Y",
      "Liu Y",
      "Rousseau JF",
      "Xu Z",
      "Lu Z",
      "Weng C",
      "Peng Y"
    ],
    "year": "2025",
    "journal": "NPJ digital medicine",
    "doi": "10.1038/s41746-025-01651-w",
    "pmid": "40316710",
    "abstract": "While holding great promise for improving and facilitating healthcare through applications of medical literature summarization, large language models (LLMs) struggle to produce up-to-date responses on evolving topics due to outdated knowledge or hallucination. Retrieval-augmented generation (RAG) is a pivotal innovation that improves the accuracy and relevance of LLM responses by integrating LLMs with a search engine and external sources of knowledge. However, the quality of RAG responses can be largely impacted by the rank and density of key information in the retrieval results, such as the \"lost-in-the-middle\" problem. In this work, we aim to improve the robustness and reliability of the RAG workflow in the medical domain. Specifically, we propose a map-reduce strategy, BriefContext, to combat the \"lost-in-the-middle\" issue without modifying the model weights. We demonstrated the advantage of the workflow with various LLM backbones and on multiple QA datasets. This method promises to improve the safety and reliability of LLMs deployed in healthcare domains by reducing the risk of misinformation, ensuring critical clinical content is retained in generated responses, and enabling more trustworthy use of LLMs in critical tasks such as medical question answering, clinical decision support, and patient-facing applications.",
    "source_database": "pubmed"
  },
  {
    "title": "A Current Review of Generative AI in Medicine: Core Concepts, Applications, and Current Limitations.",
    "authors": [
      "Rouzrokh P",
      "Khosravi B",
      "Faghani S",
      "Moassefi M",
      "Shariatnia MM",
      "Rouzrokh P",
      "Erickson B"
    ],
    "year": "2025",
    "journal": "Current reviews in musculoskeletal medicine",
    "doi": "10.1007/s12178-025-09961-y",
    "pmid": "40304941",
    "abstract": "This review aims to offer a foundational overview of Generative Artificial Intelligence (AI) for healthcare professionals without an engineering background. It seeks to aid their understanding of Generative AI's current capabilities, applications, and limitations within the medical field.",
    "source_database": "pubmed"
  },
  {
    "title": "RAGing ahead in rheumatology: new language model architectures to tame artificial intelligence.",
    "authors": [
      "Benavent D",
      "Venerito V",
      "Michelena X"
    ],
    "year": "2025",
    "journal": "Therapeutic advances in musculoskeletal disease",
    "doi": "10.1177/1759720X251331529",
    "pmid": "40292012",
    "abstract": "Artificial intelligence (AI) is increasingly transforming rheumatology with research on disease detection, monitoring, and outcome prediction through the analysis of large datasets. The advent of generative models and large language models (LLMs) has expanded AI's capabilities, particularly in natural language processing (NLP) tasks such as question-answering and medical literature synthesis. While NLP has shown promise in identifying rheumatic diseases from electronic health records with high accuracy, LLMs face significant challenges, including hallucinations and a lack of domain-specific knowledge, which limit their reliability in specialized medical fields like rheumatology. Retrieval-augmented generation (RAG) emerges as a solution to these limitations by integrating LLMs with real-time access to external, domain-specific databases. RAG enhances the accuracy and relevance of AI-generated responses by retrieving pertinent information during the generation process, reducing hallucinations, and improving the trustworthiness of AI applications. This architecture allows for precise, context-aware outputs and can handle unstructured data effectively. Despite its success in other industries, the application of RAG in medicine, and specifically in rheumatology, remains underexplored. Potential applications in rheumatology include retrieving up-to-date clinical guidelines, summarizing complex patient histories from unstructured data, aiding in patient identification for clinical trials, enhancing pharmacovigilance efforts, and supporting personalized patient education. RAG also offers advantages in data privacy by enabling local data handling and reducing reliance on large, general-purpose models. Future directions involve integrating RAG with fine-tuned, smaller LLMs and exploring multimodal models that can process diverse data types. 
Challenges such as infrastructure costs, data privacy concerns, and the need for specialized evaluation metrics must be addressed. Nevertheless, RAG presents a promising opportunity to improve AI applications in rheumatology, offering a more precise, accountable, and sustainable approach to integrating advanced language models into clinical practice and research.",
    "source_database": "pubmed"
  },
  {
    "title": "Thyro-GenAI: A Chatbot Using Retrieval-Augmented Generative Models for Personalized Thyroid Disease Management.",
    "authors": [
      "Shin M",
      "Song J",
      "Kim MG",
      "Yu HW",
      "Choe EK",
      "Chai YJ"
    ],
    "year": "2025",
    "journal": "Journal of clinical medicine",
    "doi": "10.3390/jcm14072450",
    "pmid": "40217905",
    "abstract": null,
    "source_database": "pubmed"
  },
  {
    "title": "Evidence-based artificial intelligence: Implementing retrieval-augmented generation models to enhance clinical decision support in plastic surgery.",
    "authors": [
      "Ozmen BB",
      "Mathur P"
    ],
    "year": "2025",
    "journal": "Journal of plastic, reconstructive & aesthetic surgery : JPRAS",
    "doi": "10.1016/j.bjps.2025.03.053",
    "pmid": "40174259",
    "abstract": "The rapid advancement of large language models (LLMs) has generated significant enthusiasm within healthcare, especially in supporting clinical decision-making and patient management. However, inherent limitations including hallucinations, outdated clinical context, and unreliable references pose serious concerns for their clinical utility. Retrieval-Augmented Generation (RAG) models address these limitations by integrating validated, curated medical literature directly into AI workflows, significantly enhancing the accuracy, relevance, and transparency of generated outputs. This viewpoint discusses how RAG frameworks can specifically benefit plastic and reconstructive surgery by providing contextually accurate, evidence-based, and clinically grounded support for decision-making. Potential clinical applications include clinical decision support, efficient evidence synthesis, customizable patient education, informed consent materials, multilingual capabilities, and structured surgical documentation. By querying specialized databases that incorporate contemporary guidelines and literature, RAG models can markedly reduce inaccuracies and increase the reliability of AI-generated responses. However, the implementation of RAG technology demands rigorous database curation, regular updating with guidelines from surgical societies, and ongoing validation to maintain clinical relevance. Addressing challenges related to data privacy, governance, ethical considerations, and user training remains critical for successful clinical adoption. In conclusion, RAG models represent a significant advancement in overcoming traditional LLM limitations, promoting transparency and clinical accuracy with great potential for plastic surgery. Plastic surgeons and researchers are encouraged to explore and integrate these innovative generative AI frameworks to enhance patient care, surgical outcomes, communication, documentation quality, and education.",
    "source_database": "pubmed"
  },
  {
    "title": "Utilizing large language models for gastroenterology research: a conceptual framework.",
    "authors": [
      "Berry P",
      "Dhanakshirur RR",
      "Khanna S"
    ],
    "year": "2025",
    "journal": "Therapeutic advances in gastroenterology",
    "doi": "10.1177/17562848251328577",
    "pmid": "40171241",
    "abstract": "Large language models (LLMs) transform healthcare by assisting clinicians with decision-making, research, and patient management. In gastroenterology, LLMs have shown potential in clinical decision support, data extraction, and patient education. However, challenges such as bias, hallucinations, integration with clinical workflows, and regulatory compliance must be addressed for safe and effective implementation. This manuscript presents a structured framework for integrating LLMs into gastroenterology, using Hepatitis C treatment as a real-world application. The framework outlines key steps to ensure accuracy, safety, and clinical relevance while mitigating risks associated with artificial intelligence (AI)-driven healthcare tools. The framework includes defining clinical goals, assembling a multidisciplinary team, data collection and preparation, model selection, fine-tuning, calibration, hallucination mitigation, user interface development, integration with electronic health records, real-world validation, and continuous improvement. Retrieval-augmented generation and fine-tuning approaches are evaluated for optimizing model adaptability. Bias detection, reinforcement learning from human feedback, and structured prompt engineering are incorporated to enhance reliability. Ethical and regulatory considerations, including the Health Insurance Portability and Accountability Act, General Data Protection Regulation, and AI-specific guidelines (DECIDE-AI, SPIRIT-AI, CONSORT-AI), are addressed to ensure responsible AI deployment. LLMs have the potential to enhance decision-making, research efficiency, and patient care in gastroenterology, but responsible deployment requires bias mitigation, transparency, and ongoing validation. Future research should focus on multi-institutional validation and AI-assisted clinical trials to establish LLMs as reliable tools in gastroenterology.",
    "source_database": "pubmed"
  },
  {
    "title": "A Review of Large Language Models in Medical Education, Clinical Decision Support, and Healthcare Administration.",
    "authors": [
      "Vrdoljak J",
      "Boban Z",
      "Vilović M",
      "Kumrić M",
      "Božić J"
    ],
    "year": "2025",
    "journal": "Healthcare (Basel, Switzerland)",
    "doi": "10.3390/healthcare13060603",
    "pmid": "40150453",
    "abstract": null,
    "source_database": "pubmed"
  },
  {
    "title": "Empowering large language models for automated clinical assessment with generation-augmented retrieval and hierarchical chain-of-thought.",
    "authors": [
      "Gu Z",
      "Jia W",
      "Piccardi M",
      "Yu P"
    ],
    "year": "2025",
    "journal": "Artificial intelligence in medicine",
    "doi": "10.1016/j.artmed.2025.103078",
    "pmid": "39978047",
    "abstract": "Understanding and extracting valuable information from electronic health records (EHRs) is important for improving healthcare delivery and health outcomes. Large language models (LLMs) have demonstrated significant proficiency in natural language understanding and processing, offering promises for automating the typically labor-intensive and time-consuming analytical tasks with EHRs. Despite the active application of LLMs in the healthcare setting, many foundation models lack real-world healthcare relevance. Applying LLMs to EHRs is still in its early stage. To advance this field, in this study, we pioneer a generation-augmented prompting paradigm \"GAPrompt\" to empower generic LLMs for automated clinical assessment, in particular, quantitative stroke severity assessment, using data extracted from EHRs.",
    "source_database": "pubmed"
  },
  {
    "title": "Applying generative AI with retrieval augmented generation to summarize and extract key clinical information from electronic health records.",
    "authors": [
      "Alkhalaf M",
      "Yu P",
      "Yin M",
      "Deng C"
    ],
    "year": "2024",
    "journal": "Journal of biomedical informatics",
    "doi": "10.1016/j.jbi.2024.104662",
    "pmid": "38880236",
    "abstract": "Malnutrition is a prevalent issue in aged care facilities (RACFs), leading to adverse health outcomes. The ability to efficiently extract key clinical information from a large volume of data in electronic health records (EHR) can improve understanding about the extent of the problem and developing effective interventions. This research aimed to test the efficacy of zero-shot prompt engineering applied to generative artificial intelligence (AI) models on their own and in combination with retrieval augmented generation (RAG), for the automating tasks of summarizing both structured and unstructured data in EHR and extracting important malnutrition information.",
    "source_database": "pubmed"
  },
  {
    "title": "Integrating Retrieval-Augmented Generation with Large Language Models in Nephrology: Advancing Practical Applications.",
    "authors": [
      "Miao J",
      "Thongprayoon C",
      "Suppadungsuk S",
      "Garcia Valencia OA",
      "Cheungpasitporn W"
    ],
    "year": "2024",
    "journal": "Medicina (Kaunas, Lithuania)",
    "doi": "10.3390/medicina60030445",
    "pmid": "38541171",
    "abstract": "The integration of large language models (LLMs) into healthcare, particularly in nephrology, represents a significant advancement in applying advanced technology to patient care, medical research, and education. These advanced models have progressed from simple text processors to tools capable of deep language understanding, offering innovative ways to handle health-related data, thus improving medical practice efficiency and effectiveness. A significant challenge in medical applications of LLMs is their imperfect accuracy and/or tendency to produce hallucinations-outputs that are factually incorrect or irrelevant. This issue is particularly critical in healthcare, where precision is essential, as inaccuracies can undermine the reliability of these models in crucial decision-making processes. To overcome these challenges, various strategies have been developed. One such strategy is prompt engineering, like the chain-of-thought approach, which directs LLMs towards more accurate responses by breaking down the problem into intermediate steps or reasoning sequences. Another one is the retrieval-augmented generation (RAG) strategy, which helps address hallucinations by integrating external data, enhancing output accuracy and relevance. Hence, RAG is favored for tasks requiring up-to-date, comprehensive information, such as in clinical decision making or educational applications. In this article, we showcase the creation of a specialized ChatGPT model integrated with a RAG system, tailored to align with the KDIGO 2023 guidelines for chronic kidney disease. This example demonstrates its potential in providing specialized, accurate medical advice, marking a step towards more reliable and efficient nephrology practices.",
    "source_database": "pubmed"
  },
  {
    "title": "AR-RAG: Autoregressive Retrieval Augmentation for Image Generation",
    "authors": [
      "Jingyuan Qi",
      "Zhiyang Xu",
      "Qifan Wang",
      "Lifu Huang"
    ],
    "year": "2025",
    "journal": "arXiv:2506.06962v3",
    "doi": "",
    "abstract": "We introduce Autoregressive Retrieval Augmentation (AR-RAG), a novel paradigm that enhances image generation by autoregressively incorporating knearest neighbor retrievals at the patch level. Unlike prior methods that perform a single, static retrieval before generation and condition the entire generation on fixed reference images, AR-RAG performs context-aware retrievals at each generation step, using prior-generated patches as queries to retrieve and incorporate the most relevant patch-level visual references, enabling the model to respond to evolving generation needs while avoiding limitations (e.g., over-copying, stylistic bias, etc.) prevalent in existing methods. To realize AR-RAG, we propose two parallel frameworks: (1) Distribution-Augmentation in Decoding (DAiD), a training-free plug-and-use decoding strategy that directly merges the distribution of model-predicted patches with the distribution of retrieved patches, and (2) Feature-Augmentation in Decoding (FAiD), a parameter-efficient fine-tuning method that progressively smooths the features of retrieved patches via multi-scale convolution operations and leverages them to augment the image generation process. We validate the effectiveness of AR-RAG on widely adopted benchmarks, including Midjourney-30K, GenEval and DPG-Bench, demonstrating significant performance gains over state-of-the-art image generation models.",
    "source_database": "arxiv",
    "arxiv_id": "2506.06962v3"
  },
  {
    "title": "Intelligent Interaction Strategies for Context-Aware Cognitive Augmentation",
    "authors": [
      " Xiangrong",
      " Zhu",
      "Yuan Xu",
      "Tianjian Liu",
      "Jingwei Sun",
      "Yu Zhang",
      "Xin Tong"
    ],
    "year": "2025",
    "journal": "arXiv:2504.13684v1",
    "doi": "",
    "abstract": "Human cognition is constrained by processing limitations, leading to cognitive overload and inefficiencies in knowledge synthesis and decision-making. Large Language Models (LLMs) present an opportunity for cognitive augmentation, but their current reactive nature limits their real-world applicability. This position paper explores the potential of context-aware cognitive augmentation, where LLMs dynamically adapt to users' cognitive states and task environments to provide appropriate support. Through a think-aloud study in an exhibition setting, we examine how individuals interact with multi-modal information and identify key cognitive challenges in structuring, retrieving, and applying knowledge. Our findings highlight the need for AI-driven cognitive support systems that integrate real-time contextual awareness, personalized reasoning assistance, and socially adaptive interactions. We propose a framework for AI augmentation that seamlessly transitions between real-time cognitive support and post-experience knowledge organization, contributing to the design of more effective human-centered AI systems.",
    "source_database": "arxiv",
    "arxiv_id": "2504.13684v1"
  },
  {
    "title": "Factually: Exploring Wearable Fact-Checking for Augmented Truth Discernment",
    "authors": [
      "Chitralekha Gupta",
      "Hanjun Wu",
      "Praveen Sasikumar",
      "Shreyas Sridhar",
      "Priambudi Bagaskara",
      "Suranga Nanayakkara"
    ],
    "year": "2025",
    "journal": "arXiv:2504.17204v1",
    "doi": "",
    "abstract": "Wearable devices are transforming human capabilities by seamlessly augmenting cognitive functions. In this position paper, we propose a voice-based, interactive learning companion designed to amplify and extend cognitive abilities through informal learning. Our vision is threefold: (1) to enable users to discover new knowledge on-the-go through contextual interactive quizzes, fostering critical thinking and mindfulness, (2) to proactively detect misinformation, empowering users to critically assess information in real time, and (3) to provide spoken language correction and prompting hints for second language learning and effective communication. As an initial step toward this vision, we present Factually - a proactive, wearable fact-checking system integrated into devices like smartwatches or rings. Factually discreetly alerts users to potential falsehoods via vibrotactile feedback, helping them assess information critically. We demonstrate its utility through three illustrative scenarios, highlighting its potential to extend cognitive abilities for real-time misinformation detection. Early qualitative feedback suggests that Factually can enhance users' fact-checking capabilities, offering both practical and experiential benefits.",
    "source_database": "arxiv",
    "arxiv_id": "2504.17204v1"
  },
  {
    "title": "Designing AI Systems that Augment Human Performed vs. Demonstrated Critical Thinking",
    "authors": [
      "Katelyn Xiaoying Mei",
      "Nic Weber"
    ],
    "year": "2025",
    "journal": "arXiv:2504.14689v1",
    "doi": "",
    "abstract": "The recent rapid advancement of LLM-based AI systems has accelerated our search and production of information. While the advantages brought by these systems seemingly improve the performance or efficiency of human activities, they do not necessarily enhance human capabilities. Recent research has started to examine the impact of generative AI on individuals' cognitive abilities, especially critical thinking. Based on definitions of critical thinking across psychology and education, this position paper proposes the distinction between demonstrated and performed critical thinking in the era of generative AI and discusses the implication of this distinction in research and development of AI systems that aim to augment human critical thinking.",
    "source_database": "arxiv",
    "arxiv_id": "2504.14689v1"
  },
  {
    "title": "Automated Literature Review Using NLP Techniques and LLM-Based Retrieval-Augmented Generation",
    "authors": [
      "Nurshat Fateh Ali",
      "Md. Mahdi Mohtasim",
      "Shakil Mosharrof",
      "T. Gopi Krishna"
    ],
    "year": "2024",
    "journal": "arXiv:2411.18583v1",
    "doi": "",
    "abstract": "This research presents and compares multiple approaches to automate the generation of literature reviews using several Natural Language Processing (NLP) techniques and retrieval-augmented generation (RAG) with a Large Language Model (LLM). The ever-increasing number of research articles provides a huge challenge for manual literature review. It has resulted in an increased demand for automation. Developing a system capable of automatically generating the literature reviews from only the PDF files as input is the primary objective of this research work. The effectiveness of several Natural Language Processing (NLP) strategies, such as the frequency-based method (spaCy), the transformer model (Simple T5), and retrieval-augmented generation (RAG) with Large Language Model (GPT-3.5-turbo), is evaluated to meet the primary objective. The SciTLDR dataset is chosen for this research experiment and three distinct techniques are utilized to implement three different systems for auto-generating the literature reviews. The ROUGE scores are used for the evaluation of all three systems. Based on the evaluation, the Large Language Model GPT-3.5-turbo achieved the highest ROUGE-1 score, 0.364. The transformer model comes in second place and spaCy is at the last position. Finally, a graphical user interface is created for the best system based on the large language model.",
    "source_database": "arxiv",
    "arxiv_id": "2411.18583v1"
  },
  {
    "title": "EVOR: Evolving Retrieval for Code Generation",
    "authors": [
      "Hongjin Su",
      "Shuyang Jiang",
      "Yuhang Lai",
      "Haoyuan Wu",
      "Boao Shi",
      "Che Liu",
      "Qian Liu",
      "Tao Yu"
    ],
    "year": "2024",
    "journal": "arXiv:2402.12317v2",
    "doi": "",
    "abstract": "Recently the retrieval-augmented generation (RAG) has been successfully applied in code generation. However, existing pipelines for retrieval-augmented code generation (RACG) employ static knowledge bases with a single source, limiting the adaptation capabilities of Large Language Models (LLMs) to domains they have insufficient knowledge of. In this work, we develop a novel pipeline, EVOR, that employs the synchronous evolution of both queries and diverse knowledge bases. On two realistic settings where the external knowledge is required to solve code generation tasks, we compile four new datasets associated with frequently updated libraries and long-tail programming languages, named EVOR-BENCH. Extensive experiments demonstrate that EVOR achieves two to four times of execution accuracy compared to other methods such as Reflexion (Shinn et al., 2024), DocPrompting (Zhou et al., 2023), etc. We demonstrate that EVOR is flexible and can be easily combined with them to achieve further improvement. Further analysis reveals that EVOR benefits from the synchronous evolution of queries and documents and the diverse information sources in the knowledge base. We hope that our studies will inspire more insights into the design of advanced RACG pipelines in future research. Our model, code, and data are available at https://arks-codegen.github.io.",
    "source_database": "arxiv",
    "arxiv_id": "2402.12317v2"
  },
  {
    "title": "Riddle Me This! Stealthy Membership Inference for Retrieval-Augmented Generation",
    "authors": [
      "Ali Naseh",
      "Yuefeng Peng",
      "Anshuman Suri",
      "Harsh Chaudhari",
      "Alina Oprea",
      "Amir Houmansadr"
    ],
    "year": "2025",
    "journal": "arXiv:2502.00306v2",
    "doi": "",
    "abstract": "Retrieval-Augmented Generation (RAG) enables Large Language Models (LLMs) to generate grounded responses by leveraging external knowledge databases without altering model parameters. Although the absence of weight tuning prevents leakage via model parameters, it introduces the risk of inference adversaries exploiting retrieved documents in the model's context. Existing methods for membership inference and data extraction often rely on jailbreaking or carefully crafted unnatural queries, which can be easily detected or thwarted with query rewriting techniques common in RAG systems. In this work, we present Interrogation Attack (IA), a membership inference technique targeting documents in the RAG datastore. By crafting natural-text queries that are answerable only with the target document's presence, our approach demonstrates successful inference with just 30 queries while remaining stealthy; straightforward detectors identify adversarial prompts from existing methods up to ~76x more frequently than those generated by our attack. We observe a 2x improvement in TPR@1%FPR over prior inference attacks across diverse RAG configurations, all while costing less than $0.02 per document inference.",
    "source_database": "arxiv",
    "arxiv_id": "2502.00306v2"
  },
  {
    "title": "Ragas: Automated Evaluation of Retrieval Augmented Generation",
    "authors": [
      "Shahul Es",
      "Jithin James",
      "Luis Espinosa-Anke",
      "Steven Schockaert"
    ],
    "year": "2023",
    "journal": "arXiv:2309.15217v2",
    "doi": "",
    "abstract": "We introduce Ragas (Retrieval Augmented Generation Assessment), a framework for reference-free evaluation of Retrieval Augmented Generation (RAG) pipelines. RAG systems are composed of a retrieval and an LLM based generation module, and provide LLMs with knowledge from a reference textual database, which enables them to act as a natural language layer between a user and textual databases, reducing the risk of hallucinations. Evaluating RAG architectures is, however, challenging because there are several dimensions to consider: the ability of the retrieval system to identify relevant and focused context passages, the ability of the LLM to exploit such passages in a faithful way, or the quality of the generation itself. With Ragas, we put forward a suite of metrics which can be used to evaluate these different dimensions \\textit{without having to rely on ground truth human annotations}. We posit that such a framework can crucially contribute to faster evaluation cycles of RAG architectures, which is especially important given the fast adoption of LLMs.",
    "source_database": "arxiv",
    "arxiv_id": "2309.15217v2"
  },
  {
    "title": "FAIR-RAG: Faithful Adaptive Iterative Refinement for Retrieval-Augmented Generation",
    "authors": [
      "Mohammad Aghajani Asl",
      "Majid Asgari-Bidhendi",
      "Behrooz Minaei-Bidgoli"
    ],
    "year": "2025",
    "journal": "arXiv:2510.22344v1",
    "doi": "",
    "abstract": "While Retrieval-Augmented Generation (RAG) mitigates hallucination and knowledge staleness in Large Language Models (LLMs), existing frameworks often falter on complex, multi-hop queries that require synthesizing information from disparate sources. Current advanced RAG methods, employing iterative or adaptive strategies, lack a robust mechanism to systematically identify and fill evidence gaps, often propagating noise or failing to gather a comprehensive context. We introduce FAIR-RAG, a novel agentic framework that transforms the standard RAG pipeline into a dynamic, evidence-driven reasoning process. At its core is an Iterative Refinement Cycle governed by a module we term Structured Evidence Assessment (SEA). The SEA acts as an analytical gating mechanism: it deconstructs the initial query into a checklist of required findings and audits the aggregated evidence to identify confirmed facts and, critically, explicit informational gaps. These gaps provide a precise signal to an Adaptive Query Refinement agent, which generates new, targeted sub-queries to retrieve missing information. This cycle repeats until the evidence is verified as sufficient, ensuring a comprehensive context for a final, strictly faithful generation. We conducted experiments on challenging multi-hop QA benchmarks, including HotpotQA, 2WikiMultiHopQA, and MusiQue. In a unified experimental setup, FAIR-RAG significantly outperforms strong baselines. On HotpotQA, it achieves an F1-score of 0.453 -- an absolute improvement of 8.3 points over the strongest iterative baseline -- establishing a new state-of-the-art for this class of methods on these benchmarks. Our work demonstrates that a structured, evidence-driven refinement process with explicit gap analysis is crucial for unlocking reliable and accurate reasoning in advanced RAG systems for complex, knowledge-intensive tasks.",
    "source_database": "arxiv",
    "arxiv_id": "2510.22344v1"
  },
  {
    "title": "MUST-RAG: MUSical Text Question Answering with Retrieval Augmented Generation",
    "authors": [
      "Daeyong Kwon",
      "SeungHeon Doh",
      "Juhan Nam"
    ],
    "year": "2025",
    "journal": "arXiv:2507.23334v2",
    "doi": "",
    "abstract": "Recent advancements in Large language models (LLMs) have demonstrated remarkable capabilities across diverse domains. While they exhibit strong zero-shot performance on various tasks, LLMs' effectiveness in music-related applications remains limited due to the relatively small proportion of music-specific knowledge in their training data. To address this limitation, we propose MusT-RAG, a comprehensive framework based on Retrieval Augmented Generation (RAG) to adapt general-purpose LLMs for text-only music question answering (MQA) tasks. RAG is a technique that provides external knowledge to LLMs by retrieving relevant context information when generating answers to questions. To optimize RAG for the music domain, we (1) propose MusWikiDB, a music-specialized vector database for the retrieval stage, and (2) utilize context information during both inference and fine-tuning processes to effectively transform general-purpose LLMs into music-specific models. Our experiment demonstrates that MusT-RAG significantly outperforms traditional fine-tuning approaches in enhancing LLMs' music domain adaptation capabilities, showing consistent improvements across both in-domain and out-of-domain MQA benchmarks. Additionally, our MusWikiDB proves substantially more effective than general Wikipedia corpora, delivering superior performance and computational efficiency.",
    "source_database": "arxiv",
    "arxiv_id": "2507.23334v2"
  },
  {
    "title": "Open-Source Retrieval Augmented Generation Framework for Retrieving Accurate Medication Insights from Formularies for African Healthcare Workers",
    "authors": [
      "Axum AI",
      "J. Owoyemi",
      "S. Abubakar",
      "A. Owoyemi",
      "T. O. Togunwa",
      "F. C. Madubuko",
      "S. Oyatoye",
      "Z. Oyetolu",
      "K. Akyea",
      "A. O. Mohammed",
      "A. Adebakin"
    ],
    "year": "2025",
    "journal": "arXiv:2502.15722v1",
    "doi": "",
    "abstract": "Accessing accurate medication insights is vital for enhancing patient safety, minimizing errors, and supporting clinical decision-making. However, healthcare professionals in Africa often rely on manual and time-consuming processes to retrieve drug information, exacerbated by limited access to pharmacists due to brain drain and healthcare disparities. This paper presents \"Drug Insights,\" an open-source Retrieval-Augmented Generation (RAG) chatbot designed to streamline medication lookup for healthcare workers in Africa. By leveraging a corpus of Nigerian pharmaceutical data and advanced AI technologies, including Pinecone databases and GPT models, the system delivers accurate, context-specific responses with minimal hallucination. The chatbot integrates prompt engineering and S-BERT evaluation to optimize retrieval and response generation. Preliminary tests, including pharmacist feedback, affirm the tool's potential to improve drug information access while highlighting areas for enhancement, such as UI/UX refinement and extended corpus integration.",
    "source_database": "arxiv",
    "arxiv_id": "2502.15722v1"
  },
  {
    "title": "Engineering the RAG Stack: A Comprehensive Review of the Architecture and Trust Frameworks for Retrieval-Augmented Generation Systems",
    "authors": [
      "Dean Wampler",
      "Dave Nielson",
      "Alireza Seddighi"
    ],
    "year": "2025",
    "journal": "arXiv:2601.05264v1",
    "doi": "",
    "abstract": "This article provides a comprehensive systematic literature review of academic studies, industrial applications, and real-world deployments from 2018 to 2025, providing a practical guide and detailed overview of modern Retrieval-Augmented Generation (RAG) architectures. RAG offers a modular approach for integrating external knowledge without increasing the capacity of the model as LLM systems expand. Research and engineering practices have been fragmented as a result of the increasing diversity of RAG methodologies, which encompasses a variety of fusion mechanisms, retrieval strategies, and orchestration approaches. We provide quantitative assessment frameworks, analyze the implications for trust and alignment, and systematically consolidate existing RAG techniques into a unified taxonomy. This document is a practical framework for the deployment of resilient, secure, and domain-adaptable RAG systems, synthesizing insights from academic literature, industry reports, and technical implementation guides. It also functions as a technical reference.",
    "source_database": "arxiv",
    "arxiv_id": "2601.05264v1"
  },
  {
    "title": "RAGPart & RAGMask: Retrieval-Stage Defenses Against Corpus Poisoning in Retrieval-Augmented Generation",
    "authors": [
      "Pankayaraj Pathmanathan",
      "Michael-Andrei Panaitescu-Liess",
      "Cho-Yu Jason Chiang",
      "Furong Huang"
    ],
    "year": "2025",
    "journal": "arXiv:2512.24268v1",
    "doi": "",
    "abstract": "Retrieval-Augmented Generation (RAG) has emerged as a promising paradigm to enhance large language models (LLMs) with external knowledge, reducing hallucinations and compensating for outdated information. However, recent studies have exposed a critical vulnerability in RAG pipelines -- corpus poisoning -- where adversaries inject malicious documents into the retrieval corpus to manipulate model outputs. In this work, we propose two complementary retrieval-stage defenses: RAGPart and RAGMask. Our defenses operate directly on the retriever, making them computationally lightweight and requiring no modification to the generation model. RAGPart leverages the inherent training dynamics of dense retrievers, exploiting document partitioning to mitigate the effect of poisoned points. In contrast, RAGMask identifies suspicious tokens based on significant similarity shifts under targeted token masking. Across two benchmarks, four poisoning strategies, and four state-of-the-art retrievers, our defenses consistently reduce attack success rates while preserving utility under benign conditions. We further introduce an interpretable attack to stress-test our defenses. Our findings highlight the potential and limitations of retrieval-stage defenses, providing practical insights for robust RAG deployments.",
    "source_database": "arxiv",
    "arxiv_id": "2512.24268v1"
  },
  {
    "title": "MultiRAG: A Knowledge-guided Framework for Mitigating Hallucination in Multi-source Retrieval Augmented Generation",
    "authors": [
      "Wenlong Wu",
      "Haofen Wang",
      "Bohan Li",
      "Peixuan Huang",
      "Xinzhe Zhao",
      "Lei Liang"
    ],
    "year": "2025",
    "journal": "arXiv:2508.03553v1",
    "doi": "https://doi.org/10.1109/ICDE65448.2025.00230",
    "abstract": "Retrieval Augmented Generation (RAG) has emerged as a promising solution to address hallucination issues in Large Language Models (LLMs). However, the integration of multiple retrieval sources, while potentially more informative, introduces new challenges that can paradoxically exacerbate hallucination problems. These challenges manifest primarily in two aspects: the sparse distribution of multi-source data that hinders the capture of logical relationships and the inherent inconsistencies among different sources that lead to information conflicts. To address these challenges, we propose MultiRAG, a novel framework designed to mitigate hallucination in multi-source retrieval-augmented generation through knowledge-guided approaches. Our framework introduces two key innovations: (1) a knowledge construction module that employs multi-source line graphs to efficiently aggregate logical relationships across different knowledge sources, effectively addressing the sparse data distribution issue; and (2) a sophisticated retrieval module that implements a multi-level confidence calculation mechanism, performing both graph-level and node-level assessments to identify and eliminate unreliable information nodes, thereby reducing hallucinations caused by inter-source inconsistencies. Extensive experiments on four multi-domain query datasets and two multi-hop QA datasets demonstrate that MultiRAG significantly enhances the reliability and efficiency of knowledge retrieval in complex multi-source scenarios. Our code is available at https://github.com/wuwenlong123/MultiRAG.",
    "source_database": "arxiv",
    "arxiv_id": "2508.03553v1"
  },
  {
    "title": "Hybrid-Code v2: Zero-Hallucination Clinical ICD-10 Coding via Neuro-Symbolic Verification and Automated Knowledge Base Expansion",
    "authors": [
      "Yunguo Yu"
    ],
    "year": "2025",
    "journal": "arXiv:2512.23743v2",
    "doi": "",
    "abstract": "Automated clinical ICD-10 coding is a high-impact healthcare task requiring a balance between coverage, precision, and safety. While neural approaches achieve strong performance, they suffer from hallucination -- generating invalid or unsupported codes -- posing unacceptable risks in safety-critical clinical settings. Rule-based systems eliminate hallucination but lack scalability and coverage due to manual knowledge base (KB) curation. We present Hybrid-Code v2, a neuro-symbolic framework that achieves zero Type-I hallucination by construction while maintaining competitive coverage and precision. The system integrates neural candidate generation with a symbolic KB verification layer that enforces validity constraints through multi-layer verification, including format, evidence grounding, negation detection, temporal consistency, and exclusion rules. In addition, we introduce an automated KB expansion mechanism that extracts and validates coding patterns from unlabeled clinical text, addressing the scalability limitations of rule-based systems. Evaluated on the MIMIC-III dataset against ClinicalBERT, BioBERT, rule-based systems, and GPT-4, Hybrid-Code v2 achieves 85% coverage, 92% precision, and 0% Type-I hallucination, outperforming rule-based systems by +40% coverage while eliminating hallucination observed in neural baselines (6-18%). The proposed architecture provides a formal safety guarantee for syntactic validity while preserving strong empirical performance. These results demonstrate that neuro-symbolic verification can enforce safety constraints in neural medical AI systems without sacrificing effectiveness, offering a generalizable design pattern for deploying trustworthy AI in safety-critical domains.",
    "source_database": "arxiv",
    "arxiv_id": "2512.23743v2"
  },
  {
    "title": "SoftTiger: A Clinical Foundation Model for Healthcare Workflows",
    "authors": [
      "Ye Chen",
      "Igor Couto",
      "Wei Cai",
      "Cong Fu",
      "Bruno Dorneles"
    ],
    "year": "2024",
    "journal": "arXiv:2403.00868v3",
    "doi": "",
    "abstract": "We introduce SoftTiger, a clinical large language model (CLaM) designed as a foundation model for healthcare workflows. The narrative and unstructured nature of clinical notes is a major obstacle for healthcare intelligentization. We address a critical problem of structuring clinical notes into clinical data, according to international interoperability standards. We collect and annotate data for three subtasks, namely, international patient summary, clinical impression and medical encounter. We then supervised fine-tuned a state-of-the-art LLM using public and credentialed clinical data. The training is orchestrated in a way that the target model can first support basic clinical tasks such as abbreviation expansion and temporal information extraction, and then learn to perform more complex downstream clinical tasks. Moreover, we address several modeling challenges in the healthcare context, e.g., extra long context window. Our blind pairwise evaluation shows that SoftTiger outperforms other popular open-source models and GPT-3.5, comparable to Gemini-pro, with a mild gap from GPT-4. We believe that LLMs may become a step-stone towards healthcare digitalization and democratization. Therefore, we publicly release SoftTiger models at scales of 13 billion and 70 billion parameters, as well as datasets and code for our innovative scalable evaluation, hopefully, making a significant contribution to the healthcare industry.",
    "source_database": "arxiv",
    "arxiv_id": "2403.00868v3"
  },
  {
    "title": "Utilizing Metadata for Better Retrieval-Augmented Generation",
    "authors": [
      "Raquib Bin Yousuf",
      "Shengzhe Xu",
      "Mandar Sharma",
      "Andrew Neeser",
      "Chris Latimer",
      "Naren Ramakrishnan"
    ],
    "year": "2026",
    "journal": "arXiv:2601.11863v1",
    "doi": "",
    "abstract": "Retrieval-Augmented Generation systems depend on retrieving semantically relevant document chunks to support accurate, grounded outputs from large language models. In structured and repetitive corpora such as regulatory filings, chunk similarity alone often fails to distinguish between documents with overlapping language. Practitioners often flatten metadata into input text as a heuristic, but the impact and trade-offs of this practice remain poorly understood. We present a systematic study of metadata-aware retrieval strategies, comparing plain-text baselines with approaches that embed metadata directly. Our evaluation spans metadata-as-text (prefix and suffix), a dual-encoder unified embedding that fuses metadata and content in a single index, dual-encoder late-fusion retrieval, and metadata-aware query reformulation. Across multiple retrieval metrics and question types, we find that prefixing and unified embeddings consistently outperform plain-text baselines, with the unified at times exceeding prefixing while being easier to maintain. Beyond empirical comparisons, we analyze embedding space, showing that metadata integration improves effectiveness by increasing intra-document cohesion, reducing inter-document confusion, and widening the separation between relevant and irrelevant chunks. Field-level ablations show that structural cues provide strong disambiguating signals. Our code, evaluation framework, and the RAGMATE-10K dataset are publicly hosted.",
    "source_database": "arxiv",
    "arxiv_id": "2601.11863v1"
  },
  {
    "title": "CARROT: A Learned Cost-Constrained Retrieval Optimization System for RAG",
    "authors": [
      "Ziting Wang",
      "Haitao Yuan",
      "Wei Dong",
      "Gao Cong",
      "Feifei Li"
    ],
    "year": "2024",
    "journal": "arXiv:2411.00744v2",
    "doi": "",
    "abstract": "Large Language Models (LLMs) have demonstrated impressive ability in generation and reasoning tasks but struggle with handling up-to-date knowledge, leading to inaccuracies or hallucinations. Retrieval-Augmented Generation (RAG) mitigates this by retrieving and incorporating external knowledge into input prompts. In particular, due to LLMs' context window limitations and long-context hallucinations, only the most relevant \"chunks\" are retrieved. However, current RAG systems face three key challenges: (1) chunks are often retrieved independently without considering their relationships, such as redundancy and ordering; (2) the utility of chunks is non-monotonic, as adding more chunks can degrade quality; and (3) retrieval strategies fail to adapt to the unique characteristics of different queries. To overcome these challenges, we design a cost-constrained retrieval optimization framework for RAG. We adopt a Monte Carlo Tree Search (MCTS) based strategy to find the optimal chunk combination order, which considers the chunks' correlations. In addition, to address the non-monotonicity of chunk utility, instead of treating budget exhaustion as the termination condition, we design a utility computation strategy to identify the optimal chunk combination without necessarily exhausting the budget. Furthermore, we propose a configuration agent that predicts optimal configurations for each query domain, improving our framework's adaptability and efficiency. Experimental results demonstrate up to a 30% improvement over baseline models, highlighting the framework's effectiveness, scalability, and suitability. Our source code has been released at https://github.com/wang0702/CARROT.",
    "source_database": "arxiv",
    "arxiv_id": "2411.00744v2"
  },
  {
    "title": "Privacy-preserving machine learning for healthcare: open challenges and future perspectives",
    "authors": [
      "Alejandro Guerra-Manzanares",
      "L. Julian Lechuga Lopez",
      "Michail Maniatakos",
      "Farah E. Shamout"
    ],
    "year": "2023",
    "journal": "arXiv:2303.15563v1",
    "doi": "https://doi.org/10.1007/978-3-031-39539-0_3",
    "abstract": "Machine Learning (ML) has recently shown tremendous success in modeling various healthcare prediction tasks, ranging from disease diagnosis and prognosis to patient treatment. Due to the sensitive nature of medical data, privacy must be considered along the entire ML pipeline, from model training to inference. In this paper, we conduct a review of recent literature concerning Privacy-Preserving Machine Learning (PPML) for healthcare. We primarily focus on privacy-preserving training and inference-as-a-service, and perform a comprehensive review of existing trends, identify challenges, and discuss opportunities for future research directions. The aim of this review is to guide the development of private and efficient ML models in healthcare, with the prospects of translating research efforts into real-world settings.",
    "source_database": "arxiv",
    "arxiv_id": "2303.15563v1"
  },
  {
    "title": "Federated Learning for Healthcare Domain - Pipeline, Applications and Challenges",
    "authors": [
      "Madhura Joshi",
      "Ankit Pal",
      "Malaikannan Sankarasubbu"
    ],
    "year": "2022",
    "journal": "arXiv:2211.07893v2",
    "doi": "https://doi.org/10.1145/3533708",
    "abstract": "Federated learning is the process of developing machine learning models over datasets distributed across data centers such as hospitals, clinical research labs, and mobile devices while preventing data leakage. This survey examines previous research and studies on federated learning in the healthcare sector across a range of use cases and applications. Our survey shows what challenges, methods, and applications a practitioner should be aware of in the topic of federated learning. This paper aims to lay out existing research and list the possibilities of federated learning for healthcare industries.",
    "source_database": "arxiv",
    "arxiv_id": "2211.07893v2"
  },
  {
    "title": "To Retrieve or Not to Retrieve? Uncertainty Detection for Dynamic Retrieval Augmented Generation",
    "authors": [
      "Kaustubh D. Dhole"
    ],
    "year": "2025",
    "journal": "arXiv:2501.09292v3",
    "doi": "",
    "abstract": "Retrieval-Augmented Generation equips large language models with the capability to retrieve external knowledge, thereby mitigating hallucinations by incorporating information beyond the model's intrinsic abilities. However, most prior works have focused on invoking retrieval deterministically, which makes it unsuitable for tasks such as long-form question answering. Instead, dynamically performing retrieval by invoking it only when the underlying LLM lacks the required knowledge can be more efficient. In this context, we delve deeper into the question, \"To Retrieve or Not to Retrieve?\" by exploring multiple uncertainty detection methods. We evaluate these methods for the task of long-form question answering, employing dynamic retrieval, and present our comparisons. Our findings suggest that uncertainty detection metrics, such as Degree Matrix Jaccard and Eccentricity, can reduce the number of retrieval calls by almost half, with only a slight reduction in question-answering accuracy.",
    "source_database": "arxiv",
    "arxiv_id": "2501.09292v3"
  },
  {
    "title": "Reducing hallucination in structured outputs via Retrieval-Augmented Generation",
    "authors": [
      "Patrice Béchard",
      "Orlando Marquez Ayala"
    ],
    "year": "2024",
    "journal": "arXiv:2404.08189v1",
    "doi": "https://doi.org/10.18653/v1/2024.naacl-industry.19",
    "abstract": "A common and fundamental limitation of Generative AI (GenAI) is its propensity to hallucinate. While large language models (LLM) have taken the world by storm, without eliminating or at least reducing hallucinations, real-world GenAI systems may face challenges in user adoption. In the process of deploying an enterprise application that produces workflows based on natural language requirements, we devised a system leveraging Retrieval Augmented Generation (RAG) to greatly improve the quality of the structured output that represents such workflows. Thanks to our implementation of RAG, our proposed system significantly reduces hallucinations in the output and improves the generalization of our LLM in out-of-domain settings. In addition, we show that using a small, well-trained retriever encoder can reduce the size of the accompanying LLM, thereby making deployments of LLM-based systems less resource-intensive.",
    "source_database": "arxiv",
    "arxiv_id": "2404.08189v1"
  },
  {
    "title": "Document Understanding for Healthcare Referrals",
    "authors": [
      "Jimit Mistry",
      "Natalia M. Arzeno"
    ],
    "year": "2023",
    "journal": "arXiv:2309.13184v1",
    "doi": "https://doi.org/10.1109/ICHI57859.2023.00067",
    "abstract": "Reliance on scanned documents and fax communication for healthcare referrals leads to high administrative costs and errors that may affect patient care. In this work we propose a hybrid model leveraging LayoutLMv3 along with domain-specific rules to identify key patient, physician, and exam-related entities in faxed referral documents. We explore some of the challenges in applying a document understanding model to referrals, which have formats varying by medical practice, and evaluate model performance using MUC-5 metrics to obtain appropriate metrics for the practical use case. Our analysis shows the addition of domain-specific rules to the transformer model yields greatly increased precision and F1 scores, suggesting a hybrid model trained on a curated dataset can increase efficiency in referral management.",
    "source_database": "arxiv",
    "arxiv_id": "2309.13184v1"
  },
  {
    "title": "IGMiRAG: Intuition-Guided Retrieval-Augmented Generation with Adaptive Mining of In-Depth Memory",
    "authors": [
      "Xingliang Hou",
      "Yuyan Liu",
      "Qi Sun",
      "Haoxiu Wang",
      "Hao Hu",
      "Shaoyi Du",
      "Zhiqiang Tian"
    ],
    "year": "2026",
    "journal": "arXiv:2602.07525v1",
    "doi": "",
    "abstract": "Retrieval-augmented generation (RAG) equips large language models (LLMs) with reliable knowledge memory. To strengthen cross-text associations, recent research integrates graphs and hypergraphs into RAG to capture pairwise and multi-entity relations as structured links. However, their misaligned memory organization necessitates costly, disjointed retrieval. To address these limitations, we propose IGMiRAG, a framework inspired by human intuition-guided reasoning. It constructs a hierarchical heterogeneous hypergraph to align multi-granular knowledge, incorporating deductive pathways to simulate realistic memory structures. During querying, IGMiRAG distills intuitive strategies via a question parser to control mining depth and memory window, and activates instantaneous memories as anchors using dual-focus retrieval. Mirroring human intuition, the framework guides retrieval resource allocation dynamically. Furthermore, we design a bidirectional diffusion algorithm that navigates deductive paths to mine in-depth memories, emulating human reasoning processes. Extensive evaluations indicate IGMiRAG outperforms the state-of-the-art baseline by 4.8% EM and 5.0% F1 overall, with token costs adapting to task complexity (average 6.3k+, minimum 3.0k+). This work presents a cost-effective RAG paradigm that improves both efficiency and effectiveness.",
    "source_database": "arxiv",
    "arxiv_id": "2602.07525v1"
  },
  {
    "title": "AlzheimerRAG: Multimodal Retrieval Augmented Generation for Clinical Use Cases using PubMed articles",
    "authors": [
      "Aritra Kumar Lahiri",
      "Qinmin Vivian Hu"
    ],
    "year": "2024",
    "journal": "arXiv:2412.16701v3",
    "doi": "https://doi.org/10.3390/make7030089",
    "abstract": "Recent advancements in generative AI have fostered the development of highly adept Large Language Models (LLMs) that integrate diverse data types to empower decision-making. Among these, multimodal retrieval-augmented generation (RAG) applications are promising because they combine the strengths of information retrieval and generative models, enhancing their utility across various domains, including clinical use cases. This paper introduces AlzheimerRAG, a Multimodal RAG application for clinical use cases, primarily focusing on Alzheimer's Disease case studies from PubMed articles. This application incorporates cross-modal attention fusion techniques to integrate textual and visual data processing by efficiently indexing and accessing vast amounts of biomedical literature. Our experimental results, compared to benchmarks such as BioASQ and PubMedQA, have yielded improved performance in the retrieval and synthesis of domain-specific information. We also present a case study using our multimodal RAG in various Alzheimer's clinical scenarios. We infer that AlzheimerRAG can generate responses with accuracy non-inferior to humans and with low rates of hallucination.",
    "source_database": "arxiv",
    "arxiv_id": "2412.16701v3"
  },
  {
    "title": "Reconstructing Context: Evaluating Advanced Chunking Strategies for Retrieval-Augmented Generation",
    "authors": [
      "Carlo Merola",
      "Jaspinder Singh"
    ],
    "year": "2025",
    "journal": "arXiv:2504.19754v1",
    "doi": "",
    "abstract": "Retrieval-augmented generation (RAG) has become a transformative approach for enhancing large language models (LLMs) by grounding their outputs in external knowledge sources. Yet, a critical question persists: how can vast volumes of external knowledge be managed effectively within the input constraints of LLMs? Traditional methods address this by chunking external documents into smaller, fixed-size segments. While this approach alleviates input limitations, it often fragments context, resulting in incomplete retrieval and diminished coherence in generation. To overcome these shortcomings, two advanced techniques, late chunking and contextual retrieval, have been introduced, both aiming to preserve global context. Despite their potential, their comparative strengths and limitations remain unclear. This study presents a rigorous analysis of late chunking and contextual retrieval, evaluating their effectiveness and efficiency in optimizing RAG systems. Our results indicate that contextual retrieval preserves semantic coherence more effectively but requires greater computational resources. In contrast, late chunking offers higher efficiency but tends to sacrifice relevance and completeness.",
    "source_database": "arxiv",
    "arxiv_id": "2504.19754v1"
  },
  {
    "title": "Lightweight Transformers for Clinical Natural Language Processing",
    "authors": [
      "Omid Rohanian",
      "Mohammadmahdi Nouriborji",
      "Hannah Jauncey",
      "Samaneh Kouchaki",
      "ISARIC Clinical Characterisation Group",
      "Lei Clifton",
      "Laura Merson",
      "David A. Clifton"
    ],
    "year": "2023",
    "journal": "arXiv:2302.04725v1",
    "doi": "https://doi.org/10.1017/S1351324923000542",
    "abstract": "Specialised pre-trained language models are becoming more frequent in NLP since they can potentially outperform models trained on generic texts. BioBERT and BioClinicalBERT are two examples of such models that have shown promise in medical NLP tasks. Many of these models are overparametrised and resource-intensive, but thanks to techniques like Knowledge Distillation (KD), it is possible to create smaller versions that perform almost as well as their larger counterparts. In this work, we specifically focus on development of compact language models for processing clinical texts (i.e. progress notes, discharge summaries etc). We developed a number of efficient lightweight clinical transformers using knowledge distillation and continual learning, with the number of parameters ranging from 15 million to 65 million. These models performed comparably to larger models such as BioBERT and ClinicalBioBERT and significantly outperformed other compact models trained on general or biomedical data. Our extensive evaluation was done across several standard datasets and covered a wide range of clinical text-mining tasks, including Natural Language Inference, Relation Extraction, Named Entity Recognition, and Sequence Classification. To our knowledge, this is the first comprehensive study specifically focused on creating efficient and compact transformers for clinical NLP tasks. The models and code used in this study can be found on our Huggingface profile at https://huggingface.co/nlpie and Github page at https://github.com/nlpie-research/Lightweight-Clinical-Transformers, respectively, promoting reproducibility of our results.",
    "source_database": "arxiv",
    "arxiv_id": "2302.04725v1"
  },
  {
    "title": "Enhancing Critical Thinking with AI: A Tailored Warning System for RAG Models",
    "authors": [
      "Xuyang Zhu",
      "Sejoon Chang",
      "Andrew Kuik"
    ],
    "year": "2025",
    "journal": "arXiv:2504.16883v1",
    "doi": "",
    "abstract": "Retrieval-Augmented Generation (RAG) systems offer a powerful approach to enhancing large language model (LLM) outputs by incorporating fact-checked, contextually relevant information. However, fairness and reliability concerns persist, as hallucinations can emerge at both the retrieval and generation stages, affecting users' reasoning and decision-making. Our research explores how tailored warning messages -- whose content depends on the specific context of hallucination -- shape user reasoning and actions in an educational quiz setting. Preliminary findings suggest that while warnings improve accuracy and awareness of high-level hallucinations, they may also introduce cognitive friction, leading to confusion and diminished trust in the system. By examining these interactions, this work contributes to the broader goal of AI-augmented reasoning: developing systems that actively support human reflection, critical thinking, and informed decision-making rather than passive information consumption.",
    "source_database": "arxiv",
    "arxiv_id": "2504.16883v1"
  },
  {
    "title": "Towards Smart Healthcare: Challenges and Opportunities in IoT and ML",
    "authors": [
      "Munshi Saifuzzaman",
      "Tajkia Nuri Ananna"
    ],
    "year": "2023",
    "journal": "arXiv:2312.05530v2",
    "doi": "https://doi.org/10.1007/978-981-97-5624-7_10",
    "abstract": "The COVID-19 pandemic and other ongoing health crises have underscored the need for prompt healthcare services worldwide. The traditional healthcare system, centered around hospitals and clinics, has proven inadequate in the face of such challenges. Intelligent wearable devices, a key part of modern healthcare, leverage Internet of Things technology to collect extensive data related to the environment as well as psychological, behavioral, and physical health. However, managing the substantial data generated by these wearables and other IoT devices in healthcare poses a significant challenge, potentially impeding decision-making processes. Recent interest has grown in applying data analytics for extracting information, gaining insights, and making predictions. Additionally, machine learning, known for addressing various big data and networking challenges, has seen increased implementation to enhance IoT systems in healthcare. This chapter focuses exclusively on exploring the hurdles encountered when integrating ML methods into the IoT healthcare sector. It offers a comprehensive summary of current research challenges and potential opportunities, categorized into three scenarios: IoT-based, ML-based, and the implementation of machine learning methodologies in the IoT-based healthcare industry. This compilation will assist future researchers, healthcare professionals, and government agencies by offering valuable insights into recent smart healthcare advancements.",
    "source_database": "arxiv",
    "arxiv_id": "2312.05530v2"
  },
  {
    "title": "Blended RAG: Improving RAG (Retriever-Augmented Generation) Accuracy with Semantic Search and Hybrid Query-Based Retrievers",
    "authors": [
      "Kunal Sawarkar",
      "Abhilasha Mangal",
      "Shivam Raj Solanki"
    ],
    "year": "2024",
    "journal": "arXiv:2404.07220v2",
    "doi": "https://doi.org/10.1109/MIPR62202.2024.00031",
    "abstract": "Retrieval-Augmented Generation (RAG) is a prevalent approach to infuse a private knowledge base of documents with Large Language Models (LLM) to build Generative Q\\&A (Question-Answering) systems. However, RAG accuracy becomes increasingly challenging as the corpus of documents scales up, with Retrievers playing an outsized role in the overall RAG accuracy by extracting the most relevant document from the corpus to provide context to the LLM. In this paper, we propose the 'Blended RAG' method of leveraging semantic search techniques, such as Dense Vector indexes and Sparse Encoder indexes, blended with hybrid query strategies. Our study achieves better retrieval results and sets new benchmarks for IR (Information Retrieval) datasets like NQ and TREC-COVID datasets. We further extend such a 'Blended Retriever' to the RAG system to demonstrate far superior results on Generative Q\\&A datasets like SQUAD, even surpassing fine-tuning performance.",
    "source_database": "arxiv",
    "arxiv_id": "2404.07220v2"
  },
  {
    "title": "Investigating Retrieval-Augmented Generation in Quranic Studies: A Study of 13 Open-Source Large Language Models",
    "authors": [
      "Zahra Khalila",
      "Arbi Haza Nasution",
      "Winda Monika",
      "Aytug Onan",
      "Yohei Murakami",
      "Yasir Bin Ismail Radi",
      "Noor Mohammad Osmani"
    ],
    "year": "2025",
    "journal": "arXiv:2503.16581v1",
    "doi": "https://doi.org/10.14569/IJACSA.2025.01602134",
    "abstract": "Accurate and contextually faithful responses are critical when applying large language models (LLMs) to sensitive and domain-specific tasks, such as answering queries related to quranic studies. General-purpose LLMs often struggle with hallucinations, where generated responses deviate from authoritative sources, raising concerns about their reliability in religious contexts. This challenge highlights the need for systems that can integrate domain-specific knowledge while maintaining response accuracy, relevance, and faithfulness. In this study, we investigate 13 open-source LLMs categorized into large (e.g., Llama3:70b, Gemma2:27b, QwQ:32b), medium (e.g., Gemma2:9b, Llama3:8b), and small (e.g., Llama3.2:3b, Phi3:3.8b). A Retrieval-Augmented Generation (RAG) is used to make up for the problems that come with using separate models. This research utilizes a descriptive dataset of Quranic surahs including the meanings, historical context, and qualities of the 114 surahs, allowing the model to gather relevant knowledge before responding. The models are evaluated using three key metrics set by human evaluators: context relevance, answer faithfulness, and answer relevance. The findings reveal that large models consistently outperform smaller models in capturing query semantics and producing accurate, contextually grounded responses. The Llama3.2:3b model, even though it is considered small, does very well on faithfulness (4.619) and relevance (4.857), showing the promise of smaller architectures that have been well optimized. This article examines the trade-offs between model size, computational efficiency, and response quality while using LLMs in domain-specific applications.",
    "source_database": "arxiv",
    "arxiv_id": "2503.16581v1"
  },
  {
    "title": "Video Enriched Retrieval Augmented Generation Using Aligned Video Captions",
    "authors": [
      "Kevin Dela Rosa"
    ],
    "year": "2024",
    "journal": "arXiv:2405.17706v1",
    "doi": "",
    "abstract": "In this work, we propose the use of \"aligned visual captions\" as a mechanism for integrating information contained within videos into retrieval augmented generation (RAG) based chat assistant systems. These captions are able to describe the visual and audio content of videos in a large corpus while having the advantage of being in a textual format that is both easy to reason about & incorporate into large language model (LLM) prompts, but also typically require less multimedia content to be inserted into the multimodal LLM context window, where typical configurations can aggressively fill up the context window by sampling video frames from the source video. Furthermore, visual captions can be adapted to specific use cases by prompting the original foundational model / captioner for particular visual details or fine tuning. In hopes of helping advancing progress in this area, we curate a dataset and describe automatic evaluation procedures on common RAG tasks.",
    "source_database": "arxiv",
    "arxiv_id": "2405.17706v1"
  },
  {
    "title": "Expert Mind: A Retrieval-Augmented Architecture for Expert Knowledge Preservation in the Energy Sector",
    "authors": [
      "Diego Ezequiel Cervera"
    ],
    "year": "2026",
    "journal": "arXiv:2603.14541v1",
    "doi": "",
    "abstract": "The departure of subject-matter experts from industrial organizations results in the irreversible loss of tacit knowledge that is rarely captured through conventional documentation practices. This paper proposes Expert Mind, an experimental system that leverages Retrieval-Augmented Generation (RAG), large language models (LLMs), and multimodal capture techniques to preserve, structure, and make queryable the deep expertise of organizational knowledge holders. Drawing on the specific context of the energy sector, where decades of operational experience risk being lost to an aging workforce, we describe the system architecture, processing pipeline, ethical framework, and evaluation methodology. The proposed system addresses the knowledge elicitation problem through structured interviews, think-aloud sessions, and text corpus ingestion, which are subsequently embedded into a vector store and queried through a conversational interface. Preliminary design considerations suggest Expert Mind can significantly reduce knowledge transfer latency and improve onboarding efficiency. Ethical dimensions including informed consent, intellectual property, and the right to erasure are addressed as first-class design constraints.",
    "source_database": "arxiv",
    "arxiv_id": "2603.14541v1"
  },
  {
    "title": "Fact-Controlled Diagnosis of Hallucinations in Medical Text Summarization",
    "authors": [
      "Suhas BN",
      "Han-Chin Shing",
      "Lei Xu",
      "Mitch Strong",
      "Jon Burnsky",
      "Jessica Ofor",
      "Jordan R. Mason",
      "Susan Chen",
      "Sundararajan Srinivasan",
      "Chaitanya Shivade",
      "Jack Moriarty",
      "Joseph Paul Cohen"
    ],
    "year": "2025",
    "journal": "arXiv:2506.00448v1",
    "doi": "",
    "abstract": "Hallucinations in large language models (LLMs) during summarization of patient-clinician dialogues pose significant risks to patient care and clinical decision-making. However, the phenomenon remains understudied in the clinical domain, with uncertainty surrounding the applicability of general-domain hallucination detectors. The rarity and randomness of hallucinations further complicate their investigation. In this paper, we conduct an evaluation of hallucination detection methods in the medical domain, and construct two datasets for the purpose: A fact-controlled Leave-N-out dataset -- generated by systematically removing facts from source dialogues to induce hallucinated content in summaries; and a natural hallucination dataset -- arising organically during LLM-based medical summarization. We show that general-domain detectors struggle to detect clinical hallucinations, and that performance on fact-controlled hallucinations does not reliably predict effectiveness on natural hallucinations. We then develop fact-based approaches that count hallucinations, offering explainability not available with existing methods. Notably, our LLM-based detectors, which we developed using fact-controlled hallucinations, generalize well to detecting real-world clinical hallucinations. This research contributes a suite of specialized metrics supported by expert-annotated datasets to advance faithful clinical summarization systems.",
    "source_database": "arxiv",
    "arxiv_id": "2506.00448v1"
  },
  {
    "title": "Contradictions in Context: Challenges for Retrieval-Augmented Generation in Healthcare",
    "authors": [
      "Saeedeh Javadi",
      "Sara Mirabi",
      "Manan Gangar",
      "Bahadorreza Ofoghi"
    ],
    "year": "2025",
    "journal": "arXiv:2511.06668v2",
    "doi": "",
    "abstract": "In high-stakes information domains such as healthcare, where large language models (LLMs) can produce hallucinations or misinformation, retrieval-augmented generation (RAG) has been proposed as a mitigation strategy, grounding model outputs in external, domain-specific documents. Yet, this approach can introduce errors when source documents contain outdated or contradictory information. This work investigates the performance of five LLMs in generating RAG-based responses to medicine-related queries. Our contributions are three-fold: i) the creation of a benchmark dataset using consumer medicine information documents from the Australian Therapeutic Goods Administration (TGA), where headings are repurposed as natural language questions, ii) the retrieval of PubMed abstracts using TGA headings, stratified across multiple publication years, to enable controlled temporal evaluation of outdated evidence, and iii) a comparative analysis of the frequency and impact of outdated or contradictory content on model-generated responses, assessing how LLMs integrate and reconcile temporally inconsistent information. Our findings show that contradictions between highly similar abstracts do, in fact, degrade performance, leading to inconsistencies and reduced factual accuracy in model answers. These results highlight that retrieval similarity alone is insufficient for reliable medical RAG and underscore the need for contradiction-aware filtering strategies to ensure trustworthy responses in high-stakes domains.",
    "source_database": "arxiv",
    "arxiv_id": "2511.06668v2"
  },
  {
    "title": "Tree of Reviews: A Tree-based Dynamic Iterative Retrieval Framework for Multi-hop Question Answering",
    "authors": [
      "Li Jiapeng",
      "Liu Runze",
      "Li Yabo",
      "Zhou Tong",
      "Li Mingling",
      "Chen Xiang"
    ],
    "year": "2024",
    "journal": "arXiv:2404.14464v1",
    "doi": "",
    "abstract": "Multi-hop question answering is a knowledge-intensive complex problem. Large Language Models (LLMs) use their Chain of Thoughts (CoT) capability to reason complex problems step by step, and retrieval-augmentation can effectively alleviate factual errors caused by outdated and unknown knowledge in LLMs. Recent works have introduced retrieval-augmentation in the CoT reasoning to solve multi-hop question answering. However, these chain methods have the following problems: 1) Retrieved irrelevant paragraphs may mislead the reasoning; 2) An error in the chain structure may lead to a cascade of errors.   In this paper, we propose a dynamic retrieval framework called Tree of Reviews (ToR), where the root node is the question, and the other nodes are paragraphs from retrieval, extending different reasoning paths from the root node to other nodes. Our framework dynamically decides to initiate a new search, reject, or accept based on the paragraphs on the reasoning paths. Compared to related work, we introduce a tree structure to handle each retrieved paragraph separately, alleviating the misleading effect of irrelevant paragraphs on the reasoning path; the diversity of reasoning path extension reduces the impact of a single reasoning error on the whole. We conducted experiments on three different multi-hop question answering datasets. The results show that compared to the baseline methods, ToR achieves state-of-the-art performance in both retrieval and response generation. In addition, we propose two tree-based search optimization strategies, pruning and effective expansion, to reduce time overhead and increase the diversity of path extension. We will release our code.",
    "source_database": "arxiv",
    "arxiv_id": "2404.14464v1"
  },
  {
    "title": "The Geometry of Queries: Query-Based Innovations in Retrieval-Augmented Generation for Healthcare QA",
    "authors": [
      "Eric Yang",
      "Jonathan Amar",
      "Jong Ha Lee",
      "Bhawesh Kumar",
      "Yugang Jia"
    ],
    "year": "2024",
    "journal": "arXiv:2407.18044v2",
    "doi": "",
    "abstract": "Deploying Large Language Models (LLMs) for healthcare question answering requires robust methods to ensure accuracy and reliability. This work introduces Query-Based Retrieval Augmented Generation (QB-RAG), a framework for enhancing Retrieval-Augmented Generation (RAG) systems in healthcare question-answering by pre-aligning user queries with a database of curated, answerable questions derived from healthcare content. A key component of QB-RAG is an LLM-based filtering mechanism that ensures that only relevant and answerable questions are included in the database, enabling reliable reference query generation at scale. We provide theoretical motivation for QB-RAG, conduct a comparative analysis of existing retrieval enhancement techniques, and introduce a generalizable, comprehensive evaluation framework that assesses both the retrieval effectiveness and the quality of the generated response based on faithfulness, relevance, and adherence to the guideline. Our empirical evaluation on a healthcare data set demonstrates the superior performance of QB-RAG compared to existing retrieval methods, highlighting its practical value in building trustworthy digital health applications for health question-answering.",
    "source_database": "arxiv",
    "arxiv_id": "2407.18044v2"
  },
  {
    "title": "Retrieval Augmented Thought Process for Private Data Handling in Healthcare",
    "authors": [
      "Thomas Pouplin",
      "Hao Sun",
      "Samuel Holt",
      "Mihaela van der Schaar"
    ],
    "year": "2024",
    "journal": "arXiv:2402.07812v2",
    "doi": "",
    "abstract": "Large Language Models (LLMs) have demonstrated the strong potential to assist both clinicians and the general public with their extensive medical knowledge. However, their application in healthcare is constrained due to concerns about the privacy of data used in training, which prevents the integration of private and personal information because of security and ethical issues. Moreover, if their capabilities can be enhanced with information retrieval to access up-to-date knowledge, the current integration of LLMs with Information retrieval lacks robustness to imperfect retrieval, which can hinder their effectiveness and even reduce overall performance. In this work, we address this challenge by introducing the Retrieval-Augmented Thought Process (RATP). Given access to external knowledge, RATP formulates the thought generation of LLMs as a multiple-step decision process. To optimise such a thought process, RATP leverages Monte-Carlo Tree Search and learns a proxy reward function that permits cost-efficient inference. On a private dataset of electronic medical records, deliberately excluded from any LLM training set, RATP achieves 35% additional accuracy compared to in-context retrieval-augmented generation for the question-answering task.",
    "source_database": "arxiv",
    "arxiv_id": "2402.07812v2"
  },
  {
    "title": "Multi-Task Retrieval-Augmented Text Generation with Relevance Sampling",
    "authors": [
      "Sebastian Hofstätter",
      "Jiecao Chen",
      "Karthik Raman",
      "Hamed Zamani"
    ],
    "year": "2022",
    "journal": "arXiv:2207.03030v1",
    "doi": "",
    "abstract": "This paper studies multi-task training of retrieval-augmented generation models for knowledge-intensive tasks. We propose to clean the training set by utilizing a distinct property of knowledge-intensive generation: The connection of query-answer pairs to items in the knowledge base. We filter training examples via a threshold of confidence on the relevance labels, whether a pair is answerable by the knowledge base or not. We train a single Fusion-in-Decoder (FiD) generator on seven combined tasks of the KILT benchmark. The experimental results suggest that our simple yet effective approach substantially improves competitive baselines on two strongly imbalanced tasks; and shows either smaller improvements or no significant regression on the remaining tasks. Furthermore, we demonstrate our multi-task training with relevance label sampling scales well with increased model capacity and achieves state-of-the-art results in five out of seven KILT tasks.",
    "source_database": "arxiv",
    "arxiv_id": "2207.03030v1"
  },
  {
    "title": "Are Large Language Models Ready for Healthcare? A Comparative Study on Clinical Language Understanding",
    "authors": [
      "Yuqing Wang",
      "Yun Zhao",
      "Linda Petzold"
    ],
    "year": "2023",
    "journal": "arXiv:2304.05368v3",
    "doi": "",
    "abstract": "Large language models (LLMs) have made significant progress in various domains, including healthcare. However, the specialized nature of clinical language understanding tasks presents unique challenges and limitations that warrant further investigation. In this study, we conduct a comprehensive evaluation of state-of-the-art LLMs, namely GPT-3.5, GPT-4, and Bard, within the realm of clinical language understanding tasks. These tasks span a diverse range, including named entity recognition, relation extraction, natural language inference, semantic textual similarity, document classification, and question-answering. We also introduce a novel prompting strategy, self-questioning prompting (SQP), tailored to enhance LLMs' performance by eliciting informative questions and answers pertinent to the clinical scenarios at hand. Our evaluation underscores the significance of task-specific learning strategies and prompting techniques for improving LLMs' effectiveness in healthcare-related tasks. Additionally, our in-depth error analysis on the challenging relation extraction task offers valuable insights into error distribution and potential avenues for improvement using SQP. Our study sheds light on the practical implications of employing LLMs in the specialized domain of healthcare, serving as a foundation for future research and the development of potential applications in healthcare settings.",
    "source_database": "arxiv",
    "arxiv_id": "2304.05368v3"
  },
  {
    "title": "Grounded by Experience: Generative Healthcare Prediction Augmented with Hierarchical Agentic Retrieval",
    "authors": [
      "Chuang Zhao",
      "Hui Tang",
      "Hongke Zhao",
      "Xiaofang Zhou",
      "Xiaomeng Li"
    ],
    "year": "2025",
    "journal": "arXiv:2511.13293v1",
    "doi": "",
    "abstract": "Accurate healthcare prediction is critical for improving patient outcomes and reducing operational costs. Bolstered by growing reasoning capabilities, large language models (LLMs) offer a promising path to enhance healthcare predictions by drawing on their rich parametric knowledge. However, LLMs are prone to factual inaccuracies due to limitations in the reliability and coverage of their embedded knowledge. While retrieval-augmented generation (RAG) frameworks, such as GraphRAG and its variants, have been proposed to mitigate these issues by incorporating external knowledge, they face two key challenges in the healthcare scenario: (1) identifying the clinical necessity to activate the retrieval mechanism, and (2) achieving synergy between the retriever and the generator to craft contextually appropriate retrievals. To address these challenges, we propose GHAR, a \\underline{g}enerative \\underline{h}ierarchical \\underline{a}gentic \\underline{R}AG framework that simultaneously resolves when to retrieve and how to optimize the collaboration between submodules in healthcare. Specifically, for the first challenge, we design a dual-agent architecture comprising Agent-Top and Agent-Low. Agent-Top acts as the primary physician, iteratively deciding whether to rely on parametric knowledge or to initiate retrieval, while Agent-Low acts as the consulting service, summarising all task-relevant knowledge once retrieval was triggered. To tackle the second challenge, we innovatively unify the optimization of both agents within a formal Markov Decision Process, designing diverse rewards to align their shared goal of accurate prediction while preserving their distinct roles. Extensive experiments on three benchmark datasets across three popular tasks demonstrate our superiority over state-of-the-art baselines, highlighting the potential of hierarchical agentic RAG in advancing healthcare systems.",
    "source_database": "arxiv",
    "arxiv_id": "2511.13293v1"
  },
  {
    "title": "DeepCodeSeek: Real-Time API Retrieval for Context-Aware Code Generation",
    "authors": [
      "Esakkivel Esakkiraja",
      "Denis Akhiyarov",
      "Aditya Shanmugham",
      "Chitra Ganapathy"
    ],
    "year": "2025",
    "journal": "arXiv:2509.25716v1",
    "doi": "",
    "abstract": "Current search techniques are limited to standard RAG query-document applications. In this paper, we propose a novel technique to expand the code and index for predicting the required APIs, directly enabling high-quality, end-to-end code generation for auto-completion and agentic AI applications. We address the problem of API leaks in current code-to-code benchmark datasets by introducing a new dataset built from real-world ServiceNow Script Includes that capture the challenge of unclear API usage intent in the code. Our evaluation metrics show that this method achieves 87.86% top-40 retrieval accuracy, allowing the critical context with APIs needed for successful downstream code generation. To enable real-time predictions, we develop a comprehensive post-training pipeline that optimizes a compact 0.6B reranker through synthetic dataset generation, supervised fine-tuning, and reinforcement learning. This approach enables our compact reranker to outperform a much larger 8B model while maintaining 2.5x reduced latency, effectively addressing the nuances of enterprise-specific code without the computational overhead of larger models.",
    "source_database": "arxiv",
    "arxiv_id": "2509.25716v1"
  },
  {
    "title": "Hybrid Retrieval for Hallucination Mitigation in Large Language Models: A Comparative Analysis",
    "authors": [
      "Chandana Sree Mala",
      "Gizem Gezici",
      "Fosca Giannotti"
    ],
    "year": "2025",
    "journal": "arXiv:2504.05324v1",
    "doi": "",
    "abstract": "Large Language Models (LLMs) excel in language comprehension and generation but are prone to hallucinations, producing factually incorrect or unsupported outputs. Retrieval Augmented Generation (RAG) systems address this issue by grounding LLM responses with external knowledge. This study evaluates the relationship between retriever effectiveness and hallucination reduction in LLMs using three retrieval approaches: sparse retrieval based on BM25 keyword search, dense retrieval using semantic search with Sentence Transformers, and a proposed hybrid retrieval module. The hybrid module incorporates query expansion and combines the results of sparse and dense retrievers through a dynamically weighted Reciprocal Rank Fusion score. Using the HaluBench dataset, a benchmark for hallucinations in question answering tasks, we assess retrieval performance with metrics such as mean average precision and normalised discounted cumulative gain, focusing on the relevance of the top three retrieved documents. Results show that the hybrid retriever achieves better relevance scores, outperforming both sparse and dense retrievers. Further evaluation of LLM-generated answers against ground truth using metrics such as accuracy, hallucination rate, and rejection rate reveals that the hybrid retriever achieves the highest accuracy on fails, the lowest hallucination rate, and the lowest rejection rate. These findings highlight the hybrid retriever's ability to enhance retrieval relevance, reduce hallucination rates, and improve LLM reliability, emphasising the importance of advanced retrieval techniques in mitigating hallucinations and improving response accuracy.",
    "source_database": "arxiv",
    "arxiv_id": "2504.05324v1"
  },
  {
    "title": "Detecting Hallucination and Coverage Errors in Retrieval Augmented Generation for Controversial Topics",
    "authors": [
      "Tyler A. Chang",
      "Katrin Tomanek",
      "Jessica Hoffmann",
      "Nithum Thain",
      "Erin van Liemt",
      "Kathleen Meier-Hellstern",
      "Lucas Dixon"
    ],
    "year": "2024",
    "journal": "arXiv:2403.08904v1",
    "doi": "",
    "abstract": "We explore a strategy to handle controversial topics in LLM-based chatbots based on Wikipedia's Neutral Point of View (NPOV) principle: acknowledge the absence of a single true answer and surface multiple perspectives. We frame this as retrieval augmented generation, where perspectives are retrieved from a knowledge base and the LLM is tasked with generating a fluent and faithful response from the given perspectives. As a starting point, we use a deterministic retrieval system and then focus on common LLM failure modes that arise during this approach to text generation, namely hallucination and coverage errors. We propose and evaluate three methods to detect such errors based on (1) word-overlap, (2) salience, and (3) LLM-based classifiers. Our results demonstrate that LLM-based classifiers, even when trained only on synthetic errors, achieve high error detection performance, with ROC AUC scores of 95.3% for hallucination and 90.5% for coverage error detection on unambiguous error cases. We show that when no training data is available, our other methods still yield good results on hallucination (84.0%) and coverage error (85.2%) detection.",
    "source_database": "arxiv",
    "arxiv_id": "2403.08904v1"
  },
  {
    "title": "Explainable Depression Detection in Clinical Interviews with Personalized Retrieval-Augmented Generation",
    "authors": [
      "Linhai Zhang",
      "Ziyang Gao",
      "Deyu Zhou",
      "Yulan He"
    ],
    "year": "2025",
    "journal": "arXiv:2503.01315v1",
    "doi": "",
    "abstract": "Depression is a widespread mental health disorder, and clinical interviews are the gold standard for assessment. However, their reliance on scarce professionals highlights the need for automated detection. Current systems mainly employ black-box neural networks, which lack interpretability, which is crucial in mental health contexts. Some attempts to improve interpretability use post-hoc LLM generation but suffer from hallucination. To address these limitations, we propose RED, a Retrieval-augmented generation framework for Explainable depression Detection. RED retrieves evidence from clinical interview transcripts, providing explanations for predictions. Traditional query-based retrieval systems use a one-size-fits-all approach, which may not be optimal for depression detection, as user backgrounds and situations vary. We introduce a personalized query generation module that combines standard queries with user-specific background inferred by LLMs, tailoring retrieval to individual contexts. Additionally, to enhance LLM performance in social intelligence, we augment LLMs by retrieving relevant knowledge from a social intelligence datastore using an event-centric retriever. Experimental results on the real-world benchmark demonstrate RED's effectiveness compared to neural networks and LLM-based baselines.",
    "source_database": "arxiv",
    "arxiv_id": "2503.01315v1"
  },
  {
    "title": "Using Bottleneck Adapters to Identify Cancer in Clinical Notes under Low-Resource Constraints",
    "authors": [
      "Omid Rohanian",
      "Hannah Jauncey",
      "Mohammadmahdi Nouriborji",
      "Vinod Kumar Chauhan",
      "Bronner P. Gonçalves",
      "Christiana Kartsonaki",
      "ISARIC Clinical Characterisation Group",
      "Laura Merson",
      "David Clifton"
    ],
    "year": "2022",
    "journal": "arXiv:2210.09440v2",
    "doi": "",
    "abstract": "Processing information locked within clinical health records is a challenging task that remains an active area of research in biomedical NLP. In this work, we evaluate a broad set of machine learning techniques ranging from simple RNNs to specialised transformers such as BioBERT on a dataset containing clinical notes along with a set of annotations indicating whether a sample is cancer-related or not.   Furthermore, we specifically employ efficient fine-tuning methods from NLP, namely, bottleneck adapters and prompt tuning, to adapt the models to our specialised task. Our evaluations suggest that fine-tuning a frozen BERT model pre-trained on natural language and with bottleneck adapters outperforms all other strategies, including full fine-tuning of the specialised BioBERT model. Based on our findings, we suggest that using bottleneck adapters in low-resource situations with limited access to labelled data or processing capacity could be a viable strategy in biomedical text mining. The code used in the experiments are going to be made available at https://github.com/omidrohanian/bottleneck-adapters.",
    "source_database": "arxiv",
    "arxiv_id": "2210.09440v2"
  }
]