[
  {
    "title": "Specialized AI and neurosurgeons in niche expertise: a proof-of-concept in neuromodulation with vagus nerve stimulation.",
    "authors": [
      "Barrit S",
      "Ranuzzi G",
      "Fetzer S",
      "Al Barajraji M",
      "Hadwe SE",
      "Zanello M",
      "Ortler M",
      "O'Flaherty J",
      "Massager N",
      "Madsen JR",
      "Dibué M",
      "Carron R"
    ],
    "year": "2025",
    "journal": "Acta neurochirurgica",
    "doi": "10.1007/s00701-025-06610-8",
    "pmid": "40711622",
    "abstract": "Applying large language models (LLM) in specialized medical disciplines presents unique challenges requiring precision, reliability, and domain-specific relevance. We evaluated a specialized LLM-driven system against neurosurgeons in vagus nerve stimulation (VNS) for drug-resistant epilepsy knowledge assessment-a complex neuromodulation therapy requiring transdisciplinary expertise in neural anatomy, epileptic disorders, and device technology.",
    "source_database": "pubmed"
  },
  {
    "title": "Detecting hallucinations in large language models using semantic entropy.",
    "authors": [
      "Farquhar S",
      "Kossen J",
      "Kuhn L",
      "Gal Y"
    ],
    "year": "2024",
    "journal": "Nature",
    "doi": "10.1038/s41586-024-07421-0",
    "pmid": "38898292",
    "abstract": "Large language model (LLM) systems, such as ChatGPT",
    "source_database": "pubmed"
  },
  {
    "title": "PediatricsGPT: Large Language Models as Chinese Medical Assistants for Pediatric Applications",
    "authors": [
      "Dingkang Yang",
      "Jinjie Wei",
      "Dongling Xiao",
      "Shunli Wang",
      "Tong Wu",
      "Gang Li",
      "Mingcheng Li",
      "Shuaibing Wang",
      "Jiawei Chen",
      "Yue Jiang",
      "Qingyao Xu",
      "Ke Li",
      "Peng Zhai",
      "Lihua Zhang"
    ],
    "year": "2024",
    "journal": "arXiv:2405.19266v4",
    "doi": "",
    "abstract": "Developing intelligent pediatric consultation systems offers promising prospects for improving diagnostic efficiency, especially in China, where healthcare resources are scarce. Despite recent advances in Large Language Models (LLMs) for Chinese medicine, their performance is sub-optimal in pediatric applications due to inadequate instruction data and vulnerable training procedures. To address the above issues, this paper builds PedCorpus, a high-quality dataset of over 300,000 multi-task instructions from pediatric textbooks, guidelines, and knowledge graph resources to fulfil diverse diagnostic demands. Upon well-designed PedCorpus, we propose PediatricsGPT, the first Chinese pediatric LLM assistant built on a systematic and robust training pipeline. In the continuous pre-training phase, we introduce a hybrid instruction pre-training mechanism to mitigate the internal-injected knowledge inconsistency of LLMs for medical domain adaptation. Immediately, the full-parameter Supervised Fine-Tuning (SFT) is utilized to incorporate the general medical knowledge schema into the models. After that, we devise a direct following preference optimization to enhance the generation of pediatrician-like humanistic responses. In the parameter-efficient secondary SFT phase, a mixture of universal-specific experts strategy is presented to resolve the competency conflict between medical generalist and pediatric expertise mastery. Extensive results based on the metrics, GPT-4, and doctor evaluations on distinct doctor downstream tasks show that PediatricsGPT consistently outperforms previous Chinese medical LLMs. Our model and dataset will be open-source for community development.",
    "source_database": "arxiv",
    "arxiv_id": "2405.19266v4"
  },
  {
    "title": "Hallucination Detection with Small Language Models",
    "authors": [
      "Ming Cheung"
    ],
    "year": "2025",
    "journal": "arXiv:2506.22486v1",
    "doi": "",
    "abstract": "Since the introduction of ChatGPT, large language models (LLMs) have demonstrated significant utility in various tasks, such as answering questions through retrieval-augmented generation. Context can be retrieved using a vectorized database, serving as a foundation for LLMs to generate responses. However, hallucinations in responses can undermine the reliability of LLMs in practical applications, and they are not easily detectable in the absence of ground truth, particularly in question-and-answer scenarios. This paper proposes a framework that integrates multiple small language models to verify responses generated by LLMs using the retrieved context from a vectorized database. By breaking down the responses into individual sentences and utilizing the probability of generating \"Yes\" tokens from the outputs of multiple models for a given set of questions, responses, and relevant context, hallucinations can be detected. The proposed framework is validated through experiments with real datasets comprising over 100 sets of questions, answers, and contexts, including responses with fully and partially correct sentences. The results demonstrate a 10\\% improvement in F1 scores for detecting correct responses compared to hallucinations, indicating that multiple small language models can be effectively employed for answer verification, providing a scalable and efficient solution for both academic and practical applications.",
    "source_database": "arxiv",
    "arxiv_id": "2506.22486v1"
  },
  {
    "title": "Reviewing Clinical Knowledge in Medical Large Language Models: Training and Beyond",
    "authors": [
      "Qiyuan Li",
      "Haijiang Liu",
      "Caicai Guo",
      "Chao Gao",
      "Deyu Chen",
      "Meng Wang",
      "Feng Gao",
      "Frank van Harmelen",
      "Jinguang Gu"
    ],
    "year": "2025",
    "journal": "arXiv:2502.20988v2",
    "doi": "https://doi.org/10.1016/j.knosys.2025.114215",
    "abstract": "The large-scale development of large language models (LLMs) in medical contexts, such as diagnostic assistance and treatment recommendations, necessitates that these models possess accurate medical knowledge and deliver traceable decision-making processes. Clinical knowledge, encompassing the insights gained from research on the causes, prognosis, diagnosis, and treatment of diseases, has been extensively examined within real-world medical practices. Recently, there has been a notable increase in research efforts aimed at integrating this type of knowledge into LLMs, encompassing not only traditional text and multimodal data integration but also technologies such as knowledge graphs (KGs) and retrieval-augmented generation (RAG). In this paper, we review the various initiatives to embed clinical knowledge into training-based, KG-supported, and RAG-assisted LLMs. We begin by gathering reliable knowledge sources from the medical domain, including databases and datasets. Next, we evaluate implementations for integrating clinical knowledge through specialized datasets and collaborations with external knowledge sources such as KGs and relevant documentation. Furthermore, we discuss the applications of the developed medical LLMs in the industrial sector to assess the disparity between models developed in academic settings and those in industry. We conclude the survey by presenting evaluation systems applicable to relevant tasks and identifying potential challenges facing this field. In this review, we do not aim for completeness, since any ostensibly complete review would soon be outdated. Our goal is to illustrate diversity by selecting representative and accessible items from current research and industry practices, reflecting real-world situations rather than claiming completeness. Thus, we emphasize showcasing diverse approaches.",
    "source_database": "arxiv",
    "arxiv_id": "2502.20988v2"
  },
  {
    "title": "THaMES: An End-to-End Tool for Hallucination Mitigation and Evaluation in Large Language Models",
    "authors": [
      "Mengfei Liang",
      "Archish Arun",
      "Zekun Wu",
      "Cristian Munoz",
      "Jonathan Lutch",
      "Emre Kazim",
      "Adriano Koshiyama",
      "Philip Treleaven"
    ],
    "year": "2024",
    "journal": "arXiv:2409.11353v3",
    "doi": "",
    "abstract": "Hallucination, the generation of factually incorrect content, is a growing challenge in Large Language Models (LLMs). Existing detection and mitigation methods are often isolated and insufficient for domain-specific needs, lacking a standardized pipeline. This paper introduces THaMES (Tool for Hallucination Mitigations and EvaluationS), an integrated framework and library addressing this gap. THaMES offers an end-to-end solution for evaluating and mitigating hallucinations in LLMs, featuring automated test set generation, multifaceted benchmarking, and adaptable mitigation strategies. It automates test set creation from any corpus, ensuring high data quality, diversity, and cost-efficiency through techniques like batch processing, weighted sampling, and counterfactual validation. THaMES assesses a model's ability to detect and reduce hallucinations across various tasks, including text generation and binary classification, applying optimal mitigation strategies like In-Context Learning (ICL), Retrieval Augmented Generation (RAG), and Parameter-Efficient Fine-tuning (PEFT). Evaluations of state-of-the-art LLMs using a knowledge base of academic papers, political news, and Wikipedia reveal that commercial models like GPT-4o benefit more from RAG than ICL, while open-weight models like Llama-3.1-8B-Instruct and Mistral-Nemo gain more from ICL. Additionally, PEFT significantly enhances the performance of Llama-3.1-8B-Instruct in both evaluation tasks.",
    "source_database": "arxiv",
    "arxiv_id": "2409.11353v3"
  },
  {
    "title": "Instruction-tuned Large Language Models for Machine Translation in the Medical Domain",
    "authors": [
      "Miguel Rios"
    ],
    "year": "2024",
    "journal": "arXiv:2408.16440v2",
    "doi": "",
    "abstract": "Large Language Models (LLMs) have shown promising results on machine translation for high resource language pairs and domains. However, in specialised domains (e.g. medical) LLMs have shown lower performance compared to standard neural machine translation models. The consistency in the machine translation of terminology is crucial for users, researchers, and translators in specialised domains. In this study, we compare the performance between baseline LLMs and instruction-tuned LLMs in the medical domain. In addition, we introduce terminology from specialised medical dictionaries into the instruction formatted datasets for fine-tuning LLMs. The instruction-tuned LLMs significantly outperform the baseline models with automatic metrics.",
    "source_database": "arxiv",
    "arxiv_id": "2408.16440v2"
  },
  {
    "title": "Med-HALT: Medical Domain Hallucination Test for Large Language Models",
    "authors": [
      "Ankit Pal",
      "Logesh Kumar Umapathi",
      "Malaikannan Sankarasubbu"
    ],
    "year": "2023",
    "journal": "arXiv:2307.15343v2",
    "doi": "",
    "abstract": "This research paper focuses on the challenges posed by hallucinations in large language models (LLMs), particularly in the context of the medical domain. Hallucination, wherein these models generate plausible yet unverified or incorrect information, can have serious consequences in healthcare applications. We propose a new benchmark and dataset, Med-HALT (Medical Domain Hallucination Test), designed specifically to evaluate and reduce hallucinations. Med-HALT provides a diverse multinational dataset derived from medical examinations across various countries and includes multiple innovative testing modalities. Med-HALT includes two categories of tests reasoning and memory-based hallucination tests, designed to assess LLMs's problem-solving and information retrieval abilities.   Our study evaluated leading LLMs, including Text Davinci, GPT-3.5, LlaMa-2, MPT, and Falcon, revealing significant differences in their performance. The paper provides detailed insights into the dataset, promoting transparency and reproducibility. Through this work, we aim to contribute to the development of safer and more reliable language models in healthcare. Our benchmark can be found at medhalt.github.io",
    "source_database": "arxiv",
    "arxiv_id": "2307.15343v2"
  },
  {
    "title": "MedHallu: A Comprehensive Benchmark for Detecting Medical Hallucinations in Large Language Models",
    "authors": [
      "Shrey Pandit",
      "Jiawei Xu",
      "Junyuan Hong",
      "Zhangyang Wang",
      "Tianlong Chen",
      "Kaidi Xu",
      "Ying Ding"
    ],
    "year": "2025",
    "journal": "arXiv:2502.14302v1",
    "doi": "",
    "abstract": "Advancements in Large Language Models (LLMs) and their increasing use in medical question-answering necessitate rigorous evaluation of their reliability. A critical challenge lies in hallucination, where models generate plausible yet factually incorrect outputs. In the medical domain, this poses serious risks to patient safety and clinical decision-making. To address this, we introduce MedHallu, the first benchmark specifically designed for medical hallucination detection. MedHallu comprises 10,000 high-quality question-answer pairs derived from PubMedQA, with hallucinated answers systematically generated through a controlled pipeline. Our experiments show that state-of-the-art LLMs, including GPT-4o, Llama-3.1, and the medically fine-tuned UltraMedical, struggle with this binary hallucination detection task, with the best model achieving an F1 score as low as 0.625 for detecting \"hard\" category hallucinations. Using bidirectional entailment clustering, we show that harder-to-detect hallucinations are semantically closer to ground truth. Through experiments, we also show incorporating domain-specific knowledge and introducing a \"not sure\" category as one of the answer categories improves the precision and F1 scores by up to 38% relative to baselines.",
    "source_database": "arxiv",
    "arxiv_id": "2502.14302v1"
  },
  {
    "title": "Knowledge-Driven Agentic Scientific Corpus Distillation Framework for Biomedical Large Language Models Training",
    "authors": [
      "Meng Xiao",
      "Xunxin Cai",
      "Qingqing Long",
      "Chengrui Wang",
      "Yuanchun Zhou",
      "Hengshu Zhu"
    ],
    "year": "2025",
    "journal": "arXiv:2504.19565v3",
    "doi": "",
    "abstract": "Corpus distillation for biomedical large language models (LLMs) seeks to address the pressing challenge of insufficient quantity and quality in open-source annotated scientific corpora, which remains a bottleneck for effective LLM training in biomedical research. This paper proposes a knowledge-driven, agentic framework for scientific corpus distillation, tailored explicitly for LLM training in the biomedical domain, addressing the challenge posed by the complex hierarchy of biomedical knowledge. Central to our approach is a collaborative multi-agent architecture, where specialized agents, each guided by the Medical Subject Headings (MeSH) hierarchy, work in concert to autonomously extract, synthesize, and self-evaluate high-quality textual data from vast scientific literature. This agentic framework collectively generates and refines domain-specific question-answer pairs, ensuring comprehensive coverage and consistency with biomedical ontologies while minimizing manual involvement. Extensive experimental results show that language models trained on our multi-agent distilled datasets achieve notable improvements in biomedical question-answering tasks, outperforming both strong life sciences LLM baselines and advanced proprietary models. Notably, our AI-Ready dataset enables Llama3-70B to surpass GPT-4 with MedPrompt and Med-PaLM-2, despite their larger scale. Detailed ablation studies and case analyses further validate the effectiveness and synergy of each agent within the framework, highlighting the potential of multi-agent collaboration in biomedical LLM training.",
    "source_database": "arxiv",
    "arxiv_id": "2504.19565v3"
  },
  {
    "title": "Probabilistic distances-based hallucination detection in LLMs with RAG",
    "authors": [
      "Rodion Oblovatny",
      "Alexandra Kuleshova",
      "Konstantin Polev",
      "Alexey Zaytsev"
    ],
    "year": "2025",
    "journal": "arXiv:2506.09886v2",
    "doi": "",
    "abstract": "Detecting hallucinations in large language models (LLMs) is critical for their safety in many applications. Without proper detection, these systems often provide harmful, unreliable answers. In recent years, LLMs have been actively used in retrieval-augmented generation (RAG) settings. However, hallucinations remain even in this setting, and while numerous hallucination detection methods have been proposed, most approaches are not specifically designed for RAG systems. To overcome this limitation, we introduce a hallucination detection method based on estimating the distances between the distributions of prompt token embeddings and language model response token embeddings. The method examines the geometric structure of token hidden states to reliably extract a signal of factuality in text, while remaining friendly to long sequences. Extensive experiments demonstrate that our method achieves state-of-the-art or competitive performance. It also has transferability from solving the NLI task to the hallucination detection task, making it a fully unsupervised and efficient method with a competitive performance on the final task.",
    "source_database": "arxiv",
    "arxiv_id": "2506.09886v2"
  },
  {
    "title": "Beneath the Surface: Unveiling Harmful Memes with Multimodal Reasoning Distilled from Large Language Models",
    "authors": [
      "Hongzhan Lin",
      "Ziyang Luo",
      "Jing Ma",
      "Long Chen"
    ],
    "year": "2023",
    "journal": "arXiv:2312.05434v1",
    "doi": "",
    "abstract": "The age of social media is rife with memes. Understanding and detecting harmful memes pose a significant challenge due to their implicit meaning that is not explicitly conveyed through the surface text and image. However, existing harmful meme detection approaches only recognize superficial harm-indicative signals in an end-to-end classification manner but ignore in-depth cognition of the meme text and image. In this paper, we attempt to detect harmful memes based on advanced reasoning over the interplay of multimodal information in memes. Inspired by the success of Large Language Models (LLMs) on complex reasoning, we first conduct abductive reasoning with LLMs. Then we propose a novel generative framework to learn reasonable thoughts from LLMs for better multimodal fusion and lightweight fine-tuning, which consists of two training stages: 1) Distill multimodal reasoning knowledge from LLMs; and 2) Fine-tune the generative framework to infer harmfulness. Extensive experiments conducted on three meme datasets demonstrate that our proposed approach achieves superior performance than state-of-the-art methods on the harmful meme detection task.",
    "source_database": "arxiv",
    "arxiv_id": "2312.05434v1"
  },
  {
    "title": "How do language models learn facts? Dynamics, curricula and hallucinations",
    "authors": [
      "Nicolas Zucchet",
      "Jörg Bornschein",
      "Stephanie Chan",
      "Andrew Lampinen",
      "Razvan Pascanu",
      "Soham De"
    ],
    "year": "2025",
    "journal": "arXiv:2503.21676v2",
    "doi": "",
    "abstract": "Large language models accumulate vast knowledge during pre-training, yet the dynamics governing this acquisition remain poorly understood. This work investigates the learning dynamics of language models on a synthetic factual recall task, uncovering three key findings: First, language models learn in three phases, exhibiting a performance plateau before acquiring precise factual knowledge. Mechanistically, this plateau coincides with the formation of attention-based circuits that support recall. Second, the training data distribution significantly impacts learning dynamics, as imbalanced distributions lead to shorter plateaus. Finally, hallucinations emerge simultaneously with knowledge, and integrating new knowledge into the model through fine-tuning is challenging, as it quickly corrupts its existing parametric memories. Our results emphasize the importance of data distribution in knowledge acquisition and suggest novel data scheduling strategies to accelerate neural network training.",
    "source_database": "arxiv",
    "arxiv_id": "2503.21676v2"
  },
  {
    "title": "Fact-Controlled Diagnosis of Hallucinations in Medical Text Summarization",
    "authors": [
      "Suhas BN",
      "Han-Chin Shing",
      "Lei Xu",
      "Mitch Strong",
      "Jon Burnsky",
      "Jessica Ofor",
      "Jordan R. Mason",
      "Susan Chen",
      "Sundararajan Srinivasan",
      "Chaitanya Shivade",
      "Jack Moriarty",
      "Joseph Paul Cohen"
    ],
    "year": "2025",
    "journal": "arXiv:2506.00448v1",
    "doi": "",
    "abstract": "Hallucinations in large language models (LLMs) during summarization of patient-clinician dialogues pose significant risks to patient care and clinical decision-making. However, the phenomenon remains understudied in the clinical domain, with uncertainty surrounding the applicability of general-domain hallucination detectors. The rarity and randomness of hallucinations further complicate their investigation. In this paper, we conduct an evaluation of hallucination detection methods in the medical domain, and construct two datasets for the purpose: A fact-controlled Leave-N-out dataset -- generated by systematically removing facts from source dialogues to induce hallucinated content in summaries; and a natural hallucination dataset -- arising organically during LLM-based medical summarization. We show that general-domain detectors struggle to detect clinical hallucinations, and that performance on fact-controlled hallucinations does not reliably predict effectiveness on natural hallucinations. We then develop fact-based approaches that count hallucinations, offering explainability not available with existing methods. Notably, our LLM-based detectors, which we developed using fact-controlled hallucinations, generalize well to detecting real-world clinical hallucinations. This research contributes a suite of specialized metrics supported by expert-annotated datasets to advance faithful clinical summarization systems.",
    "source_database": "arxiv",
    "arxiv_id": "2506.00448v1"
  },
  {
    "title": "Detecting and Evaluating Medical Hallucinations in Large Vision Language Models",
    "authors": [
      "Jiawei Chen",
      "Dingkang Yang",
      "Tong Wu",
      "Yue Jiang",
      "Xiaolu Hou",
      "Mingcheng Li",
      "Shunli Wang",
      "Dongling Xiao",
      "Ke Li",
      "Lihua Zhang"
    ],
    "year": "2024",
    "journal": "arXiv:2406.10185v1",
    "doi": "",
    "abstract": "Large Vision Language Models (LVLMs) are increasingly integral to healthcare applications, including medical visual question answering and imaging report generation. While these models inherit the robust capabilities of foundational Large Language Models (LLMs), they also inherit susceptibility to hallucinations-a significant concern in high-stakes medical contexts where the margin for error is minimal. However, currently, there are no dedicated methods or benchmarks for hallucination detection and evaluation in the medical field. To bridge this gap, we introduce Med-HallMark, the first benchmark specifically designed for hallucination detection and evaluation within the medical multimodal domain. This benchmark provides multi-tasking hallucination support, multifaceted hallucination data, and hierarchical hallucination categorization. Furthermore, we propose the MediHall Score, a new medical evaluative metric designed to assess LVLMs' hallucinations through a hierarchical scoring system that considers the severity and type of hallucination, thereby enabling a granular assessment of potential clinical impacts. We also present MediHallDetector, a novel Medical LVLM engineered for precise hallucination detection, which employs multitask training for hallucination detection. Through extensive experimental evaluations, we establish baselines for popular LVLMs using our benchmark. The findings indicate that MediHall Score provides a more nuanced understanding of hallucination impacts compared to traditional metrics and demonstrate the enhanced performance of MediHallDetector. We hope this work can significantly improve the reliability of LVLMs in medical applications. All resources of this work will be released soon.",
    "source_database": "arxiv",
    "arxiv_id": "2406.10185v1"
  },
  {
    "title": "Unifying Large Language Models and Knowledge Graphs: A Roadmap",
    "authors": [
      "Shirui Pan",
      "Linhao Luo",
      "Yufei Wang",
      "Chen Chen",
      "Jiapu Wang",
      "Xindong Wu"
    ],
    "year": "2023",
    "journal": "arXiv:2306.08302v3",
    "doi": "https://doi.org/10.1109/TKDE.2024.3352100",
    "abstract": "Large language models (LLMs), such as ChatGPT and GPT4, are making new waves in the field of natural language processing and artificial intelligence, due to their emergent ability and generalizability. However, LLMs are black-box models, which often fall short of capturing and accessing factual knowledge. In contrast, Knowledge Graphs (KGs), Wikipedia and Huapu for example, are structured knowledge models that explicitly store rich factual knowledge. KGs can enhance LLMs by providing external knowledge for inference and interpretability. Meanwhile, KGs are difficult to construct and evolving by nature, which challenges the existing methods in KGs to generate new facts and represent unseen knowledge. Therefore, it is complementary to unify LLMs and KGs together and simultaneously leverage their advantages. In this article, we present a forward-looking roadmap for the unification of LLMs and KGs. Our roadmap consists of three general frameworks, namely, 1) KG-enhanced LLMs, which incorporate KGs during the pre-training and inference phases of LLMs, or for the purpose of enhancing understanding of the knowledge learned by LLMs; 2) LLM-augmented KGs, that leverage LLMs for different KG tasks such as embedding, completion, construction, graph-to-text generation, and question answering; and 3) Synergized LLMs + KGs, in which LLMs and KGs play equal roles and work in a mutually beneficial way to enhance both LLMs and KGs for bidirectional reasoning driven by both data and knowledge. We review and summarize existing efforts within these three frameworks in our roadmap and pinpoint their future research directions.",
    "source_database": "arxiv",
    "arxiv_id": "2306.08302v3"
  },
  {
    "title": "AutoHall: Automated Factuality Hallucination Dataset Generation for Large Language Models",
    "authors": [
      "Zouying Cao",
      "Yifei Yang",
      "XiaoJing Li",
      "Hai Zhao"
    ],
    "year": "2023",
    "journal": "arXiv:2310.00259v3",
    "doi": "",
    "abstract": "Large language models (LLMs) have gained broad applications across various domains but still struggle with hallucinations. Currently, hallucinations occur frequently in the generation of factual content and pose a great challenge to trustworthy LLMs. However, hallucination detection is hindered by the laborious and expensive manual annotation of hallucinatory content. Meanwhile, as different LLMs exhibit distinct types and rates of hallucination, the collection of hallucination datasets is inherently model-specific, which also increases the cost. To address this issue, this paper proposes a method called $\\textbf{AutoHall}$ for $\\underline{Auto}$matically constructing model-specific $\\underline{Hall}$ucination datasets based on existing fact-checking datasets. The empirical results reveal variations in hallucination proportions and types among different models. Moreover, we introduce a zero-resource and black-box hallucination detection method based on self-contradiction to recognize the hallucination in our constructed dataset, achieving superior detection performance compared to baselines. Further analysis on our dataset provides insight into factors that may contribute to LLM hallucinations. Our codes and datasets are publicly available at https://github.com/zouyingcao/AutoHall.",
    "source_database": "arxiv",
    "arxiv_id": "2310.00259v3"
  },
  {
    "title": "Learning From Failure: Integrating Negative Examples when Fine-tuning Large Language Models as Agents",
    "authors": [
      "Renxi Wang",
      "Haonan Li",
      "Xudong Han",
      "Yixuan Zhang",
      "Timothy Baldwin"
    ],
    "year": "2024",
    "journal": "arXiv:2402.11651v2",
    "doi": "",
    "abstract": "Large language models (LLMs) have achieved success in acting as agents, which interact with environments through tools such as search engines. However, LLMs are optimized for language generation instead of tool use during training or alignment, limiting their effectiveness as agents. To resolve this problem, previous work has first collected interaction trajectories between LLMs and environments, using only trajectories that successfully finished the task to fine-tune smaller models, making fine-tuning data scarce and acquiring it both difficult and costly. Discarding failed trajectories also leads to significant wastage of data and resources and limits the possible optimization paths during fine-tuning. In this paper, we argue that unsuccessful trajectories offer valuable insights, and LLMs can learn from these trajectories through appropriate quality control and fine-tuning strategies. By simply adding a prefix or suffix that tells the model whether to generate a successful trajectory during training, we improve model performance by a large margin on mathematical reasoning, multi-hop question answering, and strategic question answering tasks. We further analyze the inference results and find that our method provides a better trade-off between valuable information and errors in unsuccessful trajectories. To our knowledge, we are the first to demonstrate the value of negative trajectories and their application in agent-tunning scenarios. Our findings offer guidance for developing better agent-tuning methods and low-resource data usage techniques.",
    "source_database": "arxiv",
    "arxiv_id": "2402.11651v2"
  },
  {
    "title": "Investigating Retrieval-Augmented Generation in Quranic Studies: A Study of 13 Open-Source Large Language Models",
    "authors": [
      "Zahra Khalila",
      "Arbi Haza Nasution",
      "Winda Monika",
      "Aytug Onan",
      "Yohei Murakami",
      "Yasir Bin Ismail Radi",
      "Noor Mohammad Osmani"
    ],
    "year": "2025",
    "journal": "arXiv:2503.16581v1",
    "doi": "https://doi.org/10.14569/IJACSA.2025.01602134",
    "abstract": "Accurate and contextually faithful responses are critical when applying large language models (LLMs) to sensitive and domain-specific tasks, such as answering queries related to quranic studies. General-purpose LLMs often struggle with hallucinations, where generated responses deviate from authoritative sources, raising concerns about their reliability in religious contexts. This challenge highlights the need for systems that can integrate domain-specific knowledge while maintaining response accuracy, relevance, and faithfulness. In this study, we investigate 13 open-source LLMs categorized into large (e.g., Llama3:70b, Gemma2:27b, QwQ:32b), medium (e.g., Gemma2:9b, Llama3:8b), and small (e.g., Llama3.2:3b, Phi3:3.8b). A Retrieval-Augmented Generation (RAG) is used to make up for the problems that come with using separate models. This research utilizes a descriptive dataset of Quranic surahs including the meanings, historical context, and qualities of the 114 surahs, allowing the model to gather relevant knowledge before responding. The models are evaluated using three key metrics set by human evaluators: context relevance, answer faithfulness, and answer relevance. The findings reveal that large models consistently outperform smaller models in capturing query semantics and producing accurate, contextually grounded responses. The Llama3.2:3b model, even though it is considered small, does very well on faithfulness (4.619) and relevance (4.857), showing the promise of smaller architectures that have been well optimized. This article examines the trade-offs between model size, computational efficiency, and response quality while using LLMs in domain-specific applications.",
    "source_database": "arxiv",
    "arxiv_id": "2503.16581v1"
  },
  {
    "title": "Efficient Hallucination Detection: Adaptive Bayesian Estimation of Semantic Entropy with Guided Semantic Exploration",
    "authors": [
      "Qiyao Sun",
      "Xingming Li",
      "Xixiang He",
      "Ao Cheng",
      "Xuanyu Ji",
      "Hailun Lu",
      "Runke Huang",
      "Qingyong Hu"
    ],
    "year": "2026",
    "journal": "arXiv:2603.22812v1",
    "doi": "",
    "abstract": "Large language models (LLMs) have achieved remarkable success in various natural language processing tasks, yet they remain prone to generating factually incorrect outputs known as hallucinations. While recent approaches have shown promise for hallucination detection by repeatedly sampling from LLMs and quantifying the semantic inconsistency among the generated responses, they rely on fixed sampling budgets that fail to adapt to query complexity, resulting in computational inefficiency. We propose an Adaptive Bayesian Estimation framework for Semantic Entropy with Guided Semantic Exploration, which dynamically adjusts sampling requirements based on observed uncertainty. Our approach employs a hierarchical Bayesian framework to model the semantic distribution, enabling dynamic control of sampling iterations through variance-based thresholds that terminate generation once sufficient certainty is achieved. We also develop a perturbation-based importance sampling strategy to systematically explore the semantic space. Extensive experiments on four QA datasets demonstrate that our method achieves superior hallucination detection performance with significant efficiency gains. In low-budget scenarios, our approach requires about 50% fewer samples to achieve comparable detection performance to existing methods, while delivers an average AUROC improvement of 12.6% under the same sampling budget.",
    "source_database": "arxiv",
    "arxiv_id": "2603.22812v1"
  },
  {
    "title": "Towards Omni-RAG: Comprehensive Retrieval-Augmented Generation for Large Language Models in Medical Applications",
    "authors": [
      "Zhe Chen",
      "Yusheng Liao",
      "Shuyang Jiang",
      "Pingjie Wang",
      "Yiqiu Guo",
      "Yanfeng Wang",
      "Yu Wang"
    ],
    "year": "2025",
    "journal": "arXiv:2501.02460v3",
    "doi": "",
    "abstract": "Large language models hold promise for addressing medical challenges, such as medical diagnosis reasoning, research knowledge acquisition, clinical decision-making, and consumer health inquiry support. However, they often generate hallucinations due to limited medical knowledge. Incorporating external knowledge is therefore critical, which necessitates multi-source knowledge acquisition. We address this challenge by framing it as a source planning problem, which is to formulate context-appropriate queries tailored to the attributes of diverse sources. Existing approaches either overlook source planning or fail to achieve it effectively due to misalignment between the model's expectation of the sources and their actual content. To bridge this gap, we present MedOmniKB, a repository comprising multigenre and multi-structured medical knowledge sources. Leveraging these sources, we propose the Source Planning Optimisation method, which enhances multi-source utilisation. Our approach involves enabling an expert model to explore and evaluate potential plans while training a smaller model to learn source alignment. Experimental results demonstrate that our method substantially improves multi-source planning performance, enabling the optimised small model to achieve state-of-the-art results in leveraging diverse medical knowledge sources.",
    "source_database": "arxiv",
    "arxiv_id": "2501.02460v3"
  },
  {
    "title": "Evaluating Class Membership Relations in Knowledge Graphs using Large Language Models",
    "authors": [
      "Bradley P. Allen",
      "Paul T. Groth"
    ],
    "year": "2024",
    "journal": "arXiv:2404.17000v1",
    "doi": "",
    "abstract": "A backbone of knowledge graphs are their class membership relations, which assign entities to a given class. As part of the knowledge engineering process, we propose a new method for evaluating the quality of these relations by processing descriptions of a given entity and class using a zero-shot chain-of-thought classifier that uses a natural language intensional definition of a class. We evaluate the method using two publicly available knowledge graphs, Wikidata and CaLiGraph, and 7 large language models. Using the gpt-4-0125-preview large language model, the method's classification performance achieves a macro-averaged F1-score of 0.830 on data from Wikidata and 0.893 on data from CaLiGraph. Moreover, a manual analysis of the classification errors shows that 40.9% of errors were due to the knowledge graphs, with 16.0% due to missing relations and 24.9% due to incorrectly asserted relations. These results show how large language models can assist knowledge engineers in the process of knowledge graph refinement. The code and data are available on Github.",
    "source_database": "arxiv",
    "arxiv_id": "2404.17000v1"
  },
  {
    "title": "SADM: Sequence-Aware Diffusion Model for Longitudinal Medical Image Generation",
    "authors": [
      "Jee Seok Yoon",
      "Chenghao Zhang",
      "Heung-Il Suk",
      "Jia Guo",
      "Xiaoxiao Li"
    ],
    "year": "2022",
    "journal": "arXiv:2212.08228v2",
    "doi": "https://doi.org/10.1007/978-3-031-34048-2_30",
    "abstract": "Human organs constantly undergo anatomical changes due to a complex mix of short-term (e.g., heartbeat) and long-term (e.g., aging) factors. Evidently, prior knowledge of these factors will be beneficial when modeling their future state, i.e., via image generation. However, most of the medical image generation tasks only rely on the input from a single image, thus ignoring the sequential dependency even when longitudinal data is available. Sequence-aware deep generative models, where model input is a sequence of ordered and timestamped images, are still underexplored in the medical imaging domain that is featured by several unique challenges: 1) Sequences with various lengths; 2) Missing data or frame, and 3) High dimensionality. To this end, we propose a sequence-aware diffusion model (SADM) for the generation of longitudinal medical images. Recently, diffusion models have shown promising results in high-fidelity image generation. Our method extends this new technique by introducing a sequence-aware transformer as the conditional module in a diffusion model. The novel design enables learning longitudinal dependency even with missing data during training and allows autoregressive generation of a sequence of images during inference. Our extensive experiments on 3D longitudinal medical images demonstrate the effectiveness of SADM compared with baselines and alternative methods. The code is available at https://github.com/ubc-tea/SADM-Longitudinal-Medical-Image-Generation.",
    "source_database": "arxiv",
    "arxiv_id": "2212.08228v2"
  },
  {
    "title": "Commander-GPT: Fully Unleashing the Sarcasm Detection Capability of Multi-Modal Large Language Models",
    "authors": [
      "Yazhou Zhang",
      "Chunwang Zou",
      "Bo Wang",
      "Jing Qin"
    ],
    "year": "2025",
    "journal": "arXiv:2503.18681v3",
    "doi": "",
    "abstract": "Sarcasm detection, as a crucial research direction in the field of Natural Language Processing (NLP), has attracted widespread attention. Traditional sarcasm detection tasks have typically focused on single-modal approaches (e.g., text), but due to the implicit and subtle nature of sarcasm, such methods often fail to yield satisfactory results. In recent years, researchers have shifted the focus of sarcasm detection to multi-modal approaches. However, effectively leveraging multi-modal information to accurately identify sarcastic content remains a challenge that warrants further exploration. Leveraging the powerful integrated processing capabilities of Multi-Modal Large Language Models (MLLMs) for various information sources, we propose an innovative multi-modal Commander-GPT framework. Inspired by military strategy, we first decompose the sarcasm detection task into six distinct sub-tasks. A central commander (decision-maker) then assigns the best-suited large language model to address each specific sub-task. Ultimately, the detection results from each model are aggregated to identify sarcasm. We conducted extensive experiments on MMSD and MMSD 2.0, utilizing four multi-modal large language models and six prompting strategies. Our experiments demonstrate that our approach achieves state-of-the-art performance, with a 19.3% improvement in F1 score, without necessitating fine-tuning or ground-truth rationales.",
    "source_database": "arxiv",
    "arxiv_id": "2503.18681v3"
  },
  {
    "title": "HuaTuo: Tuning LLaMA Model with Chinese Medical Knowledge",
    "authors": [
      "Haochun Wang",
      "Chi Liu",
      "Nuwa Xi",
      "Zewen Qiang",
      "Sendong Zhao",
      "Bing Qin",
      "Ting Liu"
    ],
    "year": "2023",
    "journal": "arXiv:2304.06975v1",
    "doi": "",
    "abstract": "Large Language Models (LLMs), such as the LLaMA model, have demonstrated their effectiveness in various general-domain natural language processing (NLP) tasks. Nevertheless, LLMs have not yet performed optimally in biomedical domain tasks due to the need for medical expertise in the responses. In response to this challenge, we propose HuaTuo, a LLaMA-based model that has been supervised-fine-tuned with generated QA (Question-Answer) instances. The experimental results demonstrate that HuaTuo generates responses that possess more reliable medical knowledge. Our proposed HuaTuo model is accessible at https://github.com/SCIR-HI/Huatuo-Llama-Med-Chinese.",
    "source_database": "arxiv",
    "arxiv_id": "2304.06975v1"
  },
  {
    "title": "Reducing Hallucinations of Medical Multimodal Large Language Models with Visual Retrieval-Augmented Generation",
    "authors": [
      "Yun-Wei Chu",
      "Kai Zhang",
      "Christopher Malon",
      "Martin Renqiang Min"
    ],
    "year": "2025",
    "journal": "arXiv:2502.15040v1",
    "doi": "",
    "abstract": "Multimodal Large Language Models (MLLMs) have shown impressive performance in vision and text tasks. However, hallucination remains a major challenge, especially in fields like healthcare where details are critical. In this work, we show how MLLMs may be enhanced to support Visual RAG (V-RAG), a retrieval-augmented generation framework that incorporates both text and visual data from retrieved images. On the MIMIC-CXR chest X-ray report generation and Multicare medical image caption generation datasets, we show that Visual RAG improves the accuracy of entity probing, which asks whether a medical entities is grounded by an image. We show that the improvements extend both to frequent and rare entities, the latter of which may have less positive training data. Downstream, we apply V-RAG with entity probing to correct hallucinations and generate more clinically accurate X-ray reports, obtaining a higher RadGraph-F1 score.",
    "source_database": "arxiv",
    "arxiv_id": "2502.15040v1"
  },
  {
    "title": "Align, Reason and Learn: Enhancing Medical Vision-and-Language Pre-training with Knowledge",
    "authors": [
      "Zhihong Chen",
      "Guanbin Li",
      "Xiang Wan"
    ],
    "year": "2022",
    "journal": "arXiv:2209.07118v1",
    "doi": "",
    "abstract": "Medical vision-and-language pre-training (Med-VLP) has received considerable attention owing to its applicability to extracting generic vision-and-language representations from medical images and texts. Most existing methods mainly contain three elements: uni-modal encoders (i.e., a vision encoder and a language encoder), a multi-modal fusion module, and pretext tasks, with few studies considering the importance of medical domain expert knowledge and explicitly exploiting such knowledge to facilitate Med-VLP. Although there exist knowledge-enhanced vision-and-language pre-training (VLP) methods in the general domain, most require off-the-shelf toolkits (e.g., object detectors and scene graph parsers), which are unavailable in the medical domain. In this paper, we propose a systematic and effective approach to enhance Med-VLP by structured medical knowledge from three perspectives. First, considering knowledge can be regarded as the intermediate medium between vision and language, we align the representations of the vision encoder and the language encoder through knowledge. Second, we inject knowledge into the multi-modal fusion model to enable the model to perform reasoning using knowledge as the supplementation of the input image and text. Third, we guide the model to put emphasis on the most critical information in images and texts by designing knowledge-induced pretext tasks. To perform a comprehensive evaluation and facilitate further research, we construct a medical vision-and-language benchmark including three tasks. Experimental results illustrate the effectiveness of our approach, where state-of-the-art performance is achieved on all downstream tasks. Further analyses explore the effects of different components of our approach and various settings of pre-training.",
    "source_database": "arxiv",
    "arxiv_id": "2209.07118v1"
  },
  {
    "title": "Demystifying Instruction Mixing for Fine-tuning Large Language Models",
    "authors": [
      "Renxi Wang",
      "Haonan Li",
      "Minghao Wu",
      "Yuxia Wang",
      "Xudong Han",
      "Chiyu Zhang",
      "Timothy Baldwin"
    ],
    "year": "2023",
    "journal": "arXiv:2312.10793v3",
    "doi": "",
    "abstract": "Instruction tuning significantly enhances the performance of large language models (LLMs) across various tasks. However, the procedure to optimizing the mixing of instruction datasets for LLM fine-tuning is still poorly understood. This study categorizes instructions into three primary types: NLP downstream tasks, coding, and general chat. We explore the effects of instruction tuning on different combinations of datasets on LLM performance, and find that certain instruction types are more advantageous for specific applications but can negatively impact other areas. This work provides insights into instruction mixtures, laying the foundations for future research.",
    "source_database": "arxiv",
    "arxiv_id": "2312.10793v3"
  },
  {
    "title": "Proficient Graph Neural Network Design by Accumulating Knowledge on Large Language Models",
    "authors": [
      "Jialiang Wang",
      "Hanmo Liu",
      "Shimin Di",
      "Zhili Wang",
      "Jiachuan Wang",
      "Lei Chen",
      "Xiaofang Zhou"
    ],
    "year": "2024",
    "journal": "arXiv:2408.06717v3",
    "doi": "https://doi.org/10.1145/3773966.3777982",
    "abstract": "High-level automation is increasingly critical in AI, driven by rapid advances in large language models (LLMs) and AI agents. However, LLMs, despite their general reasoning power, struggle significantly in specialized, data-sensitive tasks such as designing Graph Neural Networks (GNNs). This difficulty arises from (1) the inherent knowledge gaps in modeling the intricate, varying relationships between graph properties and suitable architectures and (2) the external noise from misleading descriptive inputs, often resulting in generic or even misleading model suggestions. Achieving proficiency in designing data-aware models -- defined as the meta-level capability to systematically accumulate, interpret, and apply data-specific design knowledge -- remains challenging for existing automated approaches, due to their inefficient construction and application of meta-knowledge. To achieve meta-level proficiency, we propose DesiGNN, a knowledge-centered framework that systematically converts past model design experience into structured, fine-grained knowledge priors well-suited for meta-learning with LLMs. To account for the inherent variability and external noise, DesiGNN aligns empirical property filtering from extensive benchmarks with adaptive elicitation of literature insights via LLMs. By constructing a solid meta-knowledge between unseen graph understanding and known effective architecture patterns, DesiGNN can deliver top-5.77% initial model proposals for unseen datasets within seconds and achieve consistently superior performance with minimal search cost compared to baselines.",
    "source_database": "arxiv",
    "arxiv_id": "2408.06717v3"
  },
  {
    "title": "Can ChatGPT be Your Personal Medical Assistant?",
    "authors": [
      "Md. Rafiul Biswas",
      "Ashhadul Islam",
      "Zubair Shah",
      "Wajdi Zaghouani",
      "Samir Brahim Belhaouari"
    ],
    "year": "2023",
    "journal": "arXiv:2312.12006v1",
    "doi": "",
    "abstract": "The advanced large language model (LLM) ChatGPT has shown its potential in different domains and remains unbeaten due to its characteristics compared to other LLMs. This study aims to evaluate the potential of using a fine-tuned ChatGPT model as a personal medical assistant in the Arabic language. To do so, this study uses publicly available online questions and answering datasets in Arabic language. There are almost 430K questions and answers for 20 disease-specific categories. GPT-3.5-turbo model was fine-tuned with a portion of this dataset. The performance of this fine-tuned model was evaluated through automated and human evaluation. The automated evaluations include perplexity, coherence, similarity, and token count. Native Arabic speakers with medical knowledge evaluated the generated text by calculating relevance, accuracy, precision, logic, and originality. The overall result shows that ChatGPT has a bright future in medical assistance.",
    "source_database": "arxiv",
    "arxiv_id": "2312.12006v1"
  },
  {
    "title": "Is Self-knowledge and Action Consistent or Not: Investigating Large Language Model's Personality",
    "authors": [
      "Yiming Ai",
      "Zhiwei He",
      "Ziyin Zhang",
      "Wenhong Zhu",
      "Hongkun Hao",
      "Kai Yu",
      "Lingjun Chen",
      "Rui Wang"
    ],
    "year": "2024",
    "journal": "arXiv:2402.14679v2",
    "doi": "",
    "abstract": "In this study, we delve into the validity of conventional personality questionnaires in capturing the human-like personality traits of Large Language Models (LLMs). Our objective is to assess the congruence between the personality traits LLMs claim to possess and their demonstrated tendencies in real-world scenarios. By conducting an extensive examination of LLM outputs against observed human response patterns, we aim to understand the disjunction between self-knowledge and action in LLMs.",
    "source_database": "arxiv",
    "arxiv_id": "2402.14679v2"
  },
  {
    "title": "Soft Inductive Bias Approach via Explicit Reasoning Perspectives in Inappropriate Utterance Detection Using Large Language Models",
    "authors": [
      "Ju-Young Kim",
      "Ji-Hong Park",
      "Se-Yeon Lee",
      "Sujin Park",
      "Gun-Woo Kim"
    ],
    "year": "2025",
    "journal": "arXiv:2512.08480v1",
    "doi": "",
    "abstract": "Recent incidents in certain online games and communities, where anonymity is guaranteed, show that unchecked inappropriate remarks frequently escalate into verbal abuse and even criminal behavior, raising significant social concerns. Consequently, there is a growing need for research on techniques that can detect inappropriate utterances within conversational texts to help build a safer communication environment. Although large-scale language models trained on Korean corpora and chain-of-thought reasoning have recently gained attention, research applying these approaches to inappropriate utterance detection remains limited. In this study, we propose a soft inductive bias approach that explicitly defines reasoning perspectives to guide the inference process, thereby promoting rational decision-making and preventing errors that may arise during reasoning. We fine-tune a Korean large language model using the proposed method and conduct both quantitative performance comparisons and qualitative evaluations across different training strategies. Experimental results show that the Kanana-1.5 model achieves an average accuracy of 87.0046, improving by approximately 3.89 percent over standard supervised learning. These findings indicate that the proposed method goes beyond simple knowledge imitation by large language models and enables more precise and consistent judgments through constrained reasoning perspectives, demonstrating its effectiveness for inappropriate utterance detection.",
    "source_database": "arxiv",
    "arxiv_id": "2512.08480v1"
  },
  {
    "title": "Knowledge-tuning Large Language Models with Structured Medical Knowledge Bases for Reliable Response Generation in Chinese",
    "authors": [
      "Haochun Wang",
      "Sendong Zhao",
      "Zewen Qiang",
      "Zijian Li",
      "Nuwa Xi",
      "Yanrui Du",
      "MuZhen Cai",
      "Haoqiang Guo",
      "Yuhan Chen",
      "Haoming Xu",
      "Bing Qin",
      "Ting Liu"
    ],
    "year": "2023",
    "journal": "arXiv:2309.04175v1",
    "doi": "https://doi.org/10.1145/3686807",
    "abstract": "Large Language Models (LLMs) have demonstrated remarkable success in diverse natural language processing (NLP) tasks in general domains. However, LLMs sometimes generate responses with the hallucination about medical facts due to limited domain knowledge. Such shortcomings pose potential risks in the utilization of LLMs within medical contexts. To address this challenge, we propose knowledge-tuning, which leverages structured medical knowledge bases for the LLMs to grasp domain knowledge efficiently and facilitate reliable response generation. We also release cMedKnowQA, a Chinese medical knowledge question-answering dataset constructed from medical knowledge bases to assess the medical knowledge proficiency of LLMs. Experimental results show that the LLMs which are knowledge-tuned with cMedKnowQA, can exhibit higher levels of accuracy in response generation compared with vanilla instruction-tuning and offer a new reliable way for the domain adaptation of LLMs.",
    "source_database": "arxiv",
    "arxiv_id": "2309.04175v1"
  },
  {
    "title": "WizardLM: Empowering large pre-trained language models to follow complex instructions",
    "authors": [
      "Can Xu",
      "Qingfeng Sun",
      "Kai Zheng",
      "Xiubo Geng",
      "Pu Zhao",
      "Jiazhan Feng",
      "Chongyang Tao",
      "Qingwei Lin",
      "Daxin Jiang"
    ],
    "year": "2023",
    "journal": "arXiv:2304.12244v3",
    "doi": "",
    "abstract": "Training large language models (LLMs) with open-domain instruction following data brings colossal success. However, manually creating such instruction data is very time-consuming and labor-intensive. Moreover, humans may struggle to produce high-complexity instructions. In this paper, we show an avenue for creating large amounts of instruction data with varying levels of complexity using LLM instead of humans. Starting with an initial set of instructions, we use our proposed Evol-Instruct to rewrite them step by step into more complex instructions. Then, we mix all generated instruction data to fine-tune LLaMA. We call the resulting model WizardLM. Human evaluations on a complexity-balanced test bed and Vicuna's testset show that instructions from Evol-Instruct are superior to human-created ones. By analyzing the human evaluation results of the high complexity part, we demonstrate that outputs from our WizardLM are preferred to outputs from OpenAI ChatGPT. In GPT-4 automatic evaluation, WizardLM achieves more than 90\\% capacity of ChatGPT on 17 out of 29 skills. Even though WizardLM still lags behind ChatGPT in some aspects, our findings suggest that fine-tuning with AI-evolved instructions is a promising direction for enhancing LLMs. Our code and data are public at https://github.com/nlpxucan/WizardLM",
    "source_database": "arxiv",
    "arxiv_id": "2304.12244v3"
  },
  {
    "title": "Object Detection with Multimodal Large Vision-Language Models: An In-depth Review",
    "authors": [
      "Ranjan Sapkota",
      "Manoj Karkee"
    ],
    "year": "2025",
    "journal": "arXiv:2508.19294v2",
    "doi": "https://doi.org/10.1016/j.inffus.2025.103575",
    "abstract": "The fusion of language and vision in large vision-language models (LVLMs) has revolutionized deep learning-based object detection by enhancing adaptability, contextual reasoning, and generalization beyond traditional architectures. This in-depth review presents a structured exploration of the state-of-the-art in LVLMs, systematically organized through a three-step research review process. First, we discuss the functioning of vision language models (VLMs) for object detection, describing how these models harness natural language processing (NLP) and computer vision (CV) techniques to revolutionize object detection and localization. We then explain the architectural innovations, training paradigms, and output flexibility of recent LVLMs for object detection, highlighting how they achieve advanced contextual understanding for object detection. The review thoroughly examines the approaches used in integration of visual and textual information, demonstrating the progress made in object detection using VLMs that facilitate more sophisticated object detection and localization strategies. This review presents comprehensive visualizations demonstrating LVLMs' effectiveness in diverse scenarios including localization and segmentation, and then compares their real-time performance, adaptability, and complexity to traditional deep learning systems. Based on the review, its is expected that LVLMs will soon meet or surpass the performance of conventional methods in object detection. The review also identifies a few major limitations of the current LVLM modes, proposes solutions to address those challenges, and presents a clear roadmap for the future advancement in this field. We conclude, based on this study, that the recent advancement in LVLMs have made and will continue to make a transformative impact on object detection and robotic applications in the future.",
    "source_database": "arxiv",
    "arxiv_id": "2508.19294v2"
  },
  {
    "title": "MedHallBench: A New Benchmark for Assessing Hallucination in Medical Large Language Models",
    "authors": [
      "Kaiwen Zuo",
      "Yirui Jiang"
    ],
    "year": "2024",
    "journal": "arXiv:2412.18947v4",
    "doi": "",
    "abstract": "Medical Large Language Models (MLLMs) have demonstrated potential in healthcare applications, yet their propensity for hallucinations -- generating medically implausible or inaccurate information -- presents substantial risks to patient care. This paper introduces MedHallBench, a comprehensive benchmark framework for evaluating and mitigating hallucinations in MLLMs. Our methodology integrates expert-validated medical case scenarios with established medical databases to create a robust evaluation dataset. The framework employs a sophisticated measurement system that combines automated ACHMI (Automatic Caption Hallucination Measurement in Medical Imaging) scoring with rigorous clinical expert evaluations and utilizes reinforcement learning methods to achieve automatic annotation. Through an optimized reinforcement learning from human feedback (RLHF) training pipeline specifically designed for medical applications, MedHallBench enables thorough evaluation of MLLMs across diverse clinical contexts while maintaining stringent accuracy standards. We conducted comparative experiments involving various models, utilizing the benchmark to establish a baseline for widely adopted large language models (LLMs). Our findings indicate that ACHMI provides a more nuanced understanding of the effects of hallucinations compared to traditional metrics, thereby highlighting its advantages in hallucination assessment. This research establishes a foundational framework for enhancing MLLMs' reliability in healthcare settings and presents actionable strategies for addressing the critical challenge of AI hallucinations in medical applications.",
    "source_database": "arxiv",
    "arxiv_id": "2412.18947v4"
  },
  {
    "title": "Quantifying Hallucinations in Language Language Models on Medical Textbooks",
    "authors": [
      "Brandon C. Colelough",
      "Davis Bartels",
      "Dina Demner-Fushman"
    ],
    "year": "2026",
    "journal": "arXiv:2603.09986v1",
    "doi": "",
    "abstract": "Hallucinations, the tendency for large language models to provide responses with factually incorrect and unsupported claims, is a serious problem within natural language processing for which we do not yet have an effective solution to mitigate against. Existing benchmarks for medical QA rarely evaluate this behavior against a fixed evidence source. We ask how often hallucinations occur on textbook-grounded QA and how responses to medical QA prompts vary across models. We conduct two experiments: the first experiment to determine the prevalence of hallucinations for a prominent open source large language model (LLaMA-70B-Instruct) in medical QA given novel prompts, and the second experiment to determine the prevalence of hallucinations and clinician preference to model responses. We observed, in experiment one, with the passages provided, LLaMA-70B-Instruct hallucinated in 19.7\\% of answers (95\\% CI 18.6 to 20.7) even though 98.8\\% of prompt responses received maximal plausibility, and observed in experiment two, across models, lower hallucination rates aligned with higher usefulness scores ($ρ=-0.71$, $p=0.058$). Clinicians produced high agreement (quadratic weighted $κ=0.92$) and ($τ_b=0.06$ to $0.18$, $κ=0.57$ to $0.61$) for experiments 1 and ,2 respectively",
    "source_database": "arxiv",
    "arxiv_id": "2603.09986v1"
  },
  {
    "title": "Knowledge Overshadowing Causes Amalgamated Hallucination in Large Language Models",
    "authors": [
      "Yuji Zhang",
      "Sha Li",
      "Jiateng Liu",
      "Pengfei Yu",
      "Yi R. Fung",
      "Jing Li",
      "Manling Li",
      "Heng Ji"
    ],
    "year": "2024",
    "journal": "arXiv:2407.08039v1",
    "doi": "",
    "abstract": "Hallucination is often regarded as a major impediment for using large language models (LLMs), especially for knowledge-intensive tasks. Even when the training corpus consists solely of true statements, language models still generate hallucinations in the form of amalgamations of multiple facts. We coin this phenomenon as ``knowledge overshadowing'': when we query knowledge from a language model with multiple conditions, some conditions overshadow others, leading to hallucinated outputs. This phenomenon partially stems from training data imbalance, which we verify on both pretrained models and fine-tuned models, over a wide range of LM model families and sizes.From a theoretical point of view, knowledge overshadowing can be interpreted as over-generalization of the dominant conditions (patterns). We show that the hallucination rate grows with both the imbalance ratio (between the popular and unpopular condition) and the length of dominant condition description, consistent with our derived generalization bound. Finally, we propose to utilize overshadowing conditions as a signal to catch hallucination before it is produced, along with a training-free self-contrastive decoding method to alleviate hallucination during inference. Our proposed approach showcases up to 82% F1 for hallucination anticipation and 11.2% to 39.4% hallucination control, with different models and datasets.",
    "source_database": "arxiv",
    "arxiv_id": "2407.08039v1"
  },
  {
    "title": "WizardCoder: Empowering Code Large Language Models with Evol-Instruct",
    "authors": [
      "Ziyang Luo",
      "Can Xu",
      "Pu Zhao",
      "Qingfeng Sun",
      "Xiubo Geng",
      "Wenxiang Hu",
      "Chongyang Tao",
      "Jing Ma",
      "Qingwei Lin",
      "Daxin Jiang"
    ],
    "year": "2023",
    "journal": "arXiv:2306.08568v2",
    "doi": "",
    "abstract": "Code Large Language Models (Code LLMs), such as StarCoder, have demonstrated exceptional performance in code-related tasks. However, most existing models are solely pre-trained on extensive raw code data without instruction fine-tuning. In this paper, we introduce WizardCoder, which empowers Code LLMs with complex instruction fine-tuning, by adapting the Evol-Instruct method to the domain of code. Through comprehensive experiments on four prominent code generation benchmarks, namely HumanEval, HumanEval+, MBPP, and DS-1000, we unveil the exceptional capabilities of our model. It surpasses all other open-source Code LLMs by a substantial margin. Moreover, our model even outperforms the largest closed LLMs, Anthropic's Claude and Google's Bard, on HumanEval and HumanEval+. Our code, model weights, and data are public at https://github.com/nlpxucan/WizardLM",
    "source_database": "arxiv",
    "arxiv_id": "2306.08568v2"
  },
  {
    "title": "(Im)possibility of Automated Hallucination Detection in Large Language Models",
    "authors": [
      "Amin Karbasi",
      "Omar Montasser",
      "John Sous",
      "Grigoris Velegkas"
    ],
    "year": "2025",
    "journal": "arXiv:2504.17004v2",
    "doi": "",
    "abstract": "Is automated hallucination detection possible? In this work, we introduce a theoretical framework to analyze the feasibility of automatically detecting hallucinations produced by large language models (LLMs). Inspired by the classical Gold-Angluin framework for language identification and its recent adaptation to language generation by Kleinberg and Mullainathan, we investigate whether an algorithm, trained on examples drawn from an unknown target language $K$ (selected from a countable collection) and given access to an LLM, can reliably determine whether the LLM's outputs are correct or constitute hallucinations.   First, we establish an equivalence between hallucination detection and the classical task of language identification. We prove that any hallucination detection method can be converted into a language identification method, and conversely, algorithms solving language identification can be adapted for hallucination detection. Given the inherent difficulty of language identification, this implies that hallucination detection is fundamentally impossible for most language collections if the detector is trained using only correct examples from the target language.   Second, we show that the use of expert-labeled feedback, i.e., training the detector with both positive examples (correct statements) and negative examples (explicitly labeled incorrect statements), dramatically changes this conclusion. Under this enriched training regime, automated hallucination detection becomes possible for all countable language collections.   These results highlight the essential role of expert-labeled examples in training hallucination detectors and provide theoretical support for feedback-based methods, such as reinforcement learning with human feedback (RLHF), which have proven critical for reliable LLM deployment.",
    "source_database": "arxiv",
    "arxiv_id": "2504.17004v2"
  },
  {
    "title": "MedHal: An Evaluation Dataset for Medical Hallucination Detection",
    "authors": [
      "Gaya Mehenni",
      "Fabrice Lamarche",
      "Odette Rios-Ibacache",
      "John Kildea",
      "Amal Zouaq"
    ],
    "year": "2025",
    "journal": "arXiv:2504.08596v2",
    "doi": "",
    "abstract": "We present MedHal, a novel large-scale dataset specifically designed to evaluate if models can detect hallucinations in medical texts. Current hallucination detection methods face significant limitations when applied to specialized domains like medicine, where they can have disastrous consequences. Existing medical datasets are either too small, containing only a few hundred samples, or focus on a single task like Question Answering or Natural Language Inference. MedHal addresses these gaps by: (1) incorporating diverse medical text sources and tasks; (2) providing a substantial volume of annotated samples suitable for training medical hallucination detection models; and (3) including explanations for factual inconsistencies to guide model learning. We demonstrate MedHal's utility by training and evaluating a baseline medical hallucination detection model, showing improvements over general-purpose hallucination detection approaches. This resource enables more efficient evaluation of medical text generation systems while reducing reliance on costly expert review, potentially accelerating the development of medical AI research.",
    "source_database": "arxiv",
    "arxiv_id": "2504.08596v2"
  },
  {
    "title": "Fact Grounded Attention: Eliminating Hallucination in Large Language Models Through Attention Level Knowledge Integration",
    "authors": [
      "Aayush Gupta"
    ],
    "year": "2025",
    "journal": "arXiv:2509.25252v2",
    "doi": "",
    "abstract": "\"The greatest enemy of knowledge is not ignorance, it is the illusion of knowledge.\" Large Language Models have conquered natural language but remain prisoners of their own probabilistic nature--confidently hallucinating facts they never truly knew. We present Fact Grounded Attention (FGA), a novel architectural modification that transforms unreliable language models into deterministic truth tellers by injecting verifiable knowledge directly into the attention mechanism. Unlike existing approaches that patch hallucinations after generation or prepend retrieved text, FGA intervenes at the mathematical heart of the transformer--the pre-softmax attention scores--creating a model that cannot hallucinate when facts exist in its knowledge base. Our experiments across 1,107 technical queries spanning smartphones, laptops, and electric vehicles demonstrate a transformation from 6.3% accuracy in vanilla Llama 3.2 to 99.7% accuracy with FGA. More critically, knowledge updates occur in under one second without retraining, compared to hours for parameter editing approaches. FGA doesn't just reduce hallucination--it eliminates it entirely for verifiable facts, marking a fundamental shift from probabilistic approximation to deterministic precision in neural language generation.",
    "source_database": "arxiv",
    "arxiv_id": "2509.25252v2"
  },
  {
    "title": "Improving Scientific Hypothesis Generation with Knowledge Grounded Large Language Models",
    "authors": [
      "Guangzhi Xiong",
      "Eric Xie",
      "Amir Hassan Shariatmadari",
      "Sikun Guo",
      "Stefan Bekiranov",
      "Aidong Zhang"
    ],
    "year": "2024",
    "journal": "arXiv:2411.02382v1",
    "doi": "",
    "abstract": "Large language models (LLMs) have demonstrated remarkable capabilities in various scientific domains, from natural language processing to complex problem-solving tasks. Their ability to understand and generate human-like text has opened up new possibilities for advancing scientific research, enabling tasks such as data analysis, literature review, and even experimental design. One of the most promising applications of LLMs in this context is hypothesis generation, where they can identify novel research directions by analyzing existing knowledge. However, despite their potential, LLMs are prone to generating ``hallucinations'', outputs that are plausible-sounding but factually incorrect. Such a problem presents significant challenges in scientific fields that demand rigorous accuracy and verifiability, potentially leading to erroneous or misleading conclusions. To overcome these challenges, we propose KG-CoI (Knowledge Grounded Chain of Ideas), a novel system that enhances LLM hypothesis generation by integrating external, structured knowledge from knowledge graphs (KGs). KG-CoI guides LLMs through a structured reasoning process, organizing their output as a chain of ideas (CoI), and includes a KG-supported module for the detection of hallucinations. With experiments on our newly constructed hypothesis generation dataset, we demonstrate that KG-CoI not only improves the accuracy of LLM-generated hypotheses but also reduces the hallucination in their reasoning chains, highlighting its effectiveness in advancing real-world scientific research.",
    "source_database": "arxiv",
    "arxiv_id": "2411.02382v1"
  },
  {
    "title": "Editing Factual Knowledge and Explanatory Ability of Medical Large Language Models",
    "authors": [
      "Derong Xu",
      "Ziheng Zhang",
      "Zhihong Zhu",
      "Zhenxi Lin",
      "Qidong Liu",
      "Xian Wu",
      "Tong Xu",
      "Wanyu Wang",
      "Yuyang Ye",
      "Xiangyu Zhao",
      "Enhong Chen",
      "Yefeng Zheng"
    ],
    "year": "2024",
    "journal": "arXiv:2402.18099v3",
    "doi": "",
    "abstract": "Model editing aims to precisely alter the behaviors of large language models (LLMs) in relation to specific knowledge, while leaving unrelated knowledge intact. This approach has proven effective in addressing issues of hallucination and outdated information in LLMs. However, the potential of using model editing to modify knowledge in the medical field remains largely unexplored, even though resolving hallucination is a pressing need in this area. Our observations indicate that current methods face significant challenges in dealing with specialized and complex knowledge in medical domain. Therefore, we propose MedLaSA, a novel Layer-wise Scalable Adapter strategy for medical model editing. MedLaSA harnesses the strengths of both adding extra parameters and locate-then-edit methods for medical model editing. We utilize causal tracing to identify the association of knowledge in neurons across different layers, and generate a corresponding scale set from the association value for each piece of knowledge. Subsequently, we incorporate scalable adapters into the dense layers of LLMs. These adapters are assigned scaling values based on the corresponding specific knowledge, which allows for the adjustment of the adapter's weight and rank. The more similar the content, the more consistent the scale between them. This ensures precise editing of semantically identical knowledge while avoiding impact on unrelated knowledge. To evaluate the editing impact on the behaviours of LLMs, we propose two model editing studies for medical domain: (1) editing factual knowledge for medical specialization and (2) editing the explanatory ability for complex knowledge. We build two novel medical benchmarking datasets and introduce a series of challenging and comprehensive metrics. Extensive experiments on medical LLMs demonstrate the editing efficiency of MedLaSA, without affecting unrelated knowledge.",
    "source_database": "arxiv",
    "arxiv_id": "2402.18099v3"
  },
  {
    "title": "Neural Probe-Based Hallucination Detection for Large Language Models",
    "authors": [
      "Shize Liang",
      "Hongzhi Wang"
    ],
    "year": "2025",
    "journal": "arXiv:2512.20949v1",
    "doi": "",
    "abstract": "Large language models(LLMs) excel at text generation and knowledge question-answering tasks, but they are prone to generating hallucinated content, severely limiting their application in high-risk domains. Current hallucination detection methods based on uncertainty estimation and external knowledge retrieval suffer from the limitation that they still produce erroneous content at high confidence levels and rely heavily on retrieval efficiency and knowledge coverage. In contrast, probe methods that leverage the model's hidden-layer states offer real-time and lightweight advantages. However, traditional linear probes struggle to capture nonlinear structures in deep semantic spaces.To overcome these limitations, we propose a neural network-based framework for token-level hallucination detection. By freezing language model parameters, we employ lightweight MLP probes to perform nonlinear modeling of high-level hidden states. A multi-objective joint loss function is designed to enhance detection stability and semantic disambiguity. Additionally, we establish a layer position-probe performance response model, using Bayesian optimization to automatically search for optimal probe insertion layers and achieve superior training results.Experimental results on LongFact, HealthBench, and TriviaQA demonstrate that MLP probes significantly outperform state-of-the-art methods in accuracy, recall, and detection capability under low false-positive conditions.",
    "source_database": "arxiv",
    "arxiv_id": "2512.20949v1"
  },
  {
    "title": "MedVH: Towards Systematic Evaluation of Hallucination for Large Vision Language Models in the Medical Context",
    "authors": [
      "Zishan Gu",
      "Changchang Yin",
      "Fenglin Liu",
      "Ping Zhang"
    ],
    "year": "2024",
    "journal": "arXiv:2407.02730v1",
    "doi": "",
    "abstract": "Large Vision Language Models (LVLMs) have recently achieved superior performance in various tasks on natural image and text data, which inspires a large amount of studies for LVLMs fine-tuning and training. Despite their advancements, there has been scant research on the robustness of these models against hallucination when fine-tuned on smaller datasets. In this study, we introduce a new benchmark dataset, the Medical Visual Hallucination Test (MedVH), to evaluate the hallucination of domain-specific LVLMs. MedVH comprises five tasks to evaluate hallucinations in LVLMs within the medical context, which includes tasks for comprehensive understanding of textual and visual input, as well as long textual response generation. Our extensive experiments with both general and medical LVLMs reveal that, although medical LVLMs demonstrate promising performance on standard medical tasks, they are particularly susceptible to hallucinations, often more so than the general models, raising significant concerns about the reliability of these domain-specific models. For medical LVLMs to be truly valuable in real-world applications, they must not only accurately integrate medical knowledge but also maintain robust reasoning abilities to prevent hallucination. Our work paves the way for future evaluations of these studies.",
    "source_database": "arxiv",
    "arxiv_id": "2407.02730v1"
  },
  {
    "title": "Evolutionary Computation in the Era of Large Language Model: Survey and Roadmap",
    "authors": [
      "Xingyu Wu",
      "Sheng-hao Wu",
      "Jibin Wu",
      "Liang Feng",
      "Kay Chen Tan"
    ],
    "year": "2024",
    "journal": "arXiv:2401.10034v3",
    "doi": "",
    "abstract": "Large language models (LLMs) have not only revolutionized natural language processing but also extended their prowess to various domains, marking a significant stride towards artificial general intelligence. The interplay between LLMs and evolutionary algorithms (EAs), despite differing in objectives and methodologies, share a common pursuit of applicability in complex problems. Meanwhile, EA can provide an optimization framework for LLM's further enhancement under black-box settings, empowering LLM with flexible global search capacities. On the other hand, the abundant domain knowledge inherent in LLMs could enable EA to conduct more intelligent searches. Furthermore, the text processing and generative capabilities of LLMs would aid in deploying EAs across a wide range of tasks. Based on these complementary advantages, this paper provides a thorough review and a forward-looking roadmap, categorizing the reciprocal inspiration into two main avenues: LLM-enhanced EA and EA-enhanced LLM. Some integrated synergy methods are further introduced to exemplify the complementarity between LLMs and EAs in diverse scenarios, including code generation, software engineering, neural architecture search, and various generation tasks. As the first comprehensive review focused on the EA research in the era of LLMs, this paper provides a foundational stepping stone for understanding the collaborative potential of LLMs and EAs. The identified challenges and future directions offer guidance for researchers and practitioners to unlock the full potential of this innovative collaboration in propelling advancements in optimization and artificial intelligence. We have created a GitHub repository to index the relevant papers: https://github.com/wuxingyu-ai/LLM4EC.",
    "source_database": "arxiv",
    "arxiv_id": "2401.10034v3"
  },
  {
    "title": "Unified Hallucination Detection for Multimodal Large Language Models",
    "authors": [
      "Xiang Chen",
      "Chenxi Wang",
      "Yida Xue",
      "Ningyu Zhang",
      "Xiaoyan Yang",
      "Qiang Li",
      "Yue Shen",
      "Lei Liang",
      "Jinjie Gu",
      "Huajun Chen"
    ],
    "year": "2024",
    "journal": "arXiv:2402.03190v4",
    "doi": "",
    "abstract": "Despite significant strides in multimodal tasks, Multimodal Large Language Models (MLLMs) are plagued by the critical issue of hallucination. The reliable detection of such hallucinations in MLLMs has, therefore, become a vital aspect of model evaluation and the safeguarding of practical application deployment. Prior research in this domain has been constrained by a narrow focus on singular tasks, an inadequate range of hallucination categories addressed, and a lack of detailed granularity. In response to these challenges, our work expands the investigative horizons of hallucination detection. We present a novel meta-evaluation benchmark, MHaluBench, meticulously crafted to facilitate the evaluation of advancements in hallucination detection methods. Additionally, we unveil a novel unified multimodal hallucination detection framework, UNIHD, which leverages a suite of auxiliary tools to validate the occurrence of hallucinations robustly. We demonstrate the effectiveness of UNIHD through meticulous evaluation and comprehensive analysis. We also provide strategic insights on the application of specific tools for addressing various categories of hallucinations.",
    "source_database": "arxiv",
    "arxiv_id": "2402.03190v4"
  },
  {
    "title": "ANAH-v2: Scaling Analytical Hallucination Annotation of Large Language Models",
    "authors": [
      "Yuzhe Gu",
      "Ziwei Ji",
      "Wenwei Zhang",
      "Chengqi Lyu",
      "Dahua Lin",
      "Kai Chen"
    ],
    "year": "2024",
    "journal": "arXiv:2407.04693v2",
    "doi": "",
    "abstract": "Large language models (LLMs) exhibit hallucinations in long-form question-answering tasks across various domains and wide applications. Current hallucination detection and mitigation datasets are limited in domains and sizes, which struggle to scale due to prohibitive labor costs and insufficient reliability of existing hallucination annotators. To facilitate the scalable oversight of LLM hallucinations, this paper introduces an iterative self-training framework that simultaneously and progressively scales up the hallucination annotation dataset and improves the accuracy of the hallucination annotator. Based on the Expectation Maximization (EM) algorithm, in each iteration, the framework first applies a hallucination annotation pipeline to annotate a scaled dataset and then trains a more accurate hallucination annotator on the dataset. This new hallucination annotator is adopted in the hallucination annotation pipeline used for the next iteration. Extensive experimental results demonstrate that the finally obtained hallucination annotator with only 7B parameters surpasses the performance of GPT-4 and obtains new state-of-the-art hallucination detection results on HaluEval and HalluQA by zero-shot inference. Such an annotator can not only evaluate the hallucination levels of various LLMs on the large-scale dataset but also help to mitigate the hallucination of LLMs generations, with the Natural Language Inference (NLI) metric increasing from 25% to 37% on HaluEval.",
    "source_database": "arxiv",
    "arxiv_id": "2407.04693v2"
  },
  {
    "title": "Jais and Jais-chat: Arabic-Centric Foundation and Instruction-Tuned Open Generative Large Language Models",
    "authors": [
      "Neha Sengupta",
      "Sunil Kumar Sahu",
      "Bokang Jia",
      "Satheesh Katipomu",
      "Haonan Li",
      "Fajri Koto",
      "William Marshall",
      "Gurpreet Gosal",
      "Cynthia Liu",
      "Zhiming Chen",
      "Osama Mohammed Afzal",
      "Samta Kamboj",
      "Onkar Pandit",
      "Rahul Pal",
      "Lalit Pradhan",
      "Zain Muhammad Mujahid",
      "Massa Baali",
      "Xudong Han",
      "Sondos Mahmoud Bsharat",
      "Alham Fikri Aji",
      "Zhiqiang Shen",
      "Zhengzhong Liu",
      "Natalia Vassilieva",
      "Joel Hestness",
      "Andy Hock",
      "Andrew Feldman",
      "Jonathan Lee",
      "Andrew Jackson",
      "Hector Xuguang Ren",
      "Preslav Nakov",
      "Timothy Baldwin",
      "Eric Xing"
    ],
    "year": "2023",
    "journal": "arXiv:2308.16149v2",
    "doi": "",
    "abstract": "We introduce Jais and Jais-chat, new state-of-the-art Arabic-centric foundation and instruction-tuned open generative large language models (LLMs). The models are based on the GPT-3 decoder-only architecture and are pretrained on a mixture of Arabic and English texts, including source code in various programming languages. With 13 billion parameters, they demonstrate better knowledge and reasoning capabilities in Arabic than any existing open Arabic and multilingual models by a sizable margin, based on extensive evaluation. Moreover, the models are competitive in English compared to English-centric open models of similar size, despite being trained on much less English data. We provide a detailed description of the training, the tuning, the safety alignment, and the evaluation of the models. We release two open versions of the model -- the foundation Jais model, and an instruction-tuned Jais-chat variant -- with the aim of promoting research on Arabic LLMs. Available at https://huggingface.co/inception-mbzuai/jais-13b-chat",
    "source_database": "arxiv",
    "arxiv_id": "2308.16149v2"
  }
]