Skip to content

Commit dde9eee

Browse files
authored
Merge pull request #60 from VinciGit00/huggingface_integration
add huggingface integration (embeddings, models ...)
2 parents 7d521ef + dc149e6 commit dde9eee

4 files changed

Lines changed: 42 additions & 18 deletions

File tree

scrapegraphai/graphs/abstract_graph.py

Lines changed: 11 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
1-
"""
1+
"""
22
Module having abstract class for creating all the graphs
33
"""
44
from abc import ABC, abstractmethod
55
from typing import Optional
6-
from ..models import OpenAI, Gemini, Ollama, AzureOpenAI
6+
from ..models import OpenAI, Gemini, Ollama, AzureOpenAI, HuggingFace
77
from ..helpers import models_tokens
88

99
class AbstractGraph(ABC):
@@ -48,7 +48,7 @@ def _create_llm(self, llm_config: dict):
4848
# take the model after the last dash
4949
llm_params["model"] = llm_params["model"].split("/")[-1]
5050
try:
51-
self.model_token = models_tokens["openai"][llm_params["model"]]
51+
self.model_token = models_tokens["azure"][llm_params["model"]]
5252
except KeyError:
5353
raise ValueError("Model not supported")
5454
return AzureOpenAI(llm_params)
@@ -61,14 +61,6 @@ def _create_llm(self, llm_config: dict):
6161
return Gemini(llm_params)
6262

6363
elif "ollama" in llm_params["model"]:
64-
"""
65-
Avaiable models:
66-
- llama2
67-
- mistral
68-
- codellama
69-
- dolphin-mixtral
70-
- mistral-openorca
71-
"""
7264
llm_params["model"] = llm_params["model"].split("/")[-1]
7365

7466
# allow user to set model_tokens in config
@@ -81,9 +73,15 @@ def _create_llm(self, llm_config: dict):
8173
raise ValueError("Model not supported")
8274

8375
return Ollama(llm_params)
84-
76+
elif "hugging_face" in llm_params["model"]:
77+
try:
78+
self.model_token = models_tokens["hugging_face"][llm_params["model"]]
79+
except KeyError:
80+
raise ValueError("Model not supported")
81+
return HuggingFace(llm_params)
8582
else:
86-
raise ValueError("Model not supported")
83+
raise ValueError(
84+
"Model provided by the configuration not supported")
8785

8886
def get_execution_info(self):
8987
"""

scrapegraphai/models/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@
88
from .openai_tts import OpenAITextToSpeech
99
from .gemini import Gemini
1010
from .ollama import Ollama
11+
from .hugging_face import HuggingFace
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
"""
2+
Module for implementing the hugginface class
3+
"""
4+
from langchain_community.chat_models.huggingface import ChatHuggingFace
5+
6+
7+
class HuggingFace(ChatHuggingFace):
8+
"""Provides a convenient wrapper for interacting with Hugging Face language models
9+
designed for conversational AI applications.
10+
11+
Args:
12+
llm_config (dict): A configuration dictionary containing:
13+
* api_key (str, optional): Your Hugging Face API key.
14+
* model_name (str): The name of the Hugging Face LLM to load.
15+
* tokenizer_name (str, optional): Name of the corresponding tokenizer.
16+
* device (str, optional): Device for running the model ('cpu' by default).
17+
18+
"""
19+
20+
def __init__(self, llm_config: dict):
21+
"""Initializes the HuggingFace chat model wrapper"""
22+
super().__init__(**llm_config)

scrapegraphai/nodes/rag_node.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,10 @@
77
from langchain.retrievers import ContextualCompressionRetriever
88
from langchain.retrievers.document_compressors import EmbeddingsFilter, DocumentCompressorPipeline
99
from langchain_community.document_transformers import EmbeddingsRedundantFilter
10+
from langchain_community.embeddings import HuggingFaceHubEmbeddings
1011
from langchain_community.vectorstores import FAISS
1112
from langchain_openai import OpenAIEmbeddings, AzureOpenAIEmbeddings
12-
from ..models import OpenAI, Ollama, AzureOpenAI
13+
from ..models import OpenAI, Ollama, AzureOpenAI, HuggingFace
1314
from langchain_community.embeddings import OllamaEmbeddings
1415
from .base_node import BaseNode
1516

@@ -26,11 +27,11 @@ class RAGNode(BaseNode):
2627
node_type (str): The type of the node, set to "node" indicating a standard operational node.
2728
2829
Args:
29-
node_name (str, optional): The unique identifier name for the node.
30+
node_name (str, optional): The unique identifier name for the node.
3031
Defaults to "ParseHTMLNode".
3132
3233
Methods:
33-
execute(state): Parses the HTML document contained within the state using
34+
execute(state): Parses the HTML document contained within the state using
3435
the specified tags, if provided, and updates the state with the parsed content.
3536
"""
3637

@@ -44,7 +45,7 @@ def __init__(self, input: str, output: List[str], node_config: dict, node_name:
4445

4546
def execute(self, state):
4647
"""
47-
Executes the node's logic to implement RAG (Retrieval-Augmented Generation)
48+
Executes the node's logic to implement RAG (Retrieval-Augmented Generation)
4849
The method updates the state with relevant chunks of the document.
4950
5051
Args:
@@ -54,7 +55,7 @@ def execute(self, state):
5455
dict: The updated state containing the 'relevant_chunks' key with the relevant chunks.
5556
5657
Raises:
57-
KeyError: If 'document' is not found in the state, indicating that the necessary
58+
KeyError: If 'document' is not found in the state, indicating that the necessary
5859
information for parsing is missing.
5960
"""
6061

@@ -92,6 +93,8 @@ def execute(self, state):
9293
embeddings = AzureOpenAIEmbeddings()
9394
elif isinstance(embedding_model, Ollama):
9495
embeddings = OllamaEmbeddings(model=embedding_model.model)
96+
elif isinstance(embedding_model, HuggingFace):
97+
embeddings = HuggingFaceHubEmbeddings(model=embedding_model.model)
9598
else:
9699
raise ValueError("Embedding Model missing or not supported")
97100

0 commit comments

Comments
 (0)