from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import LlamaCpp

import constants

# Streams tokens to stdout as they are generated (only used if attached below).
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])


def load_model():
    return LlamaCpp(
        model_path=constants.MODEL_SAVE_PATH,
        temperature=constants.TEMPERATURE,
        max_tokens=constants.MAX_TOKENS,
        top_p=constants.TOP_P,
        # callback_manager=callback_manager,  # streams to stdout, but the output won't attach to a variable
        verbose=False,  # set verbose=True if re-enabling the callback manager above
        n_gpu_layers=constants.N_GPU_LAYERS,
        n_batch=constants.N_BATCH,
        n_ctx=constants.N_CTX,
        repeat_penalty=constants.REPEAT_PENALTY,
        streaming=False,
    )
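

# A minimal, hypothetical usage sketch (not part of the original module):
# it assumes constants.py defines the settings referenced above and that a
# GGUF model file exists at constants.MODEL_SAVE_PATH.
if __name__ == "__main__":
    llm = load_model()
    # invoke() runs a single blocking completion, since streaming=False.
    print(llm.invoke("Q: What is the capital of France? A:"))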