Create an inference function
Everything is ready to wrap the previously trained model and deploy it.
First, we need to configure the large-file backend (LargeFileS3), the tracing database (MongoDbDriver), and GreatAI itself.
from great_ai.utilities import ConfigFile
from great_ai.large_file import LargeFileS3
from great_ai import configure, MongoDbDriver

configuration = ConfigFile("config.ini")

LargeFileS3.configure_credentials_from_file(configuration)
MongoDbDriver.configure_credentials_from_file(configuration)

configure(
    dashboard_table_size=100,  # traces are small, we can show many
    prediction_cache_size=4096,  # predictions are expensive, cache them
)
The value of `ENVIRONMENT` contains the `ENV` prefix but `ENVIRONMENT` is not defined as an environment variable, using the default value defined above (`DEVELOPMENT`)
Environment variable ENVIRONMENT is not set, defaulting to development mode ‼️
MongoDbDriver has been already configured: skipping initialisation
LargeFileS3 has been already configured: skipping initialisation
GreatAI (v0.1.6): configured ✅
  🔩 tracing_database: MongoDbDriver
  🔩 large_file_implementation: LargeFileS3
  🔩 is_production: False
  🔩 should_log_exception_stack: True
  🔩 prediction_cache_size: 4096
  🔩 dashboard_table_size: 100
You still need to check whether you follow all best practices before trusting your deployment.
> Find out more at https://se-ml.github.io/practices
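As the log output shows, GreatAI defaults to development mode because the ENVIRONMENT variable is not set. A minimal sketch of opting into production mode instead: both the idea that exporting the variable before configure() runs is enough and the exact accepted value ("production") are assumptions here, so check the GreatAI reference before relying on them.

import os

# Assumption: this flips is_production to True when configure() is called.
os.environ["ENVIRONMENT"] = "production"

configure(
    dashboard_table_size=100,
    prediction_cache_size=4096,
)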
For a pleasant developer experience, we create some typed models that will show up in the automatically generated OpenAPI schema specification and will also provide runtime type validation.
from typing import List

from pydantic import BaseModel


class Attention(BaseModel):
    weight: float
    token: str


class EvaluatedSentence(BaseModel):
    score: float
    text: str
    explanation: List[Attention]
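To see the runtime validation in action (a small sketch, not part of the original notebook): pydantic coerces compatible inputs and rejects invalid ones when the models are instantiated.

from pydantic import ValidationError

attention = Attention(weight="0.25", token="transformer")  # the string is coerced to a float
print(attention.weight)  # 0.25

try:
    Attention(weight="not-a-number", token="transformer")
except ValidationError as error:
    print(error)  # reports that weight is not a valid float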
Even though @use_model caches the remote files locally and also handles deserialising objects, here we only use it to store a directory. In that case, it gives back a path: the path to that directory. So, we need to load the files from that folder ourselves. To load them only once per process, we create a small model-loader helper function.
This is usually not needed, but when we can outsmart dill, we do it for optimisation purposes.
from pathlib import Path
from typing import Optional, Tuple

from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
)

from great_ai import use_model

_tokenizer: Optional[PreTrainedTokenizer] = None
_loaded_model: Optional[PreTrainedModel] = None
@use_model("scibert-highlights", version="latest", model_kwarg_name="model_path")
def get_tokenizer_and_model(
model_path: Path, original_model: str = "allenai/scibert_scivocab_uncased"
) -> Tuple[PreTrainedTokenizer, PreTrainedModel]:
global _tokenizer, _loaded_model
if _tokenizer is None:
_tokenizer = AutoTokenizer.from_pretrained(original_model)
if _loaded_model is None:
config = AutoConfig.from_pretrained(
model_path, output_hidden_states=True, output_attentions=True
)
_loaded_model = AutoModelForSequenceClassification.from_pretrained(
model_path, config=config
)
return _tokenizer, _loaded_model
Latest version of scibert-highlights is 0 (from versions: 0)
File scibert-highlights-0 found in cache
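As a quick sanity check (a sketch, not in the original notebook): repeated calls in the same process hand back the very same objects, so the expensive loading only happens once. The model_path argument is injected by the decorator, which is why no arguments are passed.

first_tokenizer, first_model = get_tokenizer_and_model()
second_tokenizer, second_model = get_tokenizer_and_model()

# The module-level globals are reused on the second call.
assert first_tokenizer is second_tokenizer
assert first_model is second_model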
Finally, implement the inference function.
import re

import numpy as np
import torch
from transformers.modeling_outputs import SequenceClassifierOutput

from great_ai import GreatAI
from great_ai.utilities import clean


@GreatAI.create
def find_highlights(sentence: str) -> EvaluatedSentence:
    """Get the interestingness prediction of the input sentence using SciBERT.

    Run the SciBERT model in inference mode and evaluate the sentence.
    Additionally, provide an explanation in the form of the last layer's summed
    attention between `[CLS]` and the other tokens.
    """
    tokenizer, loaded_model = get_tokenizer_and_model()

    sentence = clean(sentence, convert_to_ascii=True, remove_brackets=True)
    tensors = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=512)

    with torch.inference_mode():
        result: SequenceClassifierOutput = loaded_model(**tensors)

    positive_likelihood = torch.nn.Softmax(dim=1)(result.logits)[0][1]

    tokens = tensors["input_ids"][0]
    attentions = np.sum(result.attentions[-1].numpy()[0], axis=0)[0][1:-1]
    # result.attentions is a tuple of `torch.FloatTensor` (one for each layer) of shape
    # `(batch_size, num_heads, sequence_length, sequence_length)`.

    explanation = []
    token_attentions = list(zip(attentions, tokens[1:-1]))
    for token in re.split(r"([ .,])", sentence):
        token = token.strip()
        if not token:
            continue

        bert_tokens = tokenizer(
            token, return_tensors="pt", truncation=True, max_length=512
        )["input_ids"][0][
            1:-1
        ]  # truncation=True needed to fix `RuntimeError: Already borrowed`

        weight = 0
        for t1 in bert_tokens:
            if not token_attentions:
                break
            a, t2 = token_attentions.pop(0)
            assert t1 == t2, sentence
            weight += a

        explanation.append(
            Attention(
                token=token if token in ".," else " " + token, weight=round(weight, 4)
            )
        )

        if not token_attentions:
            break

    return EvaluatedSentence(
        score=positive_likelihood, text=sentence, explanation=explanation
    )
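To make the attention indexing above easier to follow, here is an illustrative sketch with made-up shapes: the last layer's attention has shape (batch_size, num_heads, sequence_length, sequence_length); summing over the heads and taking row 0 gives how strongly [CLS] attends to each position, and the [1:-1] slice drops the [CLS] and [SEP] positions themselves.

import numpy as np

# Made-up example: 1 sentence, 12 heads, 6 tokens ([CLS] + 4 real tokens + [SEP]).
last_layer_attention = np.random.rand(1, 12, 6, 6)

summed_heads = np.sum(last_layer_attention[0], axis=0)  # (6, 6): all heads summed together
cls_row = summed_heads[0]  # attention from [CLS] to every position
token_weights = cls_row[1:-1]  # drop the [CLS] and [SEP] positions

print(token_weights.shape)  # (4,): one weight per real token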
A simple test to see that everything works. Note that the models list is filled in by the @use_model call even though it does not decorate the main inference function.
if __name__ == "__main__":
    find_highlights(
        "Our solution has outperformed the state-of-the-art."
    ), find_highlights("Their solution did not perform well.")
(Trace[EvaluatedSentence]({
    'created': '2022-07-16T18:47:29.581701',
    'exception': None,
    'feedback': None,
    'logged_values': {
        'arg:sentence:length': 51,
        'arg:sentence:value': 'Our solution has outperformed the state-of-the-art.'},
    'models': [{'key': 'scibert-highlights', 'version': 0}],
    'original_execution_time_ms': 7127.2063,
    'output': {
        'explanation': [
            {'token': ' Our', 'weight': 0.3993},
            {'token': ' solution', 'weight': 0.3481},
            {'token': ' has', 'weight': 0.2945},
            {'token': ' outperformed', 'weight': 0.4011},
            {'token': ' the', 'weight': 0.1484},
            {'token': ' state-of-the-art', 'weight': 0.5727},
            {'token': '.', 'weight': 7.775}],
        'score': 0.9991180300712585,
        'text': 'Our solution has outperformed the state-of-the-art.'},
    'tags': ['find_highlights', 'online', 'development'],
    'trace_id': '56e20e94-79df-4793-ae61-d20820ebe2d3'}),
 Trace[EvaluatedSentence]({
    'created': '2022-07-16T18:47:37.020275',
    'exception': None,
    'feedback': None,
    'logged_values': {
        'arg:sentence:length': 36,
        'arg:sentence:value': 'Their solution did not perform well.'},
    'models': [{'key': 'scibert-highlights', 'version': 0}],
    'original_execution_time_ms': 170.7057,
    'output': {
        'explanation': [
            {'token': ' Their', 'weight': 1.1475},
            {'token': ' solution', 'weight': 0.8205},
            {'token': ' did', 'weight': 0.3254},
            {'token': ' not', 'weight': 0.2921},
            {'token': ' perform', 'weight': 0.4293},
            {'token': ' well', 'weight': 0.2772},
            {'token': '.', 'weight': 4.4723}],
        'score': 0.12305451184511185,
        'text': 'Their solution did not perform well.'},
    'tags': ['find_highlights', 'online', 'development'],
    'trace_id': '7fcf8271-1738-4025-8305-d5a1e5100aea'}))
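The calls return Trace objects wrapping the EvaluatedSentence. Based purely on the printed structure above, the wrapped result could be accessed roughly like this; the attribute names, and whether output is the model instance or a plain dictionary, are assumptions, so consult the GreatAI reference if they differ.

trace = find_highlights("Our solution has outperformed the state-of-the-art.")

# Assumed attribute access, mirroring the printed fields above.
print(trace.output)    # the EvaluatedSentence (or its dictionary representation)
print(trace.trace_id)  # identifier for looking the prediction up later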
In this case, the service is built as a Docker image, pushed to our image registry, and a subsequent rolling update is performed in the production cluster. To check out the Dockerfile, go to the additional files page.