tranSymbolics

g99.py Line by Line

Setup and imports

| var | description | insight # |
|---|---|---|
| `#!/media/krusty/gm/gm120/anaconda3/envs/apy/bin/python` | Shebang for the Python interpreter in the Anaconda env `apy` | |
| `import os,sys,time,socket` | Standard-library imports: file ops, system access, timing, networking | |
| `sys.path.insert(0,"/webroot/lib")` | Prioritize the custom library path `/webroot/lib` | |
| `import plib` | User-defined local module `plib`, assumed to live on the inserted path | |
| `from transformers import AutoTokenizer,AutoModelForCausalLM` | Hugging Face classes for the tokenizer and model | 3 |
| `import torch` | PyTorch for tensor and model operations | |
init(): configuration

| var | description | insight # |
|---|---|---|
| `def init():` | Initializes model, tokenizer, dummy KV cache, and metadata | 1,3,4 |
| `global mod, tok, maxlen, padid, pkv, chat, turns, cp, dev, dtype, modpath` | Expose critical objects and settings globally across the session | |
| `mp={...}` | Model paths for local and remote Gemma variants | |
| `"4b":"/home/krusty/.cache/huggingface/hub/models--google--gemma-3-4b-it/..."` | Local snapshot directory for the 4b variant | |
| `"9b":"google/gemma-3-9b-it"` | Remote hub identifier | |
| `"27b":"google/gemma-3-27b-it"` | Identifier for the higher-capacity model | |
| `dp={"cpu":"cpu","gpu":"cuda"}` | Simple device-name mapping | |
| `tp={"bf":torch.bfloat16,"f16":torch.float16,"f32":torch.float32}` | Abbreviated names mapped to the corresponding torch dtypes | |
| `if socket.gethostname()=="machf":...` | Machine-specific config: pick the default model, device, and dtype by hostname | |
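
Put together, the configuration step looks roughly like this; a sketch, not the script verbatim. The dict names `mp`, `dp`, `tp` and the hostname `machf` come from the table above; the truncated snapshot path is left as-is, and the values picked in each branch are assumptions, since only the `machf` test is shown.

```python
import socket
import torch

mp = {
    "4b": "/home/krusty/.cache/huggingface/hub/models--google--gemma-3-4b-it/...",  # truncated in the source
    "9b": "google/gemma-3-9b-it",
    "27b": "google/gemma-3-27b-it",
}
dp = {"cpu": "cpu", "gpu": "cuda"}
tp = {"bf": torch.bfloat16, "f16": torch.float16, "f32": torch.float32}

if socket.gethostname() == "machf":
    # Only this branch appears in the walkthrough; these picks are assumptions.
    modpath, dev, dtype = mp["4b"], dp["gpu"], tp["bf"]
else:
    # Fallback values are likewise assumed.
    modpath, dev, dtype = mp["9b"], dp["cpu"], tp["f32"]
```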
init(): tokenizer, model, and pad token

| var | description | insight # |
|---|---|---|
| `tok=AutoTokenizer.from_pretrained(modpath)` | Instantiate the tokenizer from the resolved path | 3 |
| `mod=AutoModelForCausalLM.from_pretrained(...)` | Load the model with the chosen dtype and device map | 3 |
| `maxlen=1024` | Maximum token length (fits the model window) | 3 |
| `padid=tok.pad_token_id` | Use the tokenizer's pad ID if one is defined | 1 |
| `if padid is None:` | Fallback check: no pad token defined | 1 |
| `padid=tok.eos_token_id` | Substitute the EOS token as the pad ID | 1 |
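
A minimal sketch of the load step, assuming `modpath`, `dev`, and `dtype` were resolved as above. `torch_dtype` and `device_map` are standard `from_pretrained` keywords; the source elides the exact arguments.

```python
from transformers import AutoTokenizer, AutoModelForCausalLM

tok = AutoTokenizer.from_pretrained(modpath)
mod = AutoModelForCausalLM.from_pretrained(modpath, torch_dtype=dtype, device_map=dev)

maxlen = 1024  # context budget, also the width of the warm-up pass below

# Some tokenizers define no pad token; fall back to EOS so the padded
# dummy input is still made of valid vocabulary ids.
padid = tok.pad_token_id
if padid is None:
    padid = tok.eos_token_id
```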
init(): dummy cache warm-up and session state

| var | description | insight # |
|---|---|---|
| `with torch.no_grad():` | Run the warm-up pass once, without tracking gradients | 4 |
| `rng=(1,maxlen)` | Input shape for the dummy forward | 4 |
| `dummyids=torch.full(...)` | Dummy tensor filled with pad tokens to trigger cache prep | 4 |
| `r=mod(input_ids=dummyids,use_cache=True)` | Single forward pass to populate `past_key_values` | 4 |
| `pkv=r.past_key_values` | Capture the initial KV state from the dummy run | 1,4 |
| `chat=[]` | Empty conversation history | 3 |
| `turns=[...]` | Simulated multi-turn user messages | 3 |
| `cp=0` | Initialize the token cursor at 0 | 2 |
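
The warm-up pass, sketched under the same assumptions: one full-width forward over pad tokens, run once under `no_grad`, purely to obtain an initial `past_key_values`. The contents of `turns` are elided in the source, so a placeholder stands in.

```python
with torch.no_grad():
    rng = (1, maxlen)                                    # batch of 1, full window
    dummyids = torch.full(rng, padid, dtype=torch.long, device=dev)
    r = mod(input_ids=dummyids, use_cache=True)          # one pass, cache enabled
pkv = r.past_key_values                                  # KV state reused across turns

chat = []         # running role/content history
turns = ["..."]   # the script's actual user turns are elided in the source
cp = 0            # cursor: how many prompt tokens the cache already covers
```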
atc(): chat-template encoding

| var | description | insight # |
|---|---|---|
| `def atc():` | Encodes the chat history with template formatting | 3 |
| `r=tok.apply_chat_template(...)` | Produce the prompt via the chat template | 3 |
| `return r.to(dev)` | Move the prompt to the model's device | 3 |
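
A sketch of `atc()`. The source hides the call's arguments behind `...`; `add_generation_prompt=True` and `return_tensors="pt"` are assumptions that match how the returned value is used (a tensor that can be moved to `dev` and sliced).

```python
def atc():
    # Render the running chat through the model's template; the
    # generation prompt makes the model answer rather than continue
    # the user's text (assumed, since the real arguments are elided).
    r = tok.apply_chat_template(chat, add_generation_prompt=True, return_tensors="pt")
    return r.to(dev)
```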
dumoda() and dumodb(): cached forwards

| var | description | insight # |
|---|---|---|
| `def dumoda():` | Model forward over the current `newids` with the cached `pkv` | 4 |
| `return mod(input_ids=newids,...)` | Prefill from the current slice | 4 |
| `def dumodb():` | Model forward for a single-token input | 4 |
| `v=nxt.view(1,1)` | Reshape the single token to batch form | 4 |
| `return mod(input_ids=v,...)` | Feed the next token with cache and position | 4 |
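
A sketch of the two forward helpers. Both read module-level globals (`newids`, `nxt`, `pkv`, `pos`), as the script does. Passing `past_key_values` together with `cache_position` matches the descriptions above; that the cache object is updated in place across calls (so `pkv` is never reassigned) is an assumption about the transformers version in use.

```python
def dumoda():
    # Prefill: feed the whole unseen prompt slice against the cached state.
    return mod(input_ids=newids, past_key_values=pkv, use_cache=True, cache_position=pos)

def dumodb():
    # Decode: feed exactly one token, reshaped to (batch=1, seq=1).
    v = nxt.view(1, 1)
    return mod(input_ids=v, past_key_values=pkv, use_cache=True, cache_position=pos)
```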
Main loop: per-turn prefill

| var | description | insight # |
|---|---|---|
| `init()` | Run setup: tokenizer, model, and dummy cache | 1,4 |
| `for t in turns:` | Iterate over the user turns | 3 |
| `t0=time.time()` | Start timing the response | |
| `chat.append({"role":"user","content":t})` | Add the user message to the history | 3 |
| `promptids=atc()` | Tokenize the full prompt from the current chat | 3 |
| `newids=promptids[:,cp:]` | Slice off the new, unseen portion of the prompt | 2 |
| `newlen=newids.shape[1]` | Count the new tokens | 2 |
| `pos=torch.arange(cp,cp+newlen,...)` | Absolute cache positions for the new tokens | 2 |
| `with torch.no_grad():` | Disable gradient tracking for inference | 4 |
| `out=dumoda()` | Compute logits from the past context plus the new slice | 4 |
| `cp+=newlen` | Advance the cursor by the number of input tokens | 2 |
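
One turn's prefill, continuing the sketches above. Only the tokens added since the last turn, everything past the cursor `cp`, are fed forward; `cache_position` maps them to their absolute offsets in the cached sequence.

```python
import time

t = turns[0]                                     # one user turn from the list
t0 = time.time()
chat.append({"role": "user", "content": t})
promptids = atc()                                # full templated prompt so far
newids = promptids[:, cp:]                       # only the suffix the cache hasn't seen
newlen = newids.shape[1]
pos = torch.arange(cp, cp + newlen, device=dev)  # absolute slots for the new slice
with torch.no_grad():
    out = dumoda()                               # prefill against the cached state
cp += newlen                                     # cursor now covers the whole prompt
```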
outstr=""Start building model reply string3
for i in range(60):Limit response to 60 tokens max4
t=out.logits[:,-1,:]Select last token logits from output4
nxt=torch.argmax(t,dim=-1)Choose most likely token4
tokstr=tok.decode(nxt)Convert token to readable string3
if tokstr=="<end_of_turn>": breakStop generation on end marker4
outstr+=tokstrAppend decoded token to string3
pos=torch.tensor([cp],device=dev)Update single-token cache position2
with torch.no_grad():Safe inference block4
out=dumodb()Single-token model forward4
cp+=1Advance cursor by one token2
chat.append({"role":"model","content":outstr})Append model reply to chat3
print(outstr)Print model response string
print("Response time:",round(time.time()-t0,3),"sec")Show time spent on generating the turn