From 13ebd9028f46c0539d32d8fe2b7dfdd080a30ebc Mon Sep 17 00:00:00 2001
From: cliu-us
Date: Tue, 29 Apr 2025 19:34:39 +0000
Subject: [PATCH 1/3] rot impl

Signed-off-by: cliu-us
---
 fms_mo/quant/quantizers.py    |  25 +++++
 fms_mo/quant/rotation.py      |  79 ++++++++++++++++
 fms_mo/utils/hadamard_util.py | 171 ++++++++++++++++++++++++++++++++++
 fms_mo/utils/hadk.safetensors | Bin 0 -> 391424 bytes
 4 files changed, 275 insertions(+)
 create mode 100644 fms_mo/quant/rotation.py
 create mode 100644 fms_mo/utils/hadamard_util.py
 create mode 100644 fms_mo/utils/hadk.safetensors

diff --git a/fms_mo/quant/quantizers.py b/fms_mo/quant/quantizers.py
index c97dbfa8..abd24389 100644
--- a/fms_mo/quant/quantizers.py
+++ b/fms_mo/quant/quantizers.py
@@ -40,6 +40,9 @@ import torch.nn as nn  # pylint: disable=consider-using-from-import
 import torch.nn.functional as F
 
+# Local
+from fms_mo.quant.rotation import RotQuantWrapper
+
 logger = logging.getLogger(__name__)
 
@@ -66,8 +69,16 @@ def get_activation_quantizer(
     - pact/pact+/pactsym
     - sawb/sawb+
     - max
+
+    If qa_mode has a "rot_" prefix or "_rot" suffix, wrap the quantizer with RotQuantWrapper();
+    remember to set up the R_left and R_right tensors later.
     """
 
+    use_rot = False
+    if "rot_" in qa_mode or "_rot" in qa_mode:
+        use_rot = True
+        qa_mode.replace("rot_", "").replace("_rot", "")
+
     if not use_swcap:
         QPACTLUT = {
             "pact_uni": PACT,
@@ -220,6 +231,9 @@ def get_activation_quantizer(
             f"activation quantization mode {qa_mode} is incompatible with swcap"
         )
 
+    if use_rot:
+        act_quantizer = RotQuantWrapper(act_quantizer)
+
     return act_quantizer
 
@@ -245,7 +259,15 @@ def get_weight_quantizer(
     SWCAP quantizers:
     - sawb/sawb+
     - max
+
+    If qw_mode has a "rot_" prefix or "_rot" suffix, wrap the quantizer with RotQuantWrapper();
+    remember to set up the R_left and R_right tensors later.
     """
+
+    use_rot = False
+    if "rot_" in qw_mode or "_rot" in qw_mode:
+        use_rot = True
+        qw_mode.replace("rot_", "").replace("_rot", "")
+
     weight_quantizer = None
     if not use_swcap:
         cggrad = "cgpact" in qw_mode
@@ -367,6 +389,9 @@ def get_weight_quantizer(
             f"activation quantized mode {qw_mode} is incompatible with swcap"
         )
 
+    if use_rot:
+        weight_quantizer = RotQuantWrapper(weight_quantizer)
+
     return weight_quantizer
 
diff --git a/fms_mo/quant/rotation.py b/fms_mo/quant/rotation.py
new file mode 100644
index 00000000..db18d73b
--- /dev/null
+++ b/fms_mo/quant/rotation.py
@@ -0,0 +1,79 @@
+# Copyright The FMS Model Optimizer Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Util functions related to Hadamard rotation."""
+
+# Third Party
+import torch
+
+# Local
+from fms_mo.utils.hadamard_util import matmul_hadU_cuda
+
+
+class RotQuantWrapper(torch.nn.Module):
+    """Add a wrapper to fms-mo quantizers. Objects of this class could have two rotation tensors,
+    and the basic formula is:
+
+        self.quantizer(self.rot_left @ input_tensor @ self.rot_right)
+
+    NOTE rot_xxx could be optional, depending on whether it's for weights or activations.
+    For example, in SpinQuant QKV Linears will look like (pseudo-code, "self" is not referring
+    to the same objects here):
+        qx = self.quantize_feature(x)  # no rotation, just a normal quantizer
+        qw_q = self.quantize_weight(self.weight, R1_t)  # need left rotation only
+        qw_k = self.quantize_weight(self.weight, R1_t)
+        qw_v = self.quantize_weight(self.weight, R1_t, R2)  # need both left and right rotation
+
+        return F.linear(qx, qw, bias)
+
+    for MLP down_proj
+        qx = self.quantize_feature(x, None, R4)  # for activation, should be x @ R
+        qw = self.quantize_weight(self.weight, R4_t, R1)
+
+        return F.linear(qx, qw, bias)
+
+    Also need to make sure self.R is pointing to a nn.Parameter() if training on R is needed.
+    """
+
+    def __init__(self, quantizer, *args, **kwargs):
+        self.online_full_had = kwargs.pop("online_full_had", None)
+        self.f32_had = kwargs.pop("f32_had", None)
+        super().__init__(*args, **kwargs)
+        self.quantizer = quantizer
+        self.R_left = None
+        self.R_right = None
+        self.K_left = None  # if K_xxx > 1, R_xxx is a special had matrix
+        self.K_right = None
+
+    def forward(self, input_tensor):
+        org_dtype = input_tensor.dtype
+
+        if self.online_full_had:
+            # online hadamard => rotation for activation. should be input_tensor @ R_right
+            # cannot be fused into W and no training, either.
+            if self.fp32_had:
+                input_tensor = input_tensor.float()
+            input_tensor = matmul_hadU_cuda(
+                input_tensor, self.R_right, self.K_right
+            ).to(org_dtype)
+
+            return input_tensor
+
+        # not online => rotation for weights, could be fused into W later.
+        if self.R_left:
+            input_tensor = self.R_left @ inp_tensor
+        if self.R_right:
+            inp_tensor = inp_tensor @ self.R_right
+
+        return inp_tensor
diff --git a/fms_mo/utils/hadamard_util.py b/fms_mo/utils/hadamard_util.py
new file mode 100644
index 00000000..99f51f33
--- /dev/null
+++ b/fms_mo/utils/hadamard_util.py
@@ -0,0 +1,171 @@
+# Copyright The FMS Model Optimizer Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This code is based on QuaRot (https://github.com/spcl/QuaRot/tree/main/quarot).
+# Licensed under Apache License 2.0.
+# Adapted from https://github.com/Cornell-RelaxML/quip-sharp/blob/main/lib/utils/matmul_had.py
+# and https://github.com/facebookresearch/SpinQuant/blob/main/utils/hadamard_utils.py
+"""
+Changed the original "text tensor implementation" into binaries for better efficiency. Only has 12
+sizes available in the safetensors file. [12, 20, 28, 36, 40, 44, 52, 60, 108, 140, 156, 172]
+"""
+
+# Third Party
+from fast_hadamard_transform import hadamard_transform
+from safetensors import safe_open
+import torch
+
+
+class HadamardTransform(torch.autograd.Function):
+    """The unnormalized Hadamard transform (i.e. without dividing by sqrt(2))"""
+
+    # TODO seems redundant, inside hadamard_transform(), backward is already handled...?
+    @staticmethod
+    def forward(ctx, u):
+        return hadamard_transform(u)
+
+    @staticmethod
+    def backward(ctx, grad):
+        return hadamard_transform(grad)
+
+
+def get_hadK(n, transpose=False):
+    """Simplify the implementation and use binary tensors instead of text implementation."""
+    hadK = None
+    for K in [172, 156, 140, 108, 60, 52, 44, 40, 36, 28, 20, 12]:
+        if n % K == 0 and is_pow2(n // K):
+            with safe_open("hadk.safetensors", framework="pt") as f:
+                assert (
+                    str(K) in f.keys()
+                ), f"Special size Hadamard {K} does not exist in the file."
+                hadK = f.get_tensor(str(K))
+
+            if transpose:
+                hadK = hadK.T
+
+            break
+
+    if hadK is None:
+        if is_pow2(n):
+            K = 1
+        else:
+            raise RuntimeError(
+                f"{n} is not power of 2 or does not have a special size Hadamard available."
+            )
+
+    return hadK, K
+
+
+def matmul_hadU(X, transpose=False):
+    n = X.shape[-1]
+    hadK, K = get_hadK(n, transpose)
+    input = X.clone().view(-1, n, 1)
+    output = input.clone()
+    while input.shape[1] > K:
+        input = input.view(input.shape[0], input.shape[1] // 2, 2, input.shape[2])
+        output = output.view(input.shape)
+        output[:, :, 0, :] = input[:, :, 0, :] + input[:, :, 1, :]
+        output[:, :, 1, :] = input[:, :, 0, :] - input[:, :, 1, :]
+        output = output.view(input.shape[0], input.shape[1], -1)
+        (input, output) = (output, input)
+    del output
+
+    if K > 1:
+        # Do not explicitly repeat - OOM
+        # input = torch.bmm(
+        #     hadK.repeat(len(input), 1, 1).to(input.device).to(input.dtype), input)
+        # Use bcast instead
+        input = hadK.view(1, K, K).to(input) @ input
+
+    return input.view(X.shape) / torch.tensor(n).sqrt()
+
+
+def matmul_hadUt(X):
+    return matmul_hadU(X, transpose=True)
+
+
+def random_hadamard_matrix(size, device):
+    # See https://cornell-relaxml.github.io/quip-sharp/ , Section "Randomized Hadamard Transformation"
+    Q = torch.randint(low=0, high=2, size=(size,)).to(torch.float64)
+    Q = Q * 2 - 1
+    Q = torch.diag(Q)
+    return matmul_hadU(Q).to(device)
+
+
+def hadamard_matrix(size, device):
+    # See https://cornell-relaxml.github.io/quip-sharp/ , Section "Randomized Hadamard Transformation"
+    Q = torch.eye(size)
+    return matmul_hadU(Q).to(device)
+
+
+def matmul_hadU_cuda(X, hadK, K, transpose=False):
+    n = X.shape[-1]
+    if K == 1:
+        return HadamardTransform.apply(X.contiguous()) / torch.tensor(n).sqrt()
+    if transpose:
+        hadK = hadK.T.contiguous()
+    input = X.view(-1, K, n // K)
+    input = HadamardTransform.apply(input.contiguous()) / torch.tensor(n).sqrt()
+    input = hadK.to(input.device).to(input.dtype) @ input
+    return input.reshape(X.shape)
+
+
+def matmul_hadUt_cuda(X, hadK, K):
+    return matmul_hadU_cuda(X, hadK, K, transpose=True)
+
+
+def apply_exact_had_to_linear(module, had_dim=-1, output=False, R2=None):
+    assert isinstance(module, torch.nn.Linear)
+    in_features, out_features = module.in_features, module.out_features
+
+    if had_dim != -1:
+        assert is_pow2(had_dim), "Hadamard dimension must be a power of 2!"
+
+    W_ = module.weight.data
+    dtype = W_.dtype
+    dev = W_.device
+    init_shape = W_.shape
+    W_ = W_.float().cuda()
+
+    if had_dim == -1:
+        if output:
+            had_K, K = get_hadK(out_features)
+            W_ = matmul_hadU_cuda(W_.t(), had_K, K).t()
+        if not output:
+            had_K, K = get_hadK(in_features)
+            W_ = matmul_hadU_cuda(W_, had_K, K)
+    else:
+        hadK = hadamard_matrix(had_dim, "cuda").to(torch.float64)
+        if R2 is not None:
+            hadK = R2.to(torch.float64)
+        if output:
+            W_ = W_.t()
+            transposed_shape = W_.shape
+            temp = W_.reshape(-1, transposed_shape[-1] // had_dim, had_dim)
+            temp = temp.to(torch.float64) @ hadK
+            W_ = temp.reshape(transposed_shape).t()
+        else:
+            init_shape = W_.shape
+            temp = W_.reshape(-1, init_shape[-1] // had_dim, had_dim)
+            temp = temp.to(torch.float64) @ hadK
+            W_ = temp.reshape(init_shape)
+    module.weight.data = W_.to(device=dev, dtype=dtype)
+
+
+def is_pow2(n):
+    return (n & (n - 1) == 0) and (n > 0)
+
+
+# Hadamard matrices for had12, had36.pal2, had52.will,
+# had60.pal, had108.pal, had140.pal, had156.will, had172.will:
+# http://www.neilsloane.com/hadamard/index.html
diff --git a/fms_mo/utils/hadk.safetensors b/fms_mo/utils/hadk.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..399a8fa1d3d4d2dd479751d98fc81b89c26d631a
GIT binary patch
literal 391424
[~391 KB of base85-encoded binary data omitted; the safetensors file stores the 12
special-size Hadamard matrices (12, 20, 28, 36, 40, 44, 52, 60, 108, 140, 156, 172)
referenced by fms_mo/utils/hadamard_util.py]
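A note on the Hadamard utilities introduced by this patch: get_hadK() factors n as K * 2^m, where K is either 1 or one of the 12 special sizes stored in hadk.safetensors, and matmul_hadU() applies the power-of-2 part with an add/subtract butterfly before finishing with a single small K x K matmul. The following standalone sketch shows just the power-of-2 butterfly; all names are illustrative placeholders, not fms-mo API.

    # Minimal sketch of the add/subtract butterfly behind matmul_hadU(); illustrative
    # only, not part of the patch series.
    import math

    import torch


    def butterfly_hadamard(x: torch.Tensor) -> torch.Tensor:
        """Normalized Walsh-Hadamard transform along the last dim (power-of-2 length)."""
        n = x.shape[-1]
        assert n > 0 and n & (n - 1) == 0, "length must be a power of 2"
        out = x.clone()
        h = 1
        while h < n:
            # one add/subtract pass per level, O(n log n) total instead of an n x n matmul
            out = out.view(-1, n // (2 * h), 2, h)
            a = out[:, :, 0, :].clone()
            b = out[:, :, 1, :].clone()
            out[:, :, 0, :] = a + b
            out[:, :, 1, :] = a - b
            out = out.reshape(-1, n)
            h *= 2
        return out / math.sqrt(n)


    H = butterfly_hadamard(torch.eye(16))  # rows form the normalized Hadamard matrix
    assert torch.allclose(H @ H.t(), torch.eye(16), atol=1e-6)  # orthogonal, so H^-1 = H^T
    x = torch.randn(3, 16)
    assert torch.allclose(butterfly_hadamard(x), x @ H, atol=1e-5)

The orthogonality checked above is what makes the rotations in RotQuantWrapper lossless at the model level: H is inverted exactly by its transpose.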
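Before the second patch, it may also help to spell out the computational invariance that RotQuantWrapper relies on: for any orthogonal rotation R, y = x @ W.T == (x @ R) @ (W @ R).T, so R can be absorbed into the stored weight offline while the activation quantizer sees the better-conditioned x @ R. A minimal sketch under that assumption (illustrative names, not fms-mo API; any orthogonal matrix works, a Hadamard one is simply cheap to apply):

    # Rotation invariance of a Linear layer; illustrative only, not part of the patch.
    import torch

    torch.manual_seed(0)
    lin = torch.nn.Linear(8, 4, bias=False)
    x = torch.randn(2, 8)

    rot, _ = torch.linalg.qr(torch.randn(8, 8))  # a random orthogonal matrix

    y_ref = lin(x)
    x_rot = x @ rot             # what the activation quantizer would see
    w_rot = lin.weight @ rot    # rot^-1 = rot^T absorbed into the (out, in) weight
    y_rot = x_rot @ w_rot.t()

    assert torch.allclose(y_ref, y_rot, atol=1e-5)

Quantizing x_rot and w_rot instead of x and W is the whole point: the rotation spreads activation outliers across channels, which typically lowers the abs-max scales the quantizers must cover.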
zXRlY!`&+8KYFD){!^6wSvi0oj@SE*WuN(U^J)ON?J@0R+@~U0cz6=j9Bg@vav%_z; zL%nY7%k*^idiA8!+o|xquMXaAck9`~P_J2!dM9t}%k*^i`pxJ4-S&FB>Q@K<+Vzb) z)i=$0)I0NUA(!c&*^zJEvc9A1eRc4!UEjD@8dwMr=^ozH1gTim!vhLQO%QyO7@4MzRZ|&q{Sye7O@2JNdS}9ld97R^fV-EL(4ndTiFiJUV>!%d_*`m{Hh` zw{z27nY`?If7gzzui`g9Vff~+E!<=8!SpKZ#_8><@LRX6yY=VtjlS3WuKCPcJGck) zV0smH#fcJ}agX4!fB@^mI9_G>E zt9Z}Oo-EU=C*P5&cGa&A{y~x5{#?G% z_j=zopS+cq>FMnC>iJj4nX7hH`!YPdj4WHv&JMrX4)u1knH@|{mg&`V*=^?r{p#Rf zyS{O!`leZrdgt{wEz>`!wo8hnYHXg`sGi$o4>s+^Nmb} z?|pUfZo6C04u*Qo`YOKpB;Wk*%_w`WOmA2H>fm3yzHz7ern#P;?3t@Kzk4&vo-5Pa rRlhp;*RF5eslI8}SMkj!`Q~?TM%i;^db{dZ2mjjjjXTvh|1 Date: Tue, 13 May 2025 19:43:19 +0000 Subject: [PATCH 2/3] enable hadamard rotation, also fix a minor Qbmm bug Signed-off-by: cliu-us --- fms_mo/modules/bmm.py | 11 +- fms_mo/modules/linear.py | 11 +- fms_mo/prep.py | 8 +- fms_mo/quant/quantizers.py | 76 ++++++++++---- fms_mo/quant/rotation.py | 190 +++++++++++++++++++++++++++------- fms_mo/utils/hadamard_util.py | 74 +++++++------ fms_mo/utils/qconfig_utils.py | 62 +++++------ fms_mo/utils/utils.py | 44 +++++--- 8 files changed, 332 insertions(+), 144 deletions(-) diff --git a/fms_mo/modules/bmm.py b/fms_mo/modules/bmm.py index aa16deff..b50cc2aa 100644 --- a/fms_mo/modules/bmm.py +++ b/fms_mo/modules/bmm.py @@ -20,6 +20,7 @@ # Local from fms_mo.quant.quantizers import Qbypass, Qdynamic, get_activation_quantizer +from fms_mo.quant.rotation import RotQuantWrapper class QBmm(nn.Module): @@ -131,8 +132,10 @@ def __init__( ) self.calib_iterator = [] # To simplify update of clipvals in forward() - self.quantize_m1 = Qbypass() - self.quantize_calib_m1 = Qbypass() + quant_m1_def = Qbypass() if "rot_" not in self.qm1_mode else RotQuantWrapper() + quant_m2_def = Qbypass() if "rot_" not in self.qm2_mode else RotQuantWrapper() + self.quantize_m1 = quant_m1_def + self.quantize_calib_m1 = quant_m1_def if self.num_bits_m1 not in [32, 16]: self.quantize_m1 = get_activation_quantizer( self.qm1_mode if (not m1_bounded or "fp8" in qm1_mode) else "minmax", @@ -155,8 +158,8 @@ def __init__( symmetric=self.symmetric, ) - self.quantize_m2 = Qbypass() - self.quantize_calib_m2 = Qbypass() + self.quantize_m2 = quant_m2_def + self.quantize_calib_m2 = quant_m2_def if self.num_bits_m2 not in [32, 16]: self.quantize_m2 = get_activation_quantizer( self.qm2_mode if (not m2_bounded or "fp8" in qm2_mode) else "minmax", diff --git a/fms_mo/modules/linear.py b/fms_mo/modules/linear.py index 26b383c6..13b89a6d 100644 --- a/fms_mo/modules/linear.py +++ b/fms_mo/modules/linear.py @@ -36,6 +36,7 @@ get_weight_quantizer, mask_fc_kij, ) +from fms_mo.quant.rotation import RotQuantWrapper from fms_mo.utils.import_utils import available_packages if available_packages["triton"]: @@ -158,8 +159,10 @@ def __init__( self.calib_iterator = [] # To simplify update of clipvals in forward() - self.quantize_feature = Qbypass() - self.quantize_calib_feature = Qbypass() + quantA_default = Qbypass() if "rot_" not in self.qa_mode else RotQuantWrapper() + quantW_default = Qbypass() if "rot_" not in self.qw_mode else RotQuantWrapper() + self.quantize_feature = quantA_default + self.quantize_calib_feature = quantA_default if self.num_bits_feature not in [32, 16]: self.quantize_feature = get_activation_quantizer( self.qa_mode, @@ -187,8 +190,8 @@ def __init__( quantizer2sync=self.quantize_feature, ) - self.quantize_weight = Qbypass() - self.quantize_calib_weight = Qbypass() + self.quantize_weight = quantW_default + self.quantize_calib_weight = quantW_default if self.num_bits_weight not in [32, 16]: 
             self.quantize_weight = get_weight_quantizer(
                 self.qw_mode,
diff --git a/fms_mo/prep.py b/fms_mo/prep.py
index 0e8501fa..25fc6886 100644
--- a/fms_mo/prep.py
+++ b/fms_mo/prep.py
@@ -215,7 +215,7 @@ def make_quant_module(module, curr_full_name, qcfg, verbose=False):
     base_params = {}
     if hasattr(module, "__constants__"):
         base_params = {k: getattr(module, k) for k in module.__constants__}
-    base_params["bias"] = module.bias is not None
+    base_params["bias"] = getattr(module, "bias", None) is not None
     base_params["device"] = next(module.parameters()).device  # usually cuda
 
     module_output = module
@@ -480,6 +480,12 @@ def make_quant_module(module, curr_full_name, qcfg, verbose=False):
             setattr(module_output, k, v)
         module_output._all_weights = module._all_weights
 
+    # For nn.Embedding
+    elif isinstance(module, nn.Embedding):
+        # simplest case, only support rotation for now, no quantization
+        Qemb = mapping.get(nn.Embedding, nn.Embedding)
+        module_output = Qemb(module)
+
     return module_output
 
diff --git a/fms_mo/quant/quantizers.py b/fms_mo/quant/quantizers.py
index abd24389..1c3ee5e3 100644
--- a/fms_mo/quant/quantizers.py
+++ b/fms_mo/quant/quantizers.py
@@ -77,7 +77,7 @@ def get_activation_quantizer(
     use_rot = False
     if "rot_" in qa_mode or "_rot" in qa_mode:
         use_rot = True
-        qa_mode.replace("rot_", "").replace("_rot", "")
+        qa_mode = qa_mode.replace("rot_", "").replace("_rot", "")
 
     if not use_swcap:
         QPACTLUT = {
@@ -134,23 +134,27 @@ def get_activation_quantizer(
             )
         elif qa_mode == "dorefa":
             act_quantizer = dorefa_quantize_activation
-        elif (
-            qa_mode == "max"
-        ):  # NOTE Need to be careful using this for activation, particular to 1 sided.
-            act_quantizer = Qmax(nbits, align_zero=align_zero, minmax=False)
-        elif qa_mode == "minmax":
-            act_quantizer = Qmax(nbits, align_zero=align_zero, minmax=True)
+        elif "max" in qa_mode:
+            # NOTE Need to be careful using this for activations, particularly one-sided ones.
+            if "min" in qa_mode:
+                act_quantizer = Qmax(nbits, align_zero=align_zero, minmax=True)
+            elif "pertoken" in qa_mode or "perToken" in qa_mode:
+                act_quantizer = QMaxDynamic(nbits, dim=-1)
+            elif "per_channel" in qa_mode or "perCh" in qa_mode:
+                act_quantizer = QMaxDynamic(nbits, dim=-2)
+            elif "sym" in qa_mode:
+                act_quantizer = Qmax(
+                    nbits,
+                    align_zero=True,
+                    minmax=False,
+                    extend_act_range=extend_act_range,
+                )
+            else:
+                act_quantizer = Qmax(nbits, align_zero=align_zero, minmax=False)
         elif qa_mode == "fix":
             act_quantizer = QFixSymmetric(
                 nbits, init_clip_val=clip_val, align_zero=align_zero
             )
-        elif qa_mode == "maxsym":
-            act_quantizer = Qmax(
-                nbits,
-                align_zero=True,
-                minmax=False,
-                extend_act_range=extend_act_range,
-            )
         elif qa_mode == "pactsym":
             act_quantizer = PACT2Sym(
                 nbits,
@@ -190,8 +194,6 @@ def get_activation_quantizer(
                 perToken=perToken,
                 emulate=True,
             )
-        elif qa_mode == "pertokenmax":
-            act_quantizer = PerTokenMax(nbits)
         else:
             raise ValueError(f"unrecognized activation quantization mode {qa_mode}")
     else:  # swcap-compatible activation quantizers
@@ -266,7 +268,7 @@ def get_weight_quantizer(
     use_rot = False
     if "rot_" in qw_mode or "_rot" in qw_mode:
         use_rot = True
-        qw_mode.replace("rot_", "").replace("_rot", "")
+        qw_mode = qw_mode.replace("rot_", "").replace("_rot", "")
 
     weight_quantizer = None
     if not use_swcap:
         cggrad = "cgpact" in qw_mode
@@ -3495,7 +3497,7 @@ def __init__(self, num_bits):
         """
         For per-token activation quantization using abs().max() as scale,
         Zero is aligned so that the levels are symmetric around zero (lossing one level)
-        Since the token length is un-known before running, the quatnization is dynamic, meaning
+        Since the token length is un-known before running, the quantization is dynamic, meaning
         no trainable quantization scales and the scales are computed at run time.
         """
         super().__init__()
@@ -3512,6 +3514,42 @@ def __repr__(self):
         return f"{self.__class__.__name__}(num_bits={self.num_bits}, quantizer=)"
 
 
+class QMaxDynamic(nn.Module):
+    def __init__(self, num_bits, dim=-1):
+        """
+        For per-token or per-channel quantization using abs().max() as scale, usually for
+        activations; could be used for Qbmm M2 as well.
+        (reduce) dim = -1 -> abs().max() outputs a column vector (if input is 2D) => per token
+                 dim = -2 -> per channel
+        Zero is aligned so that the levels are symmetric around zero (losing one level).
+        Since the token length is unknown before running, the quantizer can only calculate the
+        scales dynamically at run time, meaning no trainable quantization scale is allowed
+        (unless the input seq length is always the same, not just padded to a fixed length.)
+        """
+        super().__init__()
+        self.num_bits = num_bits
+        self.levels = 2 ** (self.num_bits - 1) - 1
+        if isinstance(dim, str):
+            if "perCh" in dim or "per_channel" in dim:
+                dim = -2
+            elif "perToken" in dim or "per_token" in dim or "per_Token" in dim:
+                dim = -1
+        if dim in [-1, -2]:
+            self.reduce_dim = dim
+        else:
+            raise ValueError(
+                f"Reduce dim can only be [-1, -2] or ['perCh', 'perToken'] but found {dim}"
+            )
+
+    def forward(self, input_tensor):
+        amax_dim = input_tensor.abs().max(dim=self.reduce_dim, keepdim=True)[0]
+        scales = amax_dim.clamp(min=1e-5).div(self.levels)
+        return input_tensor.div(scales).round().mul(scales)
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}(num_bits={self.num_bits}, dim={self.reduce_dim})"
+
+
 class Qdynamic(nn.Module):
     def __init__(
         self,
@@ -4585,7 +4623,7 @@ def forward(self, x_orig):
 
 class Qbypass(nn.Module):
     """
-    no quantization at all, straight-thru
+    No quantization at all, output the input_tensor directly.
     in place of lambda function when using nbits=32 and 16.
     to avoid issue when pickle (ie torch.save) of lambda (seems to be a problem only for DDP)
 
diff --git a/fms_mo/quant/rotation.py b/fms_mo/quant/rotation.py
index db18d73b..70cf284d 100644
--- a/fms_mo/quant/rotation.py
+++ b/fms_mo/quant/rotation.py
@@ -18,62 +18,178 @@
 import torch
 
 # Local
-from fms_mo.utils.hadamard_util import matmul_hadU_cuda
+from fms_mo.utils.hadamard_util import matmul_hadU, matmul_hadU_cuda
 
 
 class RotQuantWrapper(torch.nn.Module):
     """Add a wrapper to fms-mo quantizers. Objects of this class could have two rotation tensors,
     and the basic formula is:
 
-        self.quantizer(self.rot_left @ input_tensor @ self.rot_right)
+        quantizer(Rot_left @ input_tensor @ Rot_right)
 
-    NOTE rot_xxx could be optional, depending on whether it's for weights or activations.
-    For example, in SpinQuant QKV Linears will look like (pseudo-code, "self" is not referring
-    to the same objects here):
-        qx = self.quantize_feature(x)  # no rotation, just a normal quantizer
-        qw_q = self.quantize_weight(self.weight, R1_t)  # need left rotation only
-        qw_k = self.quantize_weight(self.weight, R1_t)
-        qw_v = self.quantize_weight(self.weight, R1_t, R2)  # need both left and right rotation
+    But Rot_xxx could be optional, depending on whether it's for weights or activations.
 
-        return F.linear(qx, qw, bias)
+    For weights, two possible use cases in SpinQuant are:
+        (A^-1 W) and (A^-1 W B).
+    Since linear.weight is already W^T and should stay as (rotated W)^T, these two cases will be
+        (A^-1 W)^T = W^T (A^-1)^T = W^T A, as A is a Hadamard matrix
+        (A^-1 W B)^T = B^T W^T A
+    ** Furthermore, depending on whether R1 is A (v_proj) or B (o_ and down_proj), computation
+        could be slightly different:
+        if R1 is A (R_left):
+            calc W^T A first -> (W^T A)^T -> reshape -> *B -> .t() then ready for linear
+        else R1 is B (R_right):
+            calc B^T W^T first -> reshape -> *A -> ready for linear
 
-    for MLP down_proj
-        qx = self.quantize_feature(x, None, R4)  # for activation, should be x @ R
-        qw = self.quantize_weight(self.weight, R4_t, R1)
+    For activation (online rotation), it will always be (input_tensor @ R_right)
 
-        return F.linear(qx, qw, bias)
+    then return F.linear(qx, qw, bias)
 
-    Also need to make sure self.R is pointing to a nn.Parameter() if training on R is needed.
+    NOTE
+    0. If online_full_had == False and self.R_left is None => do nothing, apply quantizer ONLY.
+    1. Make sure self.R is pointing to a nn.Parameter() if training on R is needed.
+    2. Because R is a ptr to a nn.Param tensor, it CANNOT store a "transposed" copy, hence the use
+    """

-    def __init__(self, quantizer, *args, **kwargs):
+    def __init__(self, quantizer=None, *args, **kwargs):
         self.online_full_had = kwargs.pop("online_full_had", None)
-        self.f32_had = kwargs.pop("f32_had", None)
+        self.compute_dtype = kwargs.pop("compute_dtype", torch.float64)
         super().__init__(*args, **kwargs)
         self.quantizer = quantizer
         self.R_left = None
         self.R_right = None
-        self.K_left = None  # if K_xxx > 1, R_xxx is a special had matrix
+        self.K_left = None
         self.K_right = None
+        self.R1_is_left = True  # see docstring above
+        self.transpose_right = False  # this flag is for online rotation only
+        # If K_xxx == 1, use the exact Hadamard matrix (R_xxx won't be needed); if K > 1, R will
+        # be one of the 12 special had matrices (stored in a binary file).
+
+    def forward(self, inp):
+        org_dtype = inp.dtype
+
+        if self.R_left is not None:
+            # Case 1: weight rotation.
+            # Activation rotation will only have R_right; if an R_left existed for an
+            # activation, it should have been absorbed into the previous layer's W.
+            # Hence, R_left is not None can only mean weight rotation (not online), either
+            # 1) R_left only, or 2) both R_left and R_right.
+
+            in_feat, out_feat = inp.shape[-1], inp.shape[0]  # input is W^T, i.e. (out, in)
+            if self.R1_is_left:
+                # for q, k, v, up, gate, calc W^T A first. see details in docstring
+                inp = inp.to(self.compute_dtype) @ self.R_left.to(self.compute_dtype)

+                if self.R_right is not None:
+                    had_dim = self.R_right.shape[0]
+                    inp = inp.t()  # (W^T A)^T = A^T W, shape is (in, out)
+                    inp = inp.reshape(-1, out_feat // had_dim, had_dim)
+                    inp = inp.to(self.compute_dtype) @ self.R_right.to(
+                        self.compute_dtype
+                    )
+                    inp = inp.reshape((in_feat, out_feat)).t()
+
+            else:
+                assert self.R_right is not None, "R1 is R_right but R_right is None."
+
+                # for o, down, calc B^T W^T first, where R1 is B
+                inp = self.R_right.t().to(self.compute_dtype) @ inp.to(
+                    self.compute_dtype
+                )
+                had_dim = self.R_left.shape[0]
+                inp = inp.t()  # this will be W, not W^T, i.e. (in, out)
+                inp = inp.reshape(-1, in_feat // had_dim, had_dim)
+                inp = inp.to(self.compute_dtype) @ self.R_left.to(self.compute_dtype)
+                inp = inp.reshape((out_feat, in_feat))
+
+        elif self.R_right is not None or self.K_right == 1:
+            # Case 2: rotation for activation. should always be (inp @ R_right)
+            if self.online_full_had:
+                # Case 2-1: online, no training on R. When R_right is None (K==1), use exact size
+                if self.compute_dtype in [torch.float, torch.float64]:
+                    # follow the SpinQuant paper, use no higher than fp32 for online had
+                    inp = inp.float()
+
+                # matmul_hadU_cuda already includes the 1/sqrt(shape[-1]) normalization
+                if self.transpose_right and self.R_right is not None:
+                    inp = matmul_hadU_cuda(inp, self.R_right.t(), self.K_right)
+                else:
+                    inp = matmul_hadU_cuda(inp, self.R_right, self.K_right)
+                # non-CUDA fallback: inp = matmul_hadU(inp)
+            else:
+                # Case 2-2: offline (such as the last R before lm_head)
+                if self.transpose_right:
+                    inp = inp.to(self.compute_dtype) @ self.R_right.t().to(
+                        self.compute_dtype
+                    )
+                else:
+                    inp = inp.to(self.compute_dtype) @ self.R_right.to(
+                        self.compute_dtype
+                    )
+
+        # Case 3: both R_left and R_right are None and K != 1 => no rotation; apply the
+        # quantizer only, if it exists.
+
+        inp = inp.to(org_dtype)
+
+        if self.quantizer:
+            inp = self.quantizer(inp)
+
+        return inp
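+
+    # Hypothetical wiring sketch (illustration only; the actual setup is done by model-prep
+    # code, not by this class):
+    #     rq = RotQuantWrapper(quantizer=some_fms_mo_quantizer)
+    #     rq.R_left = R1          # e.g. nn.Parameter of shape (hidden, hidden)
+    #     rq.R_right = R2         # e.g. a small per-head Hadamard block, or None
+    #     qw = rq(linear.weight)  # rotate then quantize; output keeps the W^T shape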
+
+    def __repr__(self):
+        """Simplified repr for RotQuantWrapper: shows the wrapped quantizer and which
+        rotations (Rl/Rr) are attached."""
+        repr_str = "Only(" if self.quantizer is None else f"{self.quantizer.__class__.__name__}("
+
+        if self.R_left is not None or self.online_full_had:
+            # will do W or A rotation
+            repr_str = (
+                "Rot"
+                + repr_str
+                + f"{'' if self.R_left is None else 'Rl'},{'' if self.R_right is None else 'Rr'}"
+            )
+
+        return repr_str + ")"
+
+
+class EmbeddingRotWrapper(torch.nn.Module):
+    """Simply add a rotation after the input embeddings. The original code looks like:
+
+        input_embeds = self.embed_tokens(input_ids)
+
+    With this wrapper it becomes:
+
+        input_embeds = self.embed_tokens(input_ids)
+        dtype = input_embeds.dtype
+        if self.R is not None:
+            input_embeds = (input_embeds @ self.R).to(dtype)
+        return input_embeds

-    def forward(self, input_tensor):
-        org_dtype = input_tensor.dtype
+    Also need to make sure self.R is pointing to an nn.Parameter() if training on R is needed.
+    """

-        if self.online_full_had:
-            # online hadamard => rotation for activation. should be input_tensor @ R_right
-            # cannot be fused into W and no training, either.
-            if self.fp32_had:
-                input_tensor = input_tensor.float()
-            input_tensor = matmul_hadU_cuda(
-                input_tensor, self.R_right, self.K_right
+    def __init__(self, emb, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.emb = emb
+        self.R = None
+        self.compute_dtype = torch.float64
+
+    def forward(self, inp_ids):
+        inp_embeds = self.emb(inp_ids)
+        org_dtype = inp_embeds.dtype
+        if self.R is not None:
+            inp_embeds = (
+                inp_embeds.to(self.compute_dtype) @ self.R.to(self.compute_dtype)
             ).to(org_dtype)
-
-        return input_tensor
-
-        # not online => rotation for weights, could be fused into W later.
-        if self.R_left:
-            input_tensor = self.R_left @ inp_tensor
-        if self.R_right:
-            inp_tensor = inp_tensor @ self.R_right
-
-        return inp_tensor
+        return inp_embeds
+
+    def __repr__(self):
+        """Simplified repr for the rotated embedding."""
+        repr_str = f"Rot{str(self.emb)}"
+        if self.R is not None:
+            repr_str = repr_str.replace(")", ", Rr)")
+        return repr_str
diff --git a/fms_mo/utils/hadamard_util.py b/fms_mo/utils/hadamard_util.py
index 99f51f33..9f92b2e0 100644
--- a/fms_mo/utils/hadamard_util.py
+++ b/fms_mo/utils/hadamard_util.py
@@ -21,22 +21,32 @@
 sizes available in the safetensors file. [12, 20, 28, 36, 40, 44, 52, 60, 108, 140, 156, 172]
 """

+# Standard
+from pathlib import Path
+
 # Third Party
-from fast_hadamard_transform import hadamard_transform
+from fast_hadamard_transform import hadamard_transform  # pylint: disable=import-error
 from safetensors import safe_open
 import torch

+# TODO make sure it's a persistent cache so we don't need to load from file every time
+cwd = Path(__file__).parent
+hadKs = {}
+with safe_open(cwd / "hadk.safetensors", framework="pt", device="cuda") as f:
+    for K_str in f.keys():  # K is a str
+        hadKs[K_str] = f.get_tensor(K_str)
+

 class HadamardTransform(torch.autograd.Function):
     """The unnormalized Hadamard transform (i.e. without dividing by sqrt(2))"""

     # TODO seems redundant, insdie hadamard_transform(), backward is already handled...?
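+    # (Note: the unnormalized Hadamard matrix H is symmetric and the transform is linear,
+    #  so applying hadamard_transform() to the incoming gradient is the correct backward;
+    #  that is presumably why forward and backward below are identical.)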
     @staticmethod
-    def forward(ctx, u):
+    def forward(_ctx, u):
         return hadamard_transform(u)

     @staticmethod
-    def backward(ctx, grad):
+    def backward(_ctx, grad):
         return hadamard_transform(grad)


@@ -44,15 +54,9 @@ def get_hadK(n, transpose=False):
     """Simplify the implementation and use binary tensors instead of text implementation."""
     for K in [172, 156, 140, 108, 60, 52, 44, 40, 36, 28, 20, 12]:
         if n % K == 0 and is_pow2(n // K):
-            with safe_open("hadk.safetensors", framework="pt") as f:
-                assert (
-                    str(K) in f.keys()
-                ), f"Special size Hadamard {K} does not exist in the file."
-                hadK = f.get_tensor(str(K))
-
+            hadK = hadKs[str(K)]
             if transpose:
                 hadK = hadK.T
-
             break

     if hadK is None:
@@ -67,35 +71,39 @@ def get_hadK(n, transpose=False):


 def matmul_hadU(X, transpose=False):
+    """Borrowed from SpinQuant."""
     n = X.shape[-1]
     hadK, K = get_hadK(n, transpose)
-    input = X.clone().view(-1, n, 1)
-    output = input.clone()
-    while input.shape[1] > K:
-        input = input.view(input.shape[0], input.shape[1] // 2, 2, input.shape[2])
-        output = output.view(input.shape)
-        output[:, :, 0, :] = input[:, :, 0, :] + input[:, :, 1, :]
-        output[:, :, 1, :] = input[:, :, 0, :] - input[:, :, 1, :]
-        output = output.view(input.shape[0], input.shape[1], -1)
-        (input, output) = (output, input)
+    input_ = X.clone().view(-1, n, 1)
+    output = input_.clone()
+    while input_.shape[1] > K:
+        input_ = input_.view(input_.shape[0], input_.shape[1] // 2, 2, input_.shape[2])
+        output = output.view(input_.shape)
+        output[:, :, 0, :] = input_[:, :, 0, :] + input_[:, :, 1, :]
+        output[:, :, 1, :] = input_[:, :, 0, :] - input_[:, :, 1, :]
+        output = output.view(input_.shape[0], input_.shape[1], -1)
+        (input_, output) = (output, input_)
     del output
     if K > 1:
         # Do not explicitly repeat - OOM
-        # input = torch.bmm(
-        #     hadK.repeat(len(input), 1, 1).to(input.device).to(input.dtype), input)
+        # input_ = torch.bmm(
+        #     hadK.repeat(len(input_), 1, 1).to(input_.device).to(input_.dtype), input_)
         # Use bcast instead
-        input = hadK.view(1, K, K).to(input) @ input
+        input_ = hadK.view(1, K, K).to(input_) @ input_

-    return input.view(X.shape) / torch.tensor(n).sqrt()
+    return input_.view(X.shape) / torch.tensor(n).sqrt()


 def matmul_hadUt(X):
+    """Borrowed from SpinQuant."""
     return matmul_hadU(X, transpose=True)


 def random_hadamard_matrix(size, device):
-    # See https://cornell-relaxml.github.io/quip-sharp/ , Section "Randomized Hadamard Transformation"
+    """Borrowed from SpinQuant."""
+    # See https://cornell-relaxml.github.io/quip-sharp/
+    # Section "Randomized Hadamard Transformation"
     Q = torch.randint(low=0, high=2, size=(size,)).to(torch.float64)
     Q = Q * 2 - 1
     Q = torch.diag(Q)
@@ -103,28 +111,31 @@ def random_hadamard_matrix(size, device):


 def hadamard_matrix(size, device):
-    # See https://cornell-relaxml.github.io/quip-sharp/ , Section "Randomized Hadamard Transformation"
+    """Borrowed from SpinQuant."""
     Q = torch.eye(size)
     return matmul_hadU(Q).to(device)


 def matmul_hadU_cuda(X, hadK, K):
+    """Borrowed from SpinQuant."""
     n = X.shape[-1]
     if K == 1:
         return HadamardTransform.apply(X.contiguous()) / torch.tensor(n).sqrt()

     # if transpose:
     #     hadK = hadK.T.contiguous()
-    input = X.view(-1, K, n // K)
-    input = HadamardTransform.apply(input.contiguous()) / torch.tensor(n).sqrt()
-    input = hadK.to(input.device).to(input.dtype) @ input
-    return input.reshape(X.shape)
+    input_ = X.view(-1, K, n // K)
+    input_ = HadamardTransform.apply(input_.contiguous()) / torch.tensor(n).sqrt()
+    input_ = hadK.to(input_.device).to(input_.dtype) @ input_
+    return input_.reshape(X.shape)
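+
+
+# Illustrative note: for a hidden size like 1536 = 12 * 2**7, get_hadK() returns the stored
+# 12x12 Hadamard block with K = 12; matmul_hadU_cuda() then applies the power-of-2 part via
+# fast_hadamard_transform and the 12x12 block via a broadcasted matmul.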

-def matmul_hadUt_cuda(X, hadK, K):
-    return matmul_hadU_cuda(X, hadK, K, transpose=True)
+# def matmul_hadUt_cuda(X, hadK, K):
+#     """Borrowed from SpinQuant."""
+#     return matmul_hadU_cuda(X, hadK, K, transpose=True)


 def apply_exact_had_to_linear(module, had_dim=-1, output=False, R2=None):
+    """Borrowed from SpinQuant."""
     assert isinstance(module, torch.nn.Linear)
     in_features, out_features = module.in_features, module.out_features
@@ -163,6 +174,7 @@ def apply_exact_had_to_linear(module, had_dim=-1, output=False, R2=None):


 def is_pow2(n):
+    """Borrowed from SpinQuant."""
     return (n & (n - 1) == 0) and (n > 0)
diff --git a/fms_mo/utils/qconfig_utils.py b/fms_mo/utils/qconfig_utils.py
index caafec16..ef8f973d 100644
--- a/fms_mo/utils/qconfig_utils.py
+++ b/fms_mo/utils/qconfig_utils.py
@@ -568,18 +568,9 @@ def check_config(config, model_dtype=None):
         )

     # Set allowed qa_modes, qw_modes, bmm_modes
-    qa_mode_settings = [
-        "pact",
-        "pact+",
-        "pactsym",
-        "pactsym+",
-        "max",
-        "minmax",
-        "maxsym",
-        "pertokenmax",
-        "lsq+",
-        "fix",
-        "brecq",
+    shared_modes = [
+        "max_perToken",
+        "max_perCh",
         # fp8_e4m3
         "fp8_e4m3_sat",
         "fp8_e4m3_scale",
@@ -594,6 +585,23 @@ def check_config(config, model_dtype=None):
         "fp8_e5m2_scale_perCh",
         "fp8_e5m2_sat_perToken",
         "fp8_e5m2_scale_perToken",
+        # others
+        "_only",  # i.e. "rot_only" after the "rot_" prefix is stripped below
+        "no_quant",  # could be used for those nbits = 16 or 32
+    ]
+
+    qa_mode_settings = [
+        "pact",
+        "pact+",
+        "pactsym",
+        "pactsym+",
+        "max",
+        "minmax",
+        "maxsym",
+        "pertokenmax",
+        "lsq+",
+        "fix",
+        "brecq",
     ]
     qw_mode_settings = [
         "sawb",
@@ -616,20 +624,6 @@ def check_config(config, model_dtype=None):
         "brecq",
         "adaround",
         "pertokenmax",
-        # fp8_e4m3
-        "fp8_e4m3_sat",
-        "fp8_e4m3_scale",
-        "fp8_e4m3_sat_perCh",
-        "fp8_e4m3_scale_perCh",
-        "fp8_e4m3_sat_perToken",
-        "fp8_e4m3_scale_perToken",
-        # fp8_e5m2
-        "fp8_e5m2_sat",
-        "fp8_e5m2_scale",
-        "fp8_e5m2_sat_perCh",
-        "fp8_e5m2_scale_perCh",
-        "fp8_e5m2_sat_perToken",
-        "fp8_e5m2_scale_perToken",
     ]
     bmm_mode_settings = [
         "pact",
@@ -639,10 +633,6 @@ def check_config(config, model_dtype=None):
         "max",
         "minmax",
         "pertokenmax",
-        "fp8_e4m3_sat",
-        "fp8_e4m3_scale_perToken",
-        "fp8_e5m2_sat",
-        "fp8_e5m2_scale_perToken",
     ]

     # Get strings in config for qa_modes, qw_modes, bmm_modes
@@ -663,24 +653,24 @@ def check_config(config, model_dtype=None):

     # Check each for correct ranges
     for qa_mode_str in qa_modes_str:
-        qa_mode = config.get(qa_mode_str, "pact+")
-        if not qa_mode in qa_mode_settings:
+        qa_mode = config.get(qa_mode_str, "pact+").replace("rot_", "").replace("_rot", "")
+        if not (qa_mode in qa_mode_settings or qa_mode in shared_modes):
             raise ValueError(
                 f"{qa_mode_str} = {qa_mode} is not set to one of the following: "
                 f"{qa_mode_settings}"
             )

     for qw_mode_str in qw_modes_str:
-        qw_mode = config.get(qw_mode_str, "sawb+")
-        if not qw_mode in qw_mode_settings:
+        qw_mode = config.get(qw_mode_str, "sawb+").replace("rot_", "").replace("_rot", "")
+        if not (qw_mode in qw_mode_settings or qw_mode in shared_modes):
             raise ValueError(
                 f"{qw_mode_str} = {qw_mode} is not set to one of the following: "
                 f"{qw_mode_settings}"
             )

     for bmm_mode_str in bmm_modes_str:
-        bmm_mode = config.get(bmm_mode_str, "pactsym+")
-        if not bmm_mode in bmm_mode_settings:
+        bmm_mode = config.get(bmm_mode_str, "pactsym+").replace("rot_", "").replace("_rot", "")
+        if not (bmm_mode in bmm_mode_settings or bmm_mode in shared_modes):
             raise ValueError(
                 f"{bmm_mode_str} = {bmm_mode} is not set to one of the following: "
                 f"{bmm_mode_settings}"
diff --git a/fms_mo/utils/utils.py b/fms_mo/utils/utils.py
index 38e2a1db..e3333163 100644
--- a/fms_mo/utils/utils.py
+++ b/fms_mo/utils/utils.py
@@ -23,6 +23,7 @@

 # Standard
 from contextlib import ExitStack, contextmanager
+from functools import partial
 from typing import Any, Callable, Dict, List, Tuple, Union
 from unittest import mock
 import logging
@@ -71,7 +72,12 @@ def move_to(obj, device):
     return obj


-def mockbmm(mat1, mat2, default_to_torch=False):
+def mockbmm(
+    mat1,
+    mat2,
+    default_to_torch=False,
+    target_line_num=(0,),  # tuple, to avoid a mutable default argument
+):
     """
     This function is used to mock the behavior of the bmm function in PyTorch.
     It is used to work around the fact that the bmm function in PyTorch is not
@@ -87,7 +93,9 @@
     cf = sys._getframe()
     qbmm_mod = None
     qbmm_lineno = cf.f_back.f_lineno
-    while cf.f_back and qbmm_mod is None:
+    if qbmm_lineno not in target_line_num:
+        default_to_torch = True
+    while (not default_to_torch) and cf.f_back and qbmm_mod is None:
         # First frame is QBmm's forward itself, can start searching from previous stack
         cf = cf.f_back
         if (
@@ -102,13 +110,21 @@
     return qbmm_mod(mat1, mat2)


-def mockmatmul(mat1, mat2, default_to_torch=False):
+def mockmatmul(
+    mat1,
+    mat2,
+    default_to_torch=False,
+    target_line_num=(0,),  # tuple, to avoid a mutable default argument
+):
     """
     Patches torch.matmul() with QBmm( torch.bmm() )

     Args:
         mat1 (torch.Tensor): The first matrix to be multiplied.
         mat2 (torch.Tensor): The second matrix to be multiplied.
+        target_line_num: Only patch the matmul/bmm calls on the line numbers previously found
+            by qmodel_prep, i.e., matmuls/bmms other than those in self-attn will not be
+            patched. => need to make sure qmodel_prep found only the expected ones.

     Returns:
         torch.Tensor: The result of the mock matrix multiplication.
@@ -124,7 +140,9 @@
     cf = sys._getframe()
     qbmm_mod = None
     qbmm_lineno = cf.f_back.f_lineno
-    while cf.f_back and qbmm_mod is None:
+    if qbmm_lineno not in target_line_num:
+        default_to_torch = True
+    while (not default_to_torch) and cf.f_back and qbmm_mod is None:
         cf = cf.f_back
         if (
             "forward" in cf.f_code.co_name or "_attn" in cf.f_code.co_name
@@ -134,16 +152,17 @@ def mockmatmul(mat1, mat2, default_to_torch=False):
             qbmm_mod = getattr(mod_calling_bmm_function, f"QBmm{qbmm_lineno}", None)
     del cf

-    # Didn't find the corresponding QBmm, default the call to torch.bmm
+    # Didn't find the corresponding QBmm, default the call to torch.bmm, which only accepts 3D
     if qbmm_mod is None and default_to_torch:
-        org_batch_header = mat1.shape[:2]
-        # Need to double check m1/m2 are 3d, otherwise reshape
-        if len(mat1.shape) > 3:
+        # Need to reshape if inputs are 2d or 4d; the expected bmm output shape is always
+        # (..., mat1.shape[-2], mat2.shape[-1])
+        tar_shape = list(mat1.shape[:-1]) + [mat2.shape[-1]]
+        if len(mat1.shape) == len(mat2.shape) and len(mat2.shape) in [2, 4]:
             mat1 = mat1.reshape([-1, mat1.shape[-2], mat1.shape[-1]])
-        if len(mat2.shape) > 3:
             mat2 = mat2.reshape([-1, mat2.shape[-2], mat2.shape[-1]])
         output = torch.bmm(mat1, mat2)
-        output = output.reshape([*org_batch_header, *output.shape[1:]])
+        output = output.reshape(tar_shape)
         return output
     return qbmm_mod(mat1, mat2)
@@ -158,13 +177,14 @@ def patch_torch_bmm(qcfg):
     if qcfg is not None:
         # could be 'torch.bmm', 'torch.matmul', or None
         ops_to_patch = qcfg.get("which2patch_contextmanager", None)
+        tar_ln = list(qcfg["bmm_prep"]["layers_with_bmm"].values())[0]
         # if qcfg["bmm_prep"]["bmm_only_in_self_attn"] is False, may need to enable
         # default_to_torch in the mock functions, e.g. partial(mockmatmul, default_to_torch=True).
         # This is in case a model uses extra matmuls, and QBmmXXX is not found or attached
        # properly.
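+        # e.g. tar_ln might look like [1234, 1250]: the recorded line numbers of the two
+        #      self-attn matmuls found by qmodel_prep (values here are purely illustrative)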
         new_target = (
-            mockbmm
+            partial(mockbmm, target_line_num=tar_ln)
             if ops_to_patch == "torch.bmm"
-            else mockmatmul
+            else partial(mockmatmul, target_line_num=tar_ln)
             if ops_to_patch == "torch.matmul"
             else None
         )

From 53c3ed11e3cb0d1137b38bc0ecc5647ce50630b8 Mon Sep 17 00:00:00 2001
From: cliu-us
Date: Tue, 29 Jul 2025 11:13:14 -0400
Subject: [PATCH 3/3] minor bug fix

Signed-off-by: cliu-us
---
 fms_mo/utils/hadamard_util.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fms_mo/utils/hadamard_util.py b/fms_mo/utils/hadamard_util.py
index 9f92b2e0..c38a024d 100644
--- a/fms_mo/utils/hadamard_util.py
+++ b/fms_mo/utils/hadamard_util.py
@@ -52,6 +52,7 @@ def backward(_ctx, grad):

 def get_hadK(n, transpose=False):
     """Simplify the implementation and use binary tensors instead of text implementation."""
+    hadK = None
     for K in [172, 156, 140, 108, 60, 52, 44, 40, 36, 28, 20, 12]:
         if n % K == 0 and is_pow2(n // K):
             hadK = hadKs[str(K)]