AMD64.FoxArrayBaseOptimized.Mod

  1. MODULE FoxArrayBaseOptimized; (** AUTHOR "fof"; PURPOSE ""; **)
  2. IMPORT SYSTEM, ArrayBase := FoxArrayBase, Machine, KernelLog, Commands ;
  3. CONST
  4. L2CacheSize = 512 * 1024; (* L1CacheSize = 16 * 1024; *)
  5. (* parameters for blocking matrix multiplication *)
  6. L1BlockN = 5; (* L1 block size -> nr of columns in a block that can be processed using L1 cache *)
  7. L2BARatio = 1;
  8. L0BlockKR = 4; (* L0 block size -> nr of elements that can be processed at once for type REAL *)
  9. L1MaxBlockKR = 336; (* L1CacheSize/SIZEOF(REAL)/2/6*)
  10. L2BlockSize = 81920;
  11. L0BlockKX = 2; (* L0 block size -> nr of elements that can be processed at once for type LONGREAL *)
  12. L1MaxBlockKX = 256; (* > L1CacheSize/SIZEOF(LONGREAL)/2/6*)
  13. (*
  14. DefaultL2CacheSize = 81920;
  15. L2SizeR = L2CacheSize DIV 8; MaxBlockKR = 336; (* ca L1CacheSize/SIZEOF(REAL)/2/6*) (* nr of elements that can be processed using L2 cache *)
  16. L2SizeX = L2CacheSize DIV 8; MaxBlockKX = 256; (* bit more than L1CacheSize/SIZEOF(LONGREAL)/2/6*) (* nr of elements that can be processed using L2 cache *)
  17. *)
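(* Sanity check of the values above, assuming the commented-out L1CacheSize = 16 * 1024 and the
   stated formula L1CacheSize/SIZEOF(type)/2/6 (the factor 6 presumably covering the 5 columns of
   an L1 block plus the A panel):
   REAL: 16384 DIV 4 DIV 2 DIV 6 = 341, and L1MaxBlockKR = 336 is this value rounded down to a
   multiple of the 16-element unrolling used in the REAL kernels;
   LONGREAL: 16384 DIV 8 DIV 2 DIV 6 = 170, and L1MaxBlockKX = 256 is deliberately set above that,
   as the ">" remark indicates. *)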
  18. debug = FALSE; parallel = TRUE; SSE = TRUE;
  19. MaxCachePoolSize = 0 (* disabled *) (* 646*1024*1024 *) (* enabled *) ;
  20. maxProcesses = 48;
  21. cMatMulDynamic* = -1; cMatMulScalarProduct* = 0;
  22. cMatMulNaive* = 1; cMatMulTransposed* = 2;
  23. cMatMulStride* = 3; cMatMulBlocked* = 4;
  24. VAR
  25. cBlockSize*: LONGINT; nrProcesses*: LONGINT;
  26. lastUsedBlockSize*: SIZE;
  27. allocT-, copyT-, zeroT-, compT-: HUGEINT;
  28. TYPE
  29. Cache = POINTER TO RECORD
  30. p: ANY;
  31. adr: ADDRESS; size: SIZE;
  32. prev, next: Cache;
  33. END;
  34. CachePool = OBJECT
  35. (*! provide heuristics for overall size *)
  36. VAR first, last: Cache;
  37. PROCEDURE & Init*;
  38. BEGIN
  39. NEW( first ); first.size := 0; (* sentinel *)
  40. NEW( last ); last.size := MAX( SIZE ); (* sentinel *)
  41. first.next := last; first.prev := NIL; last.prev := first; last.next := NIL;
  42. END Init;
  43. PROCEDURE Acquire( size: SIZE ): Cache;
  44. VAR c: Cache; t: HUGEINT;
  45. BEGIN {EXCLUSIVE}
  46. IF size = 0 THEN RETURN first END;
  47. Tic( t );
  48. c := last;
  49. WHILE (c.prev.size >= size) DO
  50. c := c.prev;
  51. END;
  52. IF c = last THEN
  53. NEW( c ); SYSTEM.NEW( c.p, size + 16 );
  54. c.adr := Align( c.p , 16 );
  55. c.size := size;
  56. ELSE
  57. c.prev.next := c.next;
  58. c.next.prev := c.prev;
  59. c.prev := NIL; c.next := NIL;
  60. END;
  61. Toc( t, allocT ); RETURN c;
  62. END Acquire;
  63. PROCEDURE Release( c: Cache );
  64. VAR t: Cache;
  65. BEGIN {EXCLUSIVE}
  66. IF (c=first) OR (c=NIL) THEN RETURN END;
  67. ASSERT(c.size > 0);
  68. IF c.size > MaxCachePoolSize THEN RETURN END;
  69. t := first;
  70. WHILE (t.size <= c.size) DO t := t.next; END;
  71. c.prev := t.prev; c.next := t; t.prev := c; c.prev.next := c;
  72. END Release;
  73. END CachePool;
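(* Note on CachePool: cached buffers are kept in a doubly linked list ordered by size and bounded by
   two sentinels (first.size = 0, last.size = MAX(SIZE)). Acquire walks the list from the back and
   unlinks the smallest cached block of at least "size" bytes; if none fits, a fresh block of
   size + 16 bytes is allocated so that adr can be rounded up to a 16-byte boundary for the SSE
   kernels. Release reinserts a block in sorted order, but with MaxCachePoolSize = 0 (see CONST)
   every block larger than that is simply dropped, i.e. reuse is currently disabled.
   Minimal usage sketch (c and n are placeholder names):
   c := cachePool.Acquire( n * SIZEOF( REAL ) );  ...  use the memory at c.adr  ...  cachePool.Release( c );
*)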
  74. ComputationObj = OBJECT
  75. VAR done: BOOLEAN;
  76. PROCEDURE & Init*;
  77. BEGIN
  78. done := FALSE;
  79. END Init;
  80. PROCEDURE Compute; (*abstract*)
  81. END Compute;
  82. PROCEDURE Wait;
  83. BEGIN {EXCLUSIVE}
  84. AWAIT( done );
  85. END Wait;
  86. BEGIN {ACTIVE, EXCLUSIVE}
  87. Compute; done := TRUE;
  88. END ComputationObj;
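(* Note on ComputationObj: a simple fork/join helper. The {ACTIVE, EXCLUSIVE} body runs Compute in
   its own thread as soon as the object has been initialized and then sets done; Wait blocks the
   caller with AWAIT( done ). The subclasses below redefine Compute with the actual kernel call.
   Usage sketch (obj is a placeholder name):
   NEW( obj, parameters );  ...  do other work  ...  obj.Wait;
*)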
  89. MatMulHObjR = OBJECT (ComputationObj)
  90. VAR MatrixA, MatrixB, MatrixC: ADDRESS; Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
  91. add: BOOLEAN;
  92. PROCEDURE & InitR*( MatrixA, MatrixB, MatrixC: ADDRESS; Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
  93. add: BOOLEAN );
  94. BEGIN
  95. Init; SELF.MatrixA := MatrixA; SELF.MatrixB := MatrixB;
  96. SELF.MatrixC := MatrixC; SELF.Stride := Stride;
  97. SELF.IncC := IncC; SELF.StrideC := StrideC;
  98. SELF.RowsA := RowsA; SELF.RowsB := RowsB;
  99. SELF.Cols := Cols; SELF.add := add;
  100. END InitR;
  101. PROCEDURE Compute;
  102. BEGIN
  103. MatMulHBlockR( MatrixA, MatrixB, MatrixC, Stride, IncC,
  104. StrideC, RowsA, RowsB, Cols, add );
  105. END Compute;
  106. END MatMulHObjR;
  107. MatMulHObjX = OBJECT (ComputationObj)
  108. VAR MatrixA, MatrixB, MatrixC: ADDRESS; Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
  109. add: BOOLEAN;
  110. PROCEDURE & InitX*( MatrixA, MatrixB, MatrixC: ADDRESS; Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
  111. add: BOOLEAN );
  112. BEGIN
  113. Init; SELF.MatrixA := MatrixA; SELF.MatrixB := MatrixB;
  114. SELF.MatrixC := MatrixC; SELF.Stride := Stride;
  115. SELF.IncC := IncC; SELF.StrideC := StrideC;
  116. SELF.RowsA := RowsA; SELF.RowsB := RowsB;
  117. SELF.Cols := Cols; SELF.add := add;
  118. END InitX;
  119. PROCEDURE Compute;
  120. BEGIN
  121. MatMulHBlockX( MatrixA, MatrixB, MatrixC, Stride, IncC,
  122. StrideC, RowsA, RowsB, Cols, add );
  123. END Compute;
  124. END MatMulHObjX;
  125. MultiplyObjectR = OBJECT (ComputationObj)
  126. VAR adrA, adrB: ADDRESS; C, M, N, K, IncC, StrideC, L2BlockM, L2BlockN, L2BlockK:SIZE;
  127. start, finished: BOOLEAN;
  128. PROCEDURE & InitR*( adrA, adrB: ADDRESS; C, M, N, K, IncC, StrideC, L2BlockM, L2BlockN, L2BlockK: SIZE );
  129. BEGIN
  130. Init; start := FALSE; finished := FALSE;
  131. SELF.adrA := adrA; SELF.adrB := adrB; SELF.C := C;
  132. SELF.M := M; SELF.N := N; SELF.K := K;
  133. SELF.IncC := IncC; SELF.StrideC := StrideC;
  134. SELF.L2BlockM := L2BlockM;
  135. SELF.L2BlockN := L2BlockN;
  136. SELF.L2BlockK := L2BlockK;
  137. END InitR;
  138. PROCEDURE Compute;
  139. BEGIN
  140. L3BlockR( adrA, adrB, C, M, N, K, IncC, StrideC, L2BlockM,
  141. L2BlockN, L2BlockK );
  142. END Compute;
  143. END MultiplyObjectR;
  144. MultiplyObjectX = OBJECT (ComputationObj)
  145. VAR adrA, adrB:ADDRESS; C, M, N, K, IncC, StrideC, L2BlockM, L2BlockN, L2BlockK: SIZE;
  146. start, finished: BOOLEAN;
  147. PROCEDURE & InitX*( adrA, adrB: ADDRESS; C, M, N, K, IncC, StrideC, L2BlockM, L2BlockN, L2BlockK: SIZE );
  148. BEGIN
  149. Init; start := FALSE; finished := FALSE;
  150. SELF.adrA := adrA; SELF.adrB := adrB; SELF.C := C;
  151. SELF.M := M; SELF.N := N; SELF.K := K;
  152. SELF.IncC := IncC; SELF.StrideC := StrideC;
  153. SELF.L2BlockM := L2BlockM;
  154. SELF.L2BlockN := L2BlockN;
  155. SELF.L2BlockK := L2BlockK;
  156. END InitX;
  157. PROCEDURE Compute;
  158. BEGIN
  159. L3BlockX( adrA, adrB, C, M, N, K, IncC, StrideC, L2BlockM,
  160. L2BlockN, L2BlockK );
  161. END Compute;
  162. END MultiplyObjectX;
  163. VAR
  164. (* ran: Random.Generator; (* testing *)*)
  165. cachePool: CachePool;
  166. (*********** Part 0: assembler routines ***************)
  167. PROCEDURE -L1Block1XA( adrA, adrB, adrC: ADDRESS; K: SIZE );
  168. CODE {SYSTEM.AMD64, SYSTEM.FPU}
  169. MOV RAX, [RSP+K] ; RAX IS counter
  170. MOV RDX, [RSP+adrC]
  171. MOV RCX, [RSP+adrB] ; RCX IS POINTER TO data OF matrix B
  172. MOV RBX, [RSP+adrA] ; RBX IS POINTER TO data OF matrix A
  173. FLD QWORD [RDX] ; S.GET(dadr, x)
  174. loop8:
  175. CMP RAX, 8
  176. JL loop1
  177. FLD QWORD[RBX] ; S.GET(ladr, x)
  178. ADD RBX, 8 ; INC(ladr, incl)
  179. FLD QWORD[RCX] ; S.GET(ladr, y)
  180. ADD RCX, 8 ; INC(radr, incr)
  181. FMULP ; x := x*y
  182. FADDP ; z := z+x
  183. FLD QWORD[RBX] ; S.GET(ladr, x)
  184. ADD RBX, 8 ; INC(ladr, incl)
  185. FLD QWORD[RCX] ; S.GET(ladr, y)
  186. ADD RCX, 8 ; INC(radr, incr)
  187. FMULP ; x := x*y
  188. FADDP ; z := z+x
  189. FLD QWORD[RBX] ; S.GET(ladr, x)
  190. ADD RBX, 8 ; INC(ladr, incl)
  191. FLD QWORD[RCX] ; S.GET(ladr, y)
  192. ADD RCX, 8 ; INC(radr, incr)
  193. FMULP ; x := x*y
  194. FADDP ; z := z+x
  195. FLD QWORD[RBX] ; S.GET(ladr, x)
  196. ADD RBX, 8 ; INC(ladr, incl)
  197. FLD QWORD[RCX] ; S.GET(ladr, y)
  198. ADD RCX, 8 ; INC(radr, incr)
  199. FMULP ; x := x*y
  200. FADDP ; z := z+x
  201. FLD QWORD[RBX] ; S.GET(ladr, x)
  202. ADD RBX, 8 ; INC(ladr, incl)
  203. FLD QWORD[RCX] ; S.GET(ladr, y)
  204. ADD RCX, 8 ; INC(radr, incr)
  205. FMULP ; x := x*y
  206. FADDP ; z := z+x
  207. FLD QWORD[RBX] ; S.GET(ladr, x)
  208. ADD RBX, 8 ; INC(ladr, incl)
  209. FLD QWORD[RCX] ; S.GET(ladr, y)
  210. ADD RCX, 8 ; INC(radr, incr)
  211. FMULP ; x := x*y
  212. FADDP ; z := z+x
  213. FLD QWORD[RBX] ; S.GET(ladr, x)
  214. ADD RBX, 8 ; INC(ladr, incl)
  215. FLD QWORD[RCX] ; S.GET(ladr, y)
  216. ADD RCX, 8 ; INC(radr, incr)
  217. FMULP ; x := x*y
  218. FADDP ; z := z+x
  219. FLD QWORD[RBX] ; S.GET(ladr, x)
  220. ADD RBX, 8 ; INC(ladr, incl)
  221. FLD QWORD[RCX] ; S.GET(ladr, y)
  222. ADD RCX, 8 ; INC(radr, incr)
  223. FMULP ; x := x*y
  224. FADDP ; z := z+x
  225. SUB RAX, 8 ; DEC(len)
  226. JMP loop8 ;
  227. loop1:
  228. CMP RAX, 0 ; WHILE len > 0 DO
  229. JLE endL
  230. FLD QWORD[RBX] ; S.GET(ladr, x)
  231. ADD RBX, 8 ; INC(ladr, incl)
  232. FLD QWORD[RCX] ; S.GET(ladr, y)
  233. ADD RCX, 8 ; INC(radr, incr)
  234. FMULP ; x := x*y
  235. FADDP ; z := z+x
  236. DEC RAX ; DEC(len)
  237. JMP loop1 ;
  238. endL:
  239. FSTP QWORD[RDX] ; S.PUT(dadr, x)
  240. FWAIT ;
  241. ADD RSP, 32 ;
  242. END L1Block1XA;
  243. PROCEDURE -L1Block1XSSE( adrA, adrB, adrC: ADDRESS; K: SIZE );
  244. (*
  245. matrixA, matrixB must be stored in special format
  246. K>0 guaranteed
  247. *)
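(* Reading of the kernel below: "special format" means that both panels are packed contiguously and
   16-byte aligned, so MOVAPD can fetch two LONGREALs per load; K is consumed two at a time
   (unrolled by eight), so K is expected to be even. The length-K dot product of the A and B panels
   is accumulated in XMM2, the two halves are folded with SHUFPD/ADDPD, and the scalar result is
   added to the value at adrC. *)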
  248. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  249. MOV RBX, [RSP+adrA] ; RBX IS POINTER TO data OF matrix A
  250. MOV RCX, [RSP+adrB] ; RCX IS POINTER TO data OF matrix B
  251. MOV RDX, [RSP+K] ; RDX IS counter
  252. XORPD XMM2, XMM2 ;
  253. kLoop8: ;
  254. CMP RDX, 8 ;
  255. JL kLoop2 ;
  256. MOVAPD XMM7, [RBX] ;
  257. MOVAPD XMM0, [RCX] ;
  258. ADD RCX, 16 ;
  259. ADD RBX, 16 ;
  260. MOVAPD XMM6, [RBX] ;
  261. MOVAPD XMM1, [RCX] ;
  262. ADD RCX, 16 ;
  263. ADD RBX, 16 ;
  264. MULPD XMM0, XMM7 ;
  265. ADDPD XMM2, XMM0 ;
  266. MOVAPD XMM5, [RBX] ;
  267. MOVAPD XMM3, [RCX] ;
  268. ADD RCX, 16 ;
  269. ADD RBX, 16 ;
  270. MULPD XMM1, XMM6 ;
  271. ADDPD XMM2, XMM1 ;
  272. MOVAPD XMM7, [RBX] ;
  273. MOVAPD XMM0, [RCX] ;
  274. ADD RCX, 16 ;
  275. ADD RBX, 16 ;
  276. MULPD XMM3, XMM5 ;
  277. ADDPD XMM2, XMM3 ;
  278. MULPD XMM0, XMM7 ;
  279. ADDPD XMM2, XMM0 ;
  280. SUB RDX, 8 ;
  281. JMP kLoop8 ;
  282. kLoop2: ;
  283. CMP RDX, 0 ;
  284. JLE horizontalAdd ;
  285. MOVAPD XMM7, [RBX] ;
  286. MOVAPD XMM0, [RCX] ;
  287. ADD RCX, 16 ;
  288. ADD RBX, 16 ;
  289. MULPD XMM0, XMM7 ;
  290. ADDPD XMM2, XMM0 ;
  291. SUB RDX, 2
  292. JMP kLoop2 ;
  293. horizontalAdd:
  294. MOV RDI, [RSP+adrC] ;
  295. MOVAPD XMM1, XMM2 ;
  296. SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
  297. ADDPD XMM2, XMM1 ;
  298. ADDSD XMM2, [RDI] ;
  299. MOVSD [RDI], XMM2 ;
  300. endL:
  301. ADD RSP, 32 ;
  302. END L1Block1XSSE;
  303. PROCEDURE -L1Block5XSSE( adrA, adrB, adrC: ADDRESS; IncC, K: SIZE );
  304. (*
  305. matrixA and matrix B are stored in special format !
  306. K > 0 is guaranteed
  307. *)
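(* Reading of the kernel below: the 5-column variant for an L1 block of L1BlockN = 5 columns.
   A is packed as in L1Block1XSSE; B is packed with the five columns interleaved in groups of two
   LONGREALs, so each 16-byte load of A is combined with five 16-byte loads of B. The five dot
   products are accumulated in XMM2..XMM6, each is folded horizontally, and the results are added
   to five entries of C that lie IncC bytes apart. *)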
  308. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  309. MOV RBX, [RSP+adrA] ; RBX IS POINTER TO data OF matrix A
  310. MOV RCX, [RSP+adrB] ; RCX IS POINTER TO data OF matrix B
  311. MOV RDX, [RSP+K] ; RDX IS counter
  312. XORPD XMM2, XMM2 ;
  313. XORPD XMM3, XMM3 ;
  314. XORPD XMM4, XMM4 ;
  315. XORPD XMM5, XMM5 ;
  316. XORPD XMM6, XMM6 ;
  317. kLoop8: ;
  318. CMP RDX, 8 ;
  319. JL kLoop2
  320. ; (*-- 0 -- *) ;
  321. MOVAPD XMM7, [RBX] ; get 4 elements OF A
  322. ADD RBX, 16 ;
  323. MOVAPD XMM0, [RCX] ; get 4 elements OF B
  324. ADD RCX, 16 ;
  325. MOVAPD XMM1, [RCX] ; get 4 elements OF B
  326. ADD RCX, 16 ;
  327. MULPD XMM0, XMM7 ;
  328. ADDPD XMM2, XMM0 ;
  329. MOVAPD XMM0, [RCX] ;
  330. ADD RCX, 16 ;
  331. MULPD XMM1, XMM7 ;
  332. ADDPD XMM3, XMM1 ;
  333. MOVAPD XMM1, [RCX] ;
  334. ADD RCX, 16 ;
  335. MULPD XMM0, XMM7 ;
  336. ADDPD XMM4, XMM0 ;
  337. MOVAPD XMM0, [RCX] ;
  338. ADD RCX, 16 ;
  339. MULPD XMM1, XMM7 ;
  340. ADDPD XMM5, XMM1 ;
  341. MOVAPD XMM1, [RCX] ;
  342. ADD RCX, 16 ;
  343. MULPD XMM0, XMM7 ;
  344. ADDPD XMM6, XMM0
  345. ; (*-- 2 -- *) ;
  346. MOVAPD XMM7, [RBX] ;
  347. ADD RBX, 16 ;
  348. MOVAPD XMM0, [RCX] ;
  349. ADD RCX, 16 ;
  350. MULPD XMM1, XMM7 ;
  351. ADDPD XMM2, XMM1 ;
  352. MOVAPD XMM1, [RCX] ;
  353. ADD RCX, 16 ;
  354. MULPD XMM0, XMM7 ;
  355. ADDPD XMM3, XMM0 ;
  356. MOVAPD XMM0, [RCX] ;
  357. ADD RCX, 16 ;
  358. MULPD XMM1, XMM7 ;
  359. ADDPD XMM4, XMM1 ;
  360. MOVAPD XMM1, [RCX] ;
  361. ADD RCX, 16 ;
  362. MULPD XMM0, XMM7 ;
  363. ADDPD XMM5, XMM0 ;
  364. MOVAPD XMM0, [RCX] ;
  365. ADD RCX, 16 ;
  366. MULPD XMM1, XMM7 ;
  367. ADDPD XMM6, XMM1
  368. ; (*-- 4 -- *) ;
  369. MOVAPD XMM7, [RBX] ;
  370. ADD RBX, 16 ;
  371. MOVAPD XMM1, [RCX] ;
  372. ADD RCX, 16 ;
  373. MULPD XMM0, XMM7 ;
  374. ADDPD XMM2, XMM0 ;
  375. MOVAPD XMM0, [RCX] ;
  376. ADD RCX, 16 ;
  377. MULPD XMM1, XMM7 ;
  378. ADDPD XMM3, XMM1 ;
  379. MOVAPD XMM1, [RCX] ;
  380. ADD RCX, 16 ;
  381. MULPD XMM0, XMM7 ;
  382. ADDPD XMM4, XMM0 ;
  383. MOVAPD XMM0, [RCX] ;
  384. ADD RCX, 16 ;
  385. MULPD XMM1, XMM7 ;
  386. ADDPD XMM5, XMM1 ;
  387. MOVAPD XMM1, [RCX] ;
  388. ADD RCX, 16 ;
  389. MULPD XMM0, XMM7 ;
  390. ADDPD XMM6, XMM0
  391. ; (*-- 6 -- *) ;
  392. MOVAPD XMM7, [RBX] ;
  393. ADD RBX, 16 ;
  394. MOVAPD XMM0, [RCX] ;
  395. ADD RCX, 16 ;
  396. MULPD XMM1, XMM7 ;
  397. ADDPD XMM2, XMM1 ;
  398. MOVAPD XMM1, [RCX] ;
  399. ADD RCX, 16 ;
  400. MULPD XMM0, XMM7 ;
  401. ADDPD XMM3, XMM0 ;
  402. MOVAPD XMM0, [RCX] ;
  403. ADD RCX, 16 ;
  404. MULPD XMM1, XMM7 ;
  405. ADDPD XMM4, XMM1 ;
  406. MOVAPD XMM1, [RCX] ;
  407. ADD RCX, 16 ;
  408. MULPD XMM0, XMM7 ;
  409. ADDPD XMM5, XMM0 ;
  410. MULPD XMM1, XMM7 ;
  411. ADDPD XMM6, XMM1 ;
  412. SUB RDX, 8
  413. JMP kLoop8 ;
  414. kLoop2: ;
  415. CMP RDX, 0 ;
  416. JLE horizontalAdd ;
  417. MOVAPD XMM7, [RBX] ;
  418. ADD RBX, 16 ;
  419. MOVAPD XMM0, [RCX] ;
  420. ADD RCX, 16 ;
  421. MOVAPD XMM1, [RCX] ;
  422. ADD RCX, 16 ;
  423. MULPD XMM0, XMM7 ;
  424. ADDPD XMM2, XMM0 ;
  425. MOVAPD XMM0, [RCX] ;
  426. ADD RCX, 16 ;
  427. MULPD XMM1, XMM7 ;
  428. ADDPD XMM3, XMM1 ;
  429. MOVAPD XMM1, [RCX] ;
  430. ADD RCX, 16 ;
  431. MULPD XMM0, XMM7 ;
  432. ADDPD XMM4, XMM0 ;
  433. MOVAPD XMM0, [RCX] ;
  434. ADD RCX, 16 ;
  435. MULPD XMM1, XMM7 ;
  436. ADDPD XMM5, XMM1 ;
  437. MULPD XMM0, XMM7 ;
  438. ADDPD XMM6, XMM0 ;
  439. SUB RDX, 2
  440. JMP kLoop2 ;
  441. horizontalAdd: ; add and store
  442. MOV RDI, [RSP+adrC] ;
  443. MOV RAX, [RSP+IncC] ;
  444. MOVAPD XMM1, XMM2 ;
  445. SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
  446. ADDPD XMM2, XMM1 ;
  447. ADDSD XMM2, [RDI] ;
  448. MOVSD [RDI], XMM2 ;
  449. ADD RDI, RAX ;
  450. MOVAPD XMM1, XMM3 ;
  451. SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
  452. ADDPD XMM3, XMM1 ;
  453. ADDSD XMM3, [RDI] ;
  454. MOVSD [RDI], XMM3 ;
  455. ADD RDI, RAX ;
  456. MOVAPD XMM1, XMM4 ;
  457. SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
  458. ADDPD XMM4, XMM1 ;
  459. ADDSD XMM4, [RDI] ;
  460. MOVSD [RDI], XMM4 ;
  461. ADD RDI, RAX ;
  462. MOVAPD XMM1, XMM5 ;
  463. SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
  464. ADDPD XMM5, XMM1 ;
  465. ADDSD XMM5, [RDI] ;
  466. MOVSD [RDI], XMM5 ;
  467. ADD RDI, RAX ;
  468. MOVAPD XMM1, XMM6 ;
  469. SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
  470. ADDPD XMM6, XMM1 ;
  471. ADDSD XMM6, [RDI] ;
  472. MOVSD [RDI], XMM6 ;
  473. endL:
  474. ADD RSP, 40 ;
  475. END L1Block5XSSE;
  476. PROCEDURE -L1Block1RA( adrA, adrB, adrC: ADDRESS; K: SIZE );
  477. CODE {SYSTEM.AMD64, SYSTEM.FPU}
  478. MOV RAX, [RSP+K] ; RAX IS counter
  479. MOV RDX, [RSP+adrC]
  480. MOV RCX, [RSP+adrB] ; RCX IS POINTER TO data OF matrix B
  481. MOV RBX, [RSP+adrA] ; RBX IS POINTER TO data OF matrix A
  482. FLD DWORD [RDX] ; S.GET(dadr, x)
  483. loop16:
  484. CMP RAX, 16
  485. JL loop1
  486. FLD DWORD[RBX] ; S.GET(ladr, x)
  487. ADD RBX, 4 ; INC(ladr, incl)
  488. FLD DWORD[RCX] ; S.GET(ladr, y)
  489. ADD RCX, 4 ; INC(radr, incr)
  490. FMULP ; x := x*y
  491. FADDP ; z := z+x
  492. FLD DWORD[RBX] ; S.GET(ladr, x)
  493. ADD RBX, 4 ; INC(ladr, incl)
  494. FLD DWORD[RCX] ; S.GET(ladr, y)
  495. ADD RCX, 4 ; INC(radr, incr)
  496. FMULP ; x := x*y
  497. FADDP ; z := z+x
  498. FLD DWORD[RBX] ; S.GET(ladr, x)
  499. ADD RBX, 4 ; INC(ladr, incl)
  500. FLD DWORD[RCX] ; S.GET(ladr, y)
  501. ADD RCX, 4 ; INC(radr, incr)
  502. FMULP ; x := x*y
  503. FADDP ; z := z+x
  504. FLD DWORD[RBX] ; S.GET(ladr, x)
  505. ADD RBX, 4 ; INC(ladr, incl)
  506. FLD DWORD[RCX] ; S.GET(ladr, y)
  507. ADD RCX, 4 ; INC(radr, incr)
  508. FMULP ; x := x*y
  509. FADDP ; z := z+x
  510. FLD DWORD[RBX] ; S.GET(ladr, x)
  511. ADD RBX, 4 ; INC(ladr, incl)
  512. FLD DWORD[RCX] ; S.GET(ladr, y)
  513. ADD RCX, 4 ; INC(radr, incr)
  514. FMULP ; x := x*y
  515. FADDP ; z := z+x
  516. FLD DWORD[RBX] ; S.GET(ladr, x)
  517. ADD RBX, 4 ; INC(ladr, incl)
  518. FLD DWORD[RCX] ; S.GET(ladr, y)
  519. ADD RCX, 4 ; INC(radr, incr)
  520. FMULP ; x := x*y
  521. FADDP ; z := z+x
  522. FLD DWORD[RBX] ; S.GET(ladr, x)
  523. ADD RBX, 4 ; INC(ladr, incl)
  524. FLD DWORD[RCX] ; S.GET(ladr, y)
  525. ADD RCX, 4 ; INC(radr, incr)
  526. FMULP ; x := x*y
  527. FADDP ; z := z+x
  528. FLD DWORD[RBX] ; S.GET(ladr, x)
  529. ADD RBX, 4 ; INC(ladr, incl)
  530. FLD DWORD[RCX] ; S.GET(ladr, y)
  531. ADD RCX, 4 ; INC(radr, incr)
  532. FMULP ; x := x*y
  533. FADDP ; z := z+x
  534. FLD DWORD[RBX] ; S.GET(ladr, x)
  535. ADD RBX, 4 ; INC(ladr, incl)
  536. FLD DWORD[RCX] ; S.GET(ladr, y)
  537. ADD RCX, 4 ; INC(radr, incr)
  538. FMULP ; x := x*y
  539. FADDP ; z := z+x
  540. FLD DWORD[RBX] ; S.GET(ladr, x)
  541. ADD RBX, 4 ; INC(ladr, incl)
  542. FLD DWORD[RCX] ; S.GET(ladr, y)
  543. ADD RCX, 4 ; INC(radr, incr)
  544. FMULP ; x := x*y
  545. FADDP ; z := z+x
  546. FLD DWORD[RBX] ; S.GET(ladr, x)
  547. ADD RBX, 4 ; INC(ladr, incl)
  548. FLD DWORD[RCX] ; S.GET(ladr, y)
  549. ADD RCX, 4 ; INC(radr, incr)
  550. FMULP ; x := x*y
  551. FADDP ; z := z+x
  552. FLD DWORD[RBX] ; S.GET(ladr, x)
  553. ADD RBX, 4 ; INC(ladr, incl)
  554. FLD DWORD[RCX] ; S.GET(ladr, y)
  555. ADD RCX, 4 ; INC(radr, incr)
  556. FMULP ; x := x*y
  557. FADDP ; z := z+x
  558. FLD DWORD[RBX] ; S.GET(ladr, x)
  559. ADD RBX, 4 ; INC(ladr, incl)
  560. FLD DWORD[RCX] ; S.GET(ladr, y)
  561. ADD RCX, 4 ; INC(radr, incr)
  562. FMULP ; x := x*y
  563. FADDP ; z := z+x
  564. FLD DWORD[RBX] ; S.GET(ladr, x)
  565. ADD RBX, 4 ; INC(ladr, incl)
  566. FLD DWORD[RCX] ; S.GET(ladr, y)
  567. ADD RCX, 4 ; INC(radr, incr)
  568. FMULP ; x := x*y
  569. FADDP ; z := z+x
  570. FLD DWORD[RBX] ; S.GET(ladr, x)
  571. ADD RBX, 4 ; INC(ladr, incl)
  572. FLD DWORD[RCX] ; S.GET(ladr, y)
  573. ADD RCX, 4 ; INC(radr, incr)
  574. FMULP ; x := x*y
  575. FADDP ; z := z+x
  576. FLD DWORD[RBX] ; S.GET(ladr, x)
  577. ADD RBX, 4 ; INC(ladr, incl)
  578. FLD DWORD[RCX] ; S.GET(ladr, y)
  579. ADD RCX, 4 ; INC(radr, incr)
  580. FMULP ; x := x*y
  581. FADDP ; z := z+x
  582. SUB RAX, 16 ; DEC(len)
  583. JMP loop16 ;
  584. loop1:
  585. CMP RAX, 0 ; WHILE len > 0 DO
  586. JLE endL
  587. FLD DWORD[RBX] ; S.GET(ladr, x)
  588. ADD RBX, 4 ; INC(ladr, incl)
  589. FLD DWORD[RCX] ; S.GET(ladr, y)
  590. ADD RCX, 4 ; INC(radr, incr)
  591. FMULP ; x := x*y
  592. FADDP ; z := z+x
  593. DEC RAX ; DEC(len)
  594. JMP loop1 ;
  595. endL:
  596. FSTP DWORD[RDX] ; S.PUT(dadr, x)
  597. FWAIT ;
  598. ADD RSP, 32 ;
  599. END L1Block1RA;
  600. PROCEDURE -L1Block1RSSE( adrA, adrB, adrC: ADDRESS; K: SIZE );
  601. (*
  602. matrixA, matrixB must be stored in special format
  603. K>0 guaranteed
  604. *)
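(* Reading of the kernel below: same scheme as L1Block1XSSE, but with four REALs per 16-byte load,
   so K is consumed four at a time (unrolled by sixteen). The horizontal sum of the four partial
   sums in XMM2 is formed by the MOVLHPS/SHUFPS/MOVHLPS sequence at horizontalAdd; the 5-column
   REAL kernel L1Block5RSSE below applies the same reduction to each of its five accumulators. *)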
  605. CODE {SYSTEM.AMD64, SYSTEM.SSE}
  606. MOV RBX, [RSP+adrA] ; RBX IS POINTER TO data OF matrix A
  607. MOV RCX, [RSP+adrB] ; RCX IS POINTER TO data OF matrix B
  608. MOV RDX, [RSP+K] ; RDX IS counter
  609. XORPS XMM2, XMM2 ;
  610. kLoop16: ;
  611. CMP RDX, 16 ;
  612. JL kLoop4 ;
  613. MOVAPS XMM7, [RBX] ;
  614. MOVAPS XMM0, [RCX] ;
  615. ADD RCX, 16 ;
  616. ADD RBX, 16 ;
  617. MOVAPS XMM6, [RBX] ;
  618. MOVAPS XMM1, [RCX] ;
  619. ADD RCX, 16 ;
  620. ADD RBX, 16 ;
  621. MULPS XMM0, XMM7 ;
  622. ADDPS XMM2, XMM0 ;
  623. MOVAPS XMM5, [RBX] ;
  624. MOVAPS XMM3, [RCX] ;
  625. ADD RCX, 16 ;
  626. ADD RBX, 16 ;
  627. MULPS XMM1, XMM6 ;
  628. ADDPS XMM2, XMM1 ;
  629. MOVAPS XMM7, [RBX] ;
  630. MOVAPS XMM0, [RCX] ;
  631. ADD RCX, 16 ;
  632. ADD RBX, 16 ;
  633. MULPS XMM3, XMM5 ;
  634. ADDPS XMM2, XMM3 ;
  635. MULPS XMM0, XMM7 ;
  636. ADDPS XMM2, XMM0 ;
  637. SUB RDX, 16 ;
  638. JMP kLoop16 ;
  639. kLoop4: ;
  640. CMP RDX, 0 ;
  641. JLE horizontalAdd ;
  642. MOVAPS XMM7, [RBX] ;
  643. MOVAPS XMM0, [RCX] ;
  644. ADD RCX, 16 ;
  645. ADD RBX, 16 ;
  646. MULPS XMM0, XMM7 ;
  647. ADDPS XMM2, XMM0 ;
  648. SUB RDX, 4
  649. JMP kLoop4 ;
  650. horizontalAdd:
  651. MOV RDI, [RSP+adrC] ;
  652. MOVLHPS XMM1, XMM2 ;
  653. ADDPS XMM1, XMM2 ;
  654. SHUFPS XMM2, XMM1, 48 ;
  655. ADDPS XMM2, XMM1 ;
  656. MOVHLPS XMM2, XMM2 ;
  657. ADDSS XMM2, [RDI] ;
  658. MOVSS [RDI], XMM2 ;
  659. endL:
  660. ADD RSP, 32 ;
  661. END L1Block1RSSE;
  662. PROCEDURE -L1Block5RSSE( adrA, adrB, adrC: ADDRESS; IncC, K: SIZE );
  663. (*
  664. matrixA and matrix B are stored in special format !
  665. K > 0 is guaranteed
  666. *)
  667. CODE {SYSTEM.AMD64, SYSTEM.SSE}
  668. MOV RBX, [RSP+adrA] ; RBX IS POINTER TO data OF matrix A
  669. MOV RCX, [RSP+adrB] ; RCX IS POINTER TO data OF matrix B
  670. MOV RDX, [RSP+K] ; RDX IS counter
  671. XORPS XMM2, XMM2 ;
  672. XORPS XMM3, XMM3 ;
  673. XORPS XMM4, XMM4 ;
  674. XORPS XMM5, XMM5 ;
  675. XORPS XMM6, XMM6 ;
  676. kLoop16: ;
  677. CMP RDX, 16 ;
  678. JL kLoop4 ; (*-- 0 -- *)
  679. MOVAPS XMM7, [RBX] ; get 4 elements OF A
  680. ADD RBX, 16 ;
  681. MOVAPS XMM0, [RCX] ; get 4 elements OF B
  682. ADD RCX, 16 ;
  683. MOVAPS XMM1, [RCX] ; get 4 elements OF B
  684. ADD RCX, 16 ;
  685. MULPS XMM0, XMM7 ;
  686. ADDPS XMM2, XMM0 ;
  687. MOVAPS XMM0, [RCX] ;
  688. ADD RCX, 16 ;
  689. MULPS XMM1, XMM7 ;
  690. ADDPS XMM3, XMM1 ;
  691. MOVAPS XMM1, [RCX] ;
  692. ADD RCX, 16 ;
  693. MULPS XMM0, XMM7 ;
  694. ADDPS XMM4, XMM0 ;
  695. MOVAPS XMM0, [RCX] ;
  696. ADD RCX, 16 ;
  697. MULPS XMM1, XMM7 ;
  698. ADDPS XMM5, XMM1 ;
  699. MOVAPS XMM1, [RCX] ;
  700. ADD RCX, 16 ;
  701. MULPS XMM0, XMM7 ;
  702. ADDPS XMM6, XMM0
  703. ; (*-- 4 -- *) ;
  704. MOVAPS XMM7, [RBX] ;
  705. ADD RBX, 16 ;
  706. MOVAPS XMM0, [RCX] ;
  707. ADD RCX, 16 ;
  708. MULPS XMM1, XMM7 ;
  709. ADDPS XMM2, XMM1 ;
  710. MOVAPS XMM1, [RCX] ;
  711. ADD RCX, 16 ;
  712. MULPS XMM0, XMM7 ;
  713. ADDPS XMM3, XMM0 ;
  714. MOVAPS XMM0, [RCX] ;
  715. ADD RCX, 16 ;
  716. MULPS XMM1, XMM7 ;
  717. ADDPS XMM4, XMM1 ;
  718. MOVAPS XMM1, [RCX] ;
  719. ADD RCX, 16 ;
  720. MULPS XMM0, XMM7 ;
  721. ADDPS XMM5, XMM0 ;
  722. MOVAPS XMM0, [RCX] ;
  723. ADD RCX, 16 ;
  724. MULPS XMM1, XMM7 ;
  725. ADDPS XMM6, XMM1
  726. ; (*-- 8 -- *) ;
  727. MOVAPS XMM7, [RBX] ;
  728. ADD RBX, 16 ;
  729. MOVAPS XMM1, [RCX] ;
  730. ADD RCX, 16 ;
  731. MULPS XMM0, XMM7 ;
  732. ADDPS XMM2, XMM0 ;
  733. MOVAPS XMM0, [RCX] ;
  734. ADD RCX, 16 ;
  735. MULPS XMM1, XMM7 ;
  736. ADDPS XMM3, XMM1 ;
  737. MOVAPS XMM1, [RCX] ;
  738. ADD RCX, 16 ;
  739. MULPS XMM0, XMM7 ;
  740. ADDPS XMM4, XMM0 ;
  741. MOVAPS XMM0, [RCX] ;
  742. ADD RCX, 16 ;
  743. MULPS XMM1, XMM7 ;
  744. ADDPS XMM5, XMM1 ;
  745. MOVAPS XMM1, [RCX] ;
  746. ADD RCX, 16 ;
  747. MULPS XMM0, XMM7 ;
  748. ADDPS XMM6, XMM0
  749. ; (*-- 12 -- *) ;
  750. MOVAPS XMM7, [RBX] ;
  751. ADD RBX, 16 ;
  752. MOVAPS XMM0, [RCX] ;
  753. ADD RCX, 16 ;
  754. MULPS XMM1, XMM7 ;
  755. ADDPS XMM2, XMM1 ;
  756. MOVAPS XMM1, [RCX] ;
  757. ADD RCX, 16 ;
  758. MULPS XMM0, XMM7 ;
  759. ADDPS XMM3, XMM0 ;
  760. MOVAPS XMM0, [RCX] ;
  761. ADD RCX, 16 ;
  762. MULPS XMM1, XMM7 ;
  763. ADDPS XMM4, XMM1 ;
  764. MOVAPS XMM1, [RCX] ;
  765. ADD RCX, 16 ;
  766. MULPS XMM0, XMM7 ;
  767. ADDPS XMM5, XMM0 ;
  768. MULPS XMM1, XMM7 ;
  769. ADDPS XMM6, XMM1 ;
  770. SUB RDX, 16
  771. JMP kLoop16 ;
  772. kLoop4: ;
  773. CMP RDX, 0 ;
  774. JLE horizontalAdd ;
  775. MOVAPS XMM7, [RBX] ;
  776. ADD RBX, 16 ;
  777. MOVAPS XMM0, [RCX] ;
  778. ADD RCX, 16 ;
  779. MOVAPS XMM1, [RCX] ;
  780. ADD RCX, 16 ;
  781. MULPS XMM0, XMM7 ;
  782. ADDPS XMM2, XMM0 ;
  783. MOVAPS XMM0, [RCX] ;
  784. ADD RCX, 16 ;
  785. MULPS XMM1, XMM7 ;
  786. ADDPS XMM3, XMM1 ;
  787. MOVAPS XMM1, [RCX] ;
  788. ADD RCX, 16 ;
  789. MULPS XMM0, XMM7 ;
  790. ADDPS XMM4, XMM0 ;
  791. MOVAPS XMM0, [RCX] ;
  792. ADD RCX, 16 ;
  793. MULPS XMM1, XMM7 ;
  794. ADDPS XMM5, XMM1 ;
  795. MULPS XMM0, XMM7 ;
  796. ADDPS XMM6, XMM0 ;
  797. SUB RDX, 4
  798. JMP kLoop4 ;
  799. horizontalAdd: ; add and store
  800. MOV RDI, [RSP+adrC] ;
  801. MOV RAX, [RSP+IncC] ;
  802. MOVLHPS XMM1, XMM2 ;
  803. ADDPS XMM1, XMM2 ;
  804. SHUFPS XMM2, XMM1, 48 ;
  805. ADDPS XMM2, XMM1 ;
  806. MOVHLPS XMM2, XMM2 ;
  807. ADDSS XMM2, [RDI] ;
  808. MOVSS [RDI], XMM2 ;
  809. ADD RDI, RAX ;
  810. MOVLHPS XMM1, XMM3 ;
  811. ADDPS XMM1, XMM3 ;
  812. SHUFPS XMM3, XMM1, 48 ;
  813. ADDPS XMM3, XMM1 ;
  814. MOVHLPS XMM3, XMM3 ;
  815. ADDSS XMM3, [RDI] ;
  816. MOVSS [RDI], XMM3 ;
  817. ADD RDI, RAX ;
  818. MOVLHPS XMM1, XMM4 ;
  819. ADDPS XMM1, XMM4 ;
  820. SHUFPS XMM4, XMM1, 48 ;
  821. ADDPS XMM4, XMM1 ;
  822. MOVHLPS XMM4, XMM4 ;
  823. ADDSS XMM4, [RDI] ;
  824. MOVSS [RDI], XMM4 ;
  825. ADD RDI, RAX ;
  826. MOVLHPS XMM1, XMM5 ;
  827. ADDPS XMM1, XMM5 ;
  828. SHUFPS XMM5, XMM1, 48 ;
  829. ADDPS XMM5, XMM1 ;
  830. MOVHLPS XMM5, XMM5 ;
  831. ADDSS XMM5, [RDI] ;
  832. MOVSS [RDI], XMM5 ;
  833. ADD RDI, RAX ;
  834. MOVLHPS XMM1, XMM6 ;
  835. ADDPS XMM1, XMM6 ;
  836. SHUFPS XMM6, XMM1, 48 ;
  837. ADDPS XMM6, XMM1 ;
  838. MOVHLPS XMM6, XMM6 ;
  839. ADDSS XMM6, [RDI] ;
  840. MOVSS [RDI], XMM6 ;
  841. endL:
  842. ADD RSP, 40 ;
  843. END L1Block5RSSE;
  844. PROCEDURE -Align4( adr: ADDRESS ): ADDRESS;
  845. CODE {SYSTEM.AMD64}
  846. MOV RAX, [RSP+adr] ;
  847. NEG RAX ;
  848. AND RAX, 3H ;
  849. ADD RAX, [RSP+adr] ;
  850. ADD RSP, 8
  851. END Align4;
  852. PROCEDURE -Align2( adr: ADDRESS ): ADDRESS;
  853. CODE {SYSTEM.AMD64}
  854. MOV RAX, [RSP+adr] ;
  855. NEG RAX ;
  856. AND RAX, 1H ;
  857. ADD RAX, [RSP+adr] ;
  858. ADD RSP, 8
  859. END Align2;
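(* Note: Align4 and Align2 round their argument up to the next multiple of 4 resp. 2; the result is
   adr + ((-adr) MOD 4) resp. adr + ((-adr) MOD 2), e.g. Align4 maps 13 to 16 and leaves multiples
   of 4 unchanged. They are presumably used to align counts or addresses for the unrolled kernels. *)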
  860. PROCEDURE -ZeroR( adr: ADDRESS; count: SIZE );
  861. (** For 32 bit types *)
  862. CODE {SYSTEM.AMD64}
  863. MOV RDI, [RSP+adr] ; address OF dest index
  864. MOV RCX, [RSP+count] ; counter
  865. MOV RAX, 0 ; value
  866. CLD ; incremental
  867. REP ;
  868. STOSD ;
  869. ADD RSP, 16 ;
  870. END ZeroR;
  871. PROCEDURE -ZeroX( adr: ADDRESS; count: SIZE );
  872. (** For 64 bit types *)
  873. CODE {SYSTEM.AMD64}
  874. MOV RDI, [RSP+adr] ; address OF dest index
  875. MOV RCX, [RSP+count] ; counter
  876. SHL RCX, 1 ;
  877. MOV RAX, 0 ; value
  878. CLD ; incremental
  879. REP ;
  880. STOSD ;
  881. ADD RSP, 16 ;
  882. END ZeroX;
  883. PROCEDURE -ZeroRI( adr: ADDRESS; inc, count: SIZE );
  884. (** For 32 bit types *)
  885. CODE {SYSTEM.AMD64}
  886. MOV RDI, [RSP+adr] ; address OF dest index
  887. MOV RBX, [RSP+inc] ;
  888. MOV RCX, [RSP+count] ; counter
  889. CMP RBX, 4 ;
  890. JE fastzero ;
  891. MOV RAX, 0 ;
  892. loopL:
  893. CMP RCX, 0 ;
  894. JLE endL ;
  895. MOV [RDI], RAX ;
  896. ADD RDI, RBX ;
  897. DEC RCX ;
  898. JMP loopL ;
  899. fastzero:
  900. MOV RAX, 0 ; value
  901. CLD ; incremental
  902. REP ;
  903. STOSD ;
  904. endL:
  905. ADD RSP, 24 ;
  906. END ZeroRI;
  907. PROCEDURE -ZeroXI( adr: ADDRESS; inc, count: SIZE );
  908. (** For 64 bit types *)
  909. CODE {SYSTEM.AMD64}
  910. MOV RDI, [RSP+adr] ; address OF dest index
  911. MOV RBX, [RSP+inc] ;
  912. MOV RCX, [RSP+count] ; counter
  913. MOV RAX, 0 ;
  914. CMP RBX, 8 ;
  915. JE fastzero ;
  916. loopL:
  917. CMP RCX, 0 ;
  918. JLE endL ;
  919. MOV [RDI], RAX ;
  920. MOV [RDI+4], RAX ;
  921. ADD RDI, RBX ;
  922. DEC RCX ;
  923. JMP loopL ;
  924. fastzero:
  925. SHL RCX, 1 ;
  926. CLD ; incremental
  927. REP ;
  928. STOSD ;
  929. endL:
  930. ADD RSP, 24 ;
  931. END ZeroXI;
  932. PROCEDURE -MovR( from, to0: ADDRESS; frominc, count: SIZE );
  933. CODE {SYSTEM.AMD64}
  934. MOV RDI, [RSP+to0] ; TO
  935. MOV RSI, [RSP+from] ; from
  936. MOV RCX, [RSP+count] ; count
  937. MOV RBX, [RSP+frominc] ; inc
  938. CMP RBX, 4 ;
  939. JE fastmove ;
  940. loopL:
  941. CMP RCX, 0 ;
  942. JLE endL ;
  943. MOV RAX, [RSI] ;
  944. MOV [RDI], RAX ;
  945. ADD RSI, RBX ;
  946. ADD RDI, 4 ;
  947. DEC RCX ;
  948. JMP loopL ;
  949. fastmove:
  950. CLD ; incremental
  951. REP ;
  952. MOVSD ; move data IN 4 byte (DWORD) steps
  953. endL:
  954. ADD RSP, 32 ;
  955. END MovR;
  956. PROCEDURE -MovX( from, to0: ADDRESS; frominc, count:SIZE );
  957. CODE {SYSTEM.AMD64}
  958. MOV RDI, [RSP+to0] ; TO
  959. MOV RSI, [RSP+from] ; from
  960. MOV RCX, [RSP+count] ; count
  961. MOV RBX, [RSP+frominc] ; inc
  962. CMP RBX, 8 ;
  963. JE fastmove ;
  964. loopL:
  965. CMP RCX, 0 ;
  966. JLE endL ;
  967. MOV RAX, [RSI] ;
  968. MOV [RDI], RAX ;
  969. MOV RAX, [RSI+4] ;
  970. MOV [RDI+4], RAX ;
  971. ADD RSI, RBX ;
  972. ADD RDI, 8 ;
  973. DEC RCX ;
  974. JMP loopL ;
  975. fastmove:
  976. SHL RCX, 1 ;
  977. CLD ; incremental
  978. REP ;
  979. MOVSD ; move data IN 4 byte (DWORD) steps
  980. endL:
  981. ADD RSP, 32 ;
  982. END MovX;
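(* Note: MovR and MovX copy "count" elements (REAL resp. LONGREAL) from a strided source to a
   contiguous destination. If the source increment equals the element size, the data are already
   contiguous and a single REP MOVSD transfer is used (with the doubleword count doubled for
   LONGREAL); otherwise a scalar load/store loop follows the increment. These are presumably the
   packing routines that produce the contiguous panels expected by the SSE kernels above. *)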
  983. PROCEDURE -MovR5( src: ADDRESS; inc, stride: SIZE; dest: ADDRESS; count: SIZE);
  984. CODE {SYSTEM.AMD64}
  985. MOV RSI, [RSP+src] ; src
  986. MOV RBX, [RSP+inc] ; inc
  987. MOV RCX, [RSP+stride] ; stride
  988. MOV RDI, [RSP+dest] ; dest
  989. loopL:
  990. MOV RAX, [RSP+count] ; count
  991. CMP RAX, 0 ;
  992. JLE endL ;
  993. SUB RAX, 4 ;
  994. MOV [RSP+count], RAX ;
  995. MOV RDX, RSI ;
  996. MOV RAX, [RDX] ;
  997. MOV [RDI], RAX ;
  998. ADD RDX, RBX ;
  999. MOV RAX, [RDX] ;
  1000. MOV [RDI+16], RAX ;
  1001. ADD RDX, RBX ;
  1002. MOV RAX, [RDX] ;
  1003. MOV [RDI+32], RAX ;
  1004. ADD RDX, RBX ;
  1005. MOV RAX, [RDX] ;
  1006. MOV [RDI+48], RAX ;
  1007. ADD RDX, RBX ;
  1008. MOV RAX, [RDX] ;
  1009. MOV [RDI+64], RAX ;
  1010. ADD RSI, RCX ;
  1011. ADD RDI, 4 ;
  1012. MOV RDX, RSI ;
  1013. MOV RAX, [RDX] ;
  1014. MOV [RDI], RAX ;
  1015. ADD RDX, RBX ;
  1016. MOV RAX, [RDX] ;
  1017. MOV [RDI+16], RAX ;
  1018. ADD RDX, RBX ;
  1019. MOV RAX, [RDX] ;
  1020. MOV [RDI+32], RAX ;
  1021. ADD RDX, RBX ;
  1022. MOV RAX, [RDX] ;
  1023. MOV [RDI+48], RAX ;
  1024. ADD RDX, RBX ;
  1025. MOV RAX, [RDX] ;
  1026. MOV [RDI+64], RAX ;
  1027. ADD RSI, RCX ;
  1028. ADD RDI, 4 ;
  1029. MOV RDX, RSI ;
  1030. MOV RAX, [RDX] ;
  1031. MOV [RDI], RAX ;
  1032. ADD RDX, RBX ;
  1033. MOV RAX, [RDX] ;
  1034. MOV [RDI+16], RAX ;
  1035. ADD RDX, RBX ;
  1036. MOV RAX, [RDX] ;
  1037. MOV [RDI+32], RAX ;
  1038. ADD RDX, RBX ;
  1039. MOV RAX, [RDX] ;
  1040. MOV [RDI+48], RAX ;
  1041. ADD RDX, RBX ;
  1042. MOV RAX, [RDX] ;
  1043. MOV [RDI+64], RAX ;
  1044. ADD RSI, RCX ;
  1045. ADD RDI, 4 ;
  1046. MOV RDX, RSI ;
  1047. MOV RAX, [RDX] ;
  1048. MOV [RDI], RAX ;
  1049. ADD RDX, RBX ;
  1050. MOV RAX, [RDX] ;
  1051. MOV [RDI+16], RAX ;
  1052. ADD RDX, RBX ;
  1053. MOV RAX, [RDX] ;
  1054. MOV [RDI+32], RAX ;
  1055. ADD RDX, RBX ;
  1056. MOV RAX, [RDX] ;
  1057. MOV [RDI+48], RAX ;
  1058. ADD RDX, RBX ;
  1059. MOV RAX, [RDX] ;
  1060. MOV [RDI+64], RAX ;
  1061. ADD RSI, RCX ;
  1062. ADD RDI, 4 ;
  1063. ADD RDI, 64 ;
  1064. JMP loopL ;
  1065. endL:
  1066. ADD RSP, 40 ;
  1067. END MovR5;
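(* Reading of MovR5 above (descriptive comment only): per iteration it reads, four times, a group of 5
values spaced `inc` bytes apart and stores them 16 bytes apart in the destination, advancing the source
by `stride` and the destination by 4 bytes per group; after 4 groups the destination skips a further
64 bytes, so the output consists of consecutive 80 byte tiles holding 5 x 4 REAL values. `count` is
decremented by 4 per iteration and there is no scalar tail, so it is presumably expected to be a
multiple of 4; the tiles apparently feed the blocked matrix-multiply kernels of this module. *)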
  1068. (* *)
  1069. PROCEDURE AddAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1070. CODE {SYSTEM.AMD64, SYSTEM.FPU}
  1071. MOV RAX, [RBP+len] ;
  1072. MOV RBX, [RBP+ladr] ;
  1073. MOV RCX, [RBP+radr] ;
  1074. MOV RDX, [RBP+dadr] ;
  1075. start:
  1076. CMP RAX, 0 ;
  1077. JLE endL ;
  1078. FLD QWORD [RBX] ;
  1079. ADD RBX, [RBP+linc] ;
  1080. FLD QWORD [RCX] ;
  1081. ADD RCX, [RBP+rinc] ;
  1082. FADDP ;
  1083. FSTP QWORD [RDX] ;
  1084. ADD RDX, [RBP+dinc] ;
  1085. DEC RAX ;
  1086. JMP start ;
  1087. endL:
  1088. FWAIT ;
  1089. END AddAXAXLoopA;
  1090. PROCEDURE AddARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1091. CODE {SYSTEM.AMD64, SYSTEM.FPU}
  1092. MOV RAX, [RBP+len] ;
  1093. MOV RBX, [RBP+ladr] ;
  1094. MOV RCX, [RBP+radr] ;
  1095. MOV RDX, [RBP+dadr] ;
  1096. start:
  1097. CMP RAX, 0 ;
  1098. JLE endL ;
  1099. FLD DWORD [RBX] ;
  1100. ADD RBX, [RBP+linc] ;
  1101. FLD DWORD [RCX] ;
  1102. ADD RCX, [RBP+rinc] ;
  1103. FADDP ;
  1104. FSTP DWORD [RDX] ;
  1105. ADD RDX, [RBP+dinc] ;
  1106. DEC RAX ;
  1107. JMP start ;
  1108. endL:
  1109. FWAIT ;
  1110. END AddARARLoopA;
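(* The two FPU loops above are the strided fallback for element-wise addition; a rough Oberon
equivalent of the REAL variant (sketch only, never called):
PROCEDURE AddARARSketch( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
VAR l, r, d: REAL;
BEGIN
	WHILE len > 0 DO
		SYSTEM.GET( ladr, l ); SYSTEM.GET( radr, r ); d := l + r; SYSTEM.PUT( dadr, d );
		INC( ladr, linc ); INC( radr, rinc ); INC( dadr, dinc ); DEC( len )
	END
END AddARARSketch;
*)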
  1111. PROCEDURE AddAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1112. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  1113. MOV RAX, [RBP+len] ;
  1114. CMP RAX, 0 ;
  1115. JLE endL ; nothing TO be done, RAX > 0 guaranteed from here on
  1116. MOV RBX, [RBP+ladr] ;
  1117. MOV RCX, [RBP+radr] ;
  1118. MOV RDX, [RBP+dadr] ;
  1119. ; check IF data are contiguous IN memory
1120. CMP [RBP+linc], 8 ; check left FOR continuity
1121. JNE single ; not continuous -> simplest method
1122. CMP [RBP+rinc], 8 ; check right FOR continuity
1123. JNE single ; not continuous -> simplest method
1124. CMP [RBP+dinc], 8 ; check destination FOR continuity
1125. JNE single ; not continuous -> simplest method
  1126. ; check FOR alignment
  1127. MOV RSI, RBX ;
  1128. AND RSI, 7 ; ladr MOD 8
  1129. CMP RSI, 0 ; = 0- > 64 Bit alignment
  1130. JNE unaligned ; not 64 bit aligned
  1131. MOV RSI, RCX ;
  1132. AND RSI, 7 ; radr MOD 8
  1133. CMP RSI, 0 ; = 0- > 64 Bit alignment
  1134. JNE unaligned ; not 64 bit aligned
  1135. MOV RSI, RDX ;
  1136. AND RSI, 7 ; dadr MOD 8
  1137. CMP RSI, 0 ; = 0- > 64 Bit alignment
  1138. JNE unaligned ; not 64 bit aligned
  1139. MOV RSI, RBX ;
  1140. AND RSI, 8 ; 16 byte alignment
  1141. MOV RDI, RCX ;
  1142. AND RDI, 8 ; 16 byte alignment
  1143. CMP RSI, RDI ;
  1144. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and radr
  1145. MOV RDI, RDX ;
  1146. AND RDI, 8 ; 16 byte alignment
  1147. CMP RSI, RDI ;
  1148. JNE unaligned ; different 16 byte = 128 bit alignment OF dadr and radr
  1149. CMP RSI, 8 ;
1150. JNE aligned ; ladr, radr and dadr already 128 bit aligned
1151. ; one single element processed TO achieve 128 bit alignment
  1152. MOVSD XMM1, [RBX] ;
  1153. MOVSD XMM0, [RCX] ;
  1154. ADDSD XMM0, XMM1 ;
  1155. MOVSD [RDX], XMM0 ;
  1156. ADD RBX, 8 ; now RBX IS 16 byte aligned
1157. ADD RCX, 8 ; now RCX IS 16 byte aligned ;
  1158. ADD RDX, 8 ; now RDX IS 16 byte aligned ;
  1159. DEC RAX ; one element has been processed
  1160. aligned:
  1161. aligned8:
  1162. CMP RAX, 8 ;
1163. JL aligned2 ; len < 8 -> EXIT TO aligned2
  1164. MOVAPD XMM0, [RBX] ;
  1165. MOVAPD XMM1, [RBX+16] ;
  1166. MOVAPD XMM2, [RBX+32] ;
  1167. MOVAPD XMM3, [RBX+48] ;
  1168. ADD RBX, 64 ;
  1169. MOVAPD XMM4, [RCX] ;
  1170. MOVAPD XMM5, [RCX+16] ;
  1171. MOVAPD XMM6, [RCX+32] ;
  1172. MOVAPD XMM7, [RCX+48] ;
  1173. ADD RCX, 64 ;
  1174. ADDPD XMM0, XMM4 ;
  1175. ADDPD XMM1, XMM5 ;
  1176. ADDPD XMM2, XMM6 ;
  1177. ADDPD XMM3, XMM7 ;
  1178. MOVAPD [RDX], XMM0 ;
  1179. MOVAPD [RDX+16], XMM1 ;
  1180. MOVAPD [RDX+32], XMM2 ;
  1181. MOVAPD [RDX+48], XMM3 ;
  1182. ADD RDX, 64 ;
  1183. SUB RAX, 8 ;
  1184. JMP aligned8 ;
  1185. ; LOOP FOR 2 pieces aligned
  1186. aligned2: ;
  1187. CMP RAX, 2 ;
  1188. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1189. MOVAPD XMM0, [RBX] ;
  1190. ADD RBX, 16 ;
  1191. MOVAPD XMM1, [RCX] ;
  1192. ADD RCX, 16 ;
  1193. ADDPD XMM0, XMM1 ;
  1194. MOVAPD [RDX], XMM0 ;
  1195. ADD RDX, 16 ;
  1196. SUB RAX, 2 ;
  1197. JMP aligned2 ;
  1198. ; LOOP FOR 8 unaligned pieces(14 pieces not better!)
  1199. unaligned: ;
  1200. unaligned8: ;
  1201. CMP RAX, 8 ;
1202. JL unaligned2 ; len < 8 -> EXIT TO unaligned2
  1203. MOVUPD XMM0, [RBX] ;
  1204. MOVUPD XMM1, [RBX+16] ;
  1205. MOVUPD XMM2, [RBX+32] ;
  1206. MOVUPD XMM3, [RBX+48] ;
  1207. ADD RBX, 64 ;
  1208. MOVUPD XMM4, [RCX] ;
  1209. MOVUPD XMM5, [RCX+16] ;
  1210. MOVUPD XMM6, [RCX+32] ;
  1211. MOVUPD XMM7, [RCX+48] ;
  1212. ADD RCX, 64 ;
  1213. ADDPD XMM0, XMM4 ;
  1214. ADDPD XMM1, XMM5 ;
  1215. ADDPD XMM2, XMM6 ;
  1216. ADDPD XMM3, XMM7 ;
  1217. MOVUPD [RDX], XMM0 ;
  1218. MOVUPD [RDX+16], XMM1 ;
  1219. MOVUPD [RDX+32], XMM2 ;
  1220. MOVUPD [RDX+48], XMM3 ;
  1221. ADD RDX, 64 ;
  1222. SUB RAX, 8 ;
  1223. JMP unaligned8 ;
1224. ; LOOP FOR 2 pieces unaligned
  1225. unaligned2: ;
  1226. CMP RAX, 2 ;
  1227. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1228. MOVUPD XMM0, [RBX] ;
  1229. ADD RBX, 16 ;
  1230. MOVUPD XMM1, [RCX] ;
  1231. ADD RCX, 16 ;
  1232. ADDPD XMM0, XMM1 ;
  1233. MOVUPD [RDX], XMM0 ;
  1234. ADD RDX, 16 ;
  1235. SUB RAX, 2 ;
  1236. JMP unaligned2 ;
  1237. ; one piece left OR non-contiguous data
  1238. single:
  1239. singlepieces: ;
  1240. CMP RAX, 0 ;
  1241. JLE endL ; len <= 0- > EXIT
  1242. MOVSD XMM0, [RBX]
  1243. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  1244. MOVSD XMM1, [RCX]
1245. ADD RCX, [RBP+rinc] ; INC(radr, incr)
  1246. ADDSD XMM0, XMM1 ;
  1247. MOVSD [RDX], XMM0
1248. ADD RDX, [RBP+dinc] ; INC(dadr, incd)
  1249. DEC RAX ; DEC(len)
  1250. JMP singlepieces ;
  1251. endL:
  1252. END AddAXAXLoopSSE;
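(* Dispatch pattern shared by this and the following SSE loops: 1) if any increment differs from the
element size, the data are not contiguous and the scalar `single` loop is used; 2) otherwise the three
addresses are checked for element alignment and for a common 16 byte phase, processing one leading
element when needed to reach 16 byte alignment; 3) aligned data are handled in blocks of 8 LONGREALs
(16 REALs in the single precision variants) with MOVAPD/MOVAPS, unaligned data with MOVUPD/MOVUPS,
both falling back to a 2 (respectively 4) element loop; 4) a scalar tail handles the remainder. *)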
  1253. PROCEDURE AddARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1254. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  1255. MOV RAX, [RBP+len] ;
  1256. CMP RAX, 0 ;
  1257. JLE endL ; nothing TO be done, RAX > 0 guaranteed from here on
  1258. MOV RBX, [RBP+ladr] ;
  1259. MOV RCX, [RBP+radr] ;
  1260. MOV RDX, [RBP+dadr] ;
  1261. ; check IF data are contiguous IN memory
1262. CMP [RBP+linc], 4 ; check left FOR continuity
1263. JNE single ; not continuous -> simplest method
1264. CMP [RBP+rinc], 4 ; check right FOR continuity
1265. JNE single ; not continuous -> simplest method
1266. CMP [RBP+dinc], 4 ; check destination FOR continuity
1267. JNE single ; not continuous -> simplest method
  1268. ; check FOR alignment
  1269. MOV RSI, RBX ;
  1270. AND RSI, 3 ; ladr MOD 4
  1271. CMP RSI, 0 ; = 0- > 32 Bit alignment
  1272. JNE unaligned ; not 32 bit aligned
  1273. MOV RSI, RCX ;
  1274. AND RSI, 3 ; radr MOD 4
  1275. CMP RSI, 0 ; = 0- > 32 Bit alignment
  1276. JNE unaligned ; not 32 bit aligned
  1277. MOV RSI, RDX ;
  1278. AND RSI, 3 ; dadr MOD 4
  1279. CMP RSI, 0 ; = 0- > 32 Bit alignment
  1280. JNE unaligned ; not 32 bit aligned
  1281. MOV RSI, RBX ;
  1282. AND RSI, 8+4 ; 16 byte alignment?
  1283. MOV RDI, RCX ;
  1284. AND RDI, 8+4 ; 16 byte alignment?
  1285. CMP RSI, RDI ;
  1286. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and radr
  1287. MOV RDI, RDX ;
  1288. AND RDI, 8+4 ; 16 byte alignment
  1289. CMP RSI, RDI ;
  1290. JNE unaligned ; different 16 byte = 128 bit alignment OF dadr and radr
  1291. CMP RSI, 0 ;
  1292. JE aligned ; already aligned
  1293. align:
  1294. ; one single element processing UNTIL 128 bt alignment achieved
  1295. MOVSS XMM1, [RBX] ;
  1296. MOVSS XMM0, [RCX] ;
  1297. ADDSS XMM0, XMM1 ;
  1298. MOVSS [RDX], XMM0 ;
  1299. ADD RBX, 4 ;
  1300. ADD RCX, 4 ;
  1301. ADD RDX, 4 ;
  1302. DEC RAX ; one element has been processed ;
  1303. CMP RAX, 0 ; all elements already processed?
  1304. JLE single ;
  1305. MOV RSI, RBX ;
  1306. AND RSI, 8+4 ;
  1307. CMP RSI, 0 ;
  1308. JNE align ;
  1309. aligned:
  1310. aligned16:
  1311. CMP RAX, 16 ;
1312. JL aligned4 ; len < 16 -> EXIT TO aligned4
  1313. MOVAPS XMM0, [RBX] ;
  1314. MOVAPS XMM1, [RBX+16] ;
  1315. MOVAPS XMM2, [RBX+32] ;
  1316. MOVAPS XMM3, [RBX+48] ;
  1317. ADD RBX, 64 ;
  1318. MOVAPS XMM4, [RCX] ;
  1319. MOVAPS XMM5, [RCX+16] ;
  1320. MOVAPS XMM6, [RCX+32] ;
  1321. MOVAPS XMM7, [RCX+48] ;
  1322. ADD RCX, 64 ;
  1323. ADDPS XMM0, XMM4 ;
  1324. ADDPS XMM1, XMM5 ;
  1325. ADDPS XMM2, XMM6 ;
  1326. ADDPS XMM3, XMM7 ;
  1327. MOVAPS [RDX], XMM0 ;
  1328. MOVAPS [RDX+16], XMM1 ;
  1329. MOVAPS [RDX+32], XMM2 ;
  1330. MOVAPS [RDX+48], XMM3 ;
  1331. ADD RDX, 64 ;
  1332. SUB RAX, 16 ;
  1333. JMP aligned16 ;
1334. ; LOOP FOR 4 pieces aligned
1335. aligned4: ;
1336. CMP RAX, 4 ;
1337. JL singlepieces ; len < 4 -> EXIT TO singlepieces
  1338. MOVAPS XMM0, [RBX] ;
  1339. ADD RBX, 16 ;
  1340. MOVAPS XMM1, [RCX] ;
  1341. ADD RCX, 16 ;
  1342. ADDPS XMM0, XMM1 ;
  1343. MOVAPS [RDX], XMM0 ;
  1344. ADD RDX, 16 ;
  1345. SUB RAX, 4 ;
  1346. JMP aligned4 ;
1347. ; LOOP FOR 16 unaligned pieces
  1348. unaligned: ;
  1349. unaligned16: ;
  1350. CMP RAX, 16 ;
1351. JL unaligned4 ; len < 16 -> EXIT TO unaligned4
  1352. MOVUPS XMM0, [RBX] ;
  1353. MOVUPS XMM1, [RBX+16] ;
  1354. MOVUPS XMM2, [RBX+32] ;
  1355. MOVUPS XMM3, [RBX+48] ;
  1356. ADD RBX, 64 ;
  1357. MOVUPS XMM4, [RCX] ;
  1358. MOVUPS XMM5, [RCX+16] ;
  1359. MOVUPS XMM6, [RCX+32] ;
  1360. MOVUPS XMM7, [RCX+48] ;
  1361. ADD RCX, 64 ;
  1362. ADDPS XMM0, XMM4 ;
  1363. ADDPS XMM1, XMM5 ;
  1364. ADDPS XMM2, XMM6 ;
  1365. ADDPS XMM3, XMM7 ;
  1366. MOVUPS [RDX], XMM0 ;
  1367. MOVUPS [RDX+16], XMM1 ;
  1368. MOVUPS [RDX+32], XMM2 ;
  1369. MOVUPS [RDX+48], XMM3 ;
  1370. ADD RDX, 64 ;
  1371. SUB RAX, 16 ;
  1372. JMP unaligned16 ;
1373. ; LOOP FOR 4 pieces unaligned
1374. unaligned4: ;
1375. CMP RAX, 4 ;
1376. JL singlepieces ; len < 4 -> EXIT TO singlepieces
  1377. MOVUPS XMM0, [RBX] ;
  1378. ADD RBX, 16 ;
  1379. MOVUPS XMM1, [RCX] ;
  1380. ADD RCX, 16 ;
  1381. ADDPS XMM0, XMM1 ;
  1382. MOVUPS [RDX], XMM0 ;
  1383. ADD RDX, 16 ;
  1384. SUB RAX, 4 ;
  1385. JMP unaligned4 ;
  1386. ; one piece left OR non-contiguous data
  1387. single:
  1388. singlepieces: ;
  1389. CMP RAX, 0 ;
  1390. JLE endL ; len <= 0- > EXIT
  1391. MOVSS XMM0, [RBX]
  1392. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  1393. MOVSS XMM1, [RCX]
1394. ADD RCX, [RBP+rinc] ; INC(radr, incr)
  1395. ADDSS XMM0, XMM1 ;
  1396. MOVSS [RDX], XMM0
1397. ADD RDX, [RBP+dinc] ; INC(dadr, incd)
  1398. DEC RAX ; DEC(len)
  1399. JMP singlepieces ;
  1400. endL:
  1401. END AddARARLoopSSE;
  1402. (* *)
  1403. PROCEDURE SubAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1404. CODE {SYSTEM.AMD64, SYSTEM.FPU}
  1405. MOV RAX, [RBP+len] ;
  1406. MOV RBX, [RBP+ladr] ;
  1407. MOV RCX, [RBP+radr] ;
  1408. MOV RDX, [RBP+dadr] ;
  1409. start:
  1410. CMP RAX, 0 ;
  1411. JLE endL ;
  1412. FLD QWORD [RBX] ;
  1413. ADD RBX, [RBP+linc] ;
  1414. FLD QWORD [RCX] ;
  1415. ADD RCX, [RBP+rinc] ;
  1416. FSUBP ;
  1417. FSTP QWORD [RDX] ;
  1418. ADD RDX, [RBP+dinc] ;
  1419. DEC RAX ;
  1420. JMP start ;
  1421. endL:
  1422. FWAIT ;
  1423. END SubAXAXLoopA;
  1424. PROCEDURE SubARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1425. CODE {SYSTEM.AMD64, SYSTEM.FPU}
  1426. MOV RAX, [RBP+len] ;
  1427. MOV RBX, [RBP+ladr] ;
  1428. MOV RCX, [RBP+radr] ;
  1429. MOV RDX, [RBP+dadr] ;
  1430. start:
  1431. CMP RAX, 0 ;
  1432. JLE endL ;
  1433. FLD DWORD [RBX] ;
  1434. ADD RBX, [RBP+linc] ;
  1435. FLD DWORD [RCX] ;
  1436. ADD RCX, [RBP+rinc] ;
  1437. FSUBP ;
  1438. FSTP DWORD [RDX] ;
  1439. ADD RDX, [RBP+dinc] ;
  1440. DEC RAX ;
  1441. JMP start ;
  1442. endL:
  1443. FWAIT ;
  1444. END SubARARLoopA;
  1445. PROCEDURE SubAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1446. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  1447. MOV RAX, [RBP+len] ;
  1448. CMP RAX, 0 ;
  1449. JLE endL ; nothing TO be done, RAX > 0 guaranteed from here on
  1450. MOV RBX, [RBP+ladr] ;
  1451. MOV RCX, [RBP+radr] ;
  1452. MOV RDX, [RBP+dadr] ;
  1453. ; check IF data are contiguous IN memory
  1454. CMP [RBP+linc], 8 ; check left FOR contiunuity
  1455. JNE single ; not continuous- > simplest method
  1456. CMP [RBP+rinc], 8 ; check right FOR contiunuity
  1457. JNE single ; not continuous- > simplest method
  1458. CMP [RBP+dinc], 8 ; check destination FOR contiunuity
  1459. JNE single ; not continuous- > simplest method
  1460. ; check FOR alignment
  1461. MOV RSI, RBX ;
  1462. AND RSI, 7 ; ladr MOD 8
  1463. CMP RSI, 0 ; = 0- > 64 Bit alignment
  1464. JNE unaligned ; not 64 bit aligned
  1465. MOV RSI, RCX ;
  1466. AND RSI, 7 ; radr MOD 8
  1467. CMP RSI, 0 ; = 0- > 64 Bit alignment
  1468. JNE unaligned ; not 64 bit aligned
  1469. MOV RSI, RDX ;
  1470. AND RSI, 7 ; dadr MOD 8
  1471. CMP RSI, 0 ; = 0- > 64 Bit alignment
  1472. JNE unaligned ; not 64 bit aligned
  1473. MOV RSI, RBX ;
  1474. AND RSI, 8 ; 16 byte alignment
  1475. MOV RDI, RCX ;
  1476. AND RDI, 8 ; 16 byte alignment
  1477. CMP RSI, RDI ;
  1478. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and radr
  1479. MOV RDI, RDX ;
  1480. AND RDI, 8 ; 16 byte alignment
  1481. CMP RSI, RDI ;
  1482. JNE unaligned ; different 16 byte = 128 bit alignment OF dadr and radr
  1483. CMP RSI, 8 ;
1484. JNE aligned ; ladr, radr and dadr already 128 bit aligned
1485. ; one single element processed TO achieve 128 bit alignment
  1486. MOVSD XMM1, [RBX] ;
  1487. MOVSD XMM0, [RCX] ;
  1488. SUBSD XMM0, XMM1 ;
  1489. MOVSD [RDX], XMM0 ;
  1490. ADD RBX, 8 ; now RBX IS 16 byte aligned
1491. ADD RCX, 8 ; now RCX IS 16 byte aligned ;
  1492. ADD RDX, 8 ; now RDX IS 16 byte aligned ;
  1493. DEC RAX ; one element has been processed
  1494. aligned:
  1495. aligned8:
  1496. CMP RAX, 8 ;
1497. JL aligned2 ; len < 8 -> EXIT TO aligned2
  1498. MOVAPD XMM0, [RBX] ;
  1499. MOVAPD XMM1, [RBX+16] ;
  1500. MOVAPD XMM2, [RBX+32] ;
  1501. MOVAPD XMM3, [RBX+48] ;
  1502. ADD RBX, 64 ;
  1503. MOVAPD XMM4, [RCX] ;
  1504. MOVAPD XMM5, [RCX+16] ;
  1505. MOVAPD XMM6, [RCX+32] ;
  1506. MOVAPD XMM7, [RCX+48] ;
  1507. ADD RCX, 64 ;
  1508. SUBPD XMM0, XMM4 ;
  1509. SUBPD XMM1, XMM5 ;
  1510. SUBPD XMM2, XMM6 ;
  1511. SUBPD XMM3, XMM7 ;
  1512. MOVAPD [RDX], XMM0 ;
  1513. MOVAPD [RDX+16], XMM1 ;
  1514. MOVAPD [RDX+32], XMM2 ;
  1515. MOVAPD [RDX+48], XMM3 ;
  1516. ADD RDX, 64 ;
  1517. SUB RAX, 8 ;
  1518. JMP aligned8 ;
  1519. ; LOOP FOR 2 pieces aligned
  1520. aligned2: ;
  1521. CMP RAX, 2 ;
  1522. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1523. MOVAPD XMM0, [RBX] ;
  1524. ADD RBX, 16 ;
  1525. MOVAPD XMM1, [RCX] ;
  1526. ADD RCX, 16 ;
  1527. SUBPD XMM0, XMM1 ;
  1528. MOVAPD [RDX], XMM0 ;
  1529. ADD RDX, 16 ;
  1530. SUB RAX, 2 ;
  1531. JMP aligned2 ;
  1532. ; LOOP FOR 8 unaligned pieces(14 pieces not better!)
  1533. unaligned: ;
  1534. unaligned8: ;
  1535. CMP RAX, 8 ;
1536. JL unaligned2 ; len < 8 -> EXIT TO unaligned2
  1537. MOVUPD XMM0, [RBX] ;
  1538. MOVUPD XMM1, [RBX+16] ;
  1539. MOVUPD XMM2, [RBX+32] ;
  1540. MOVUPD XMM3, [RBX+48] ;
  1541. ADD RBX, 64 ;
  1542. MOVUPD XMM4, [RCX] ;
  1543. MOVUPD XMM5, [RCX+16] ;
  1544. MOVUPD XMM6, [RCX+32] ;
  1545. MOVUPD XMM7, [RCX+48] ;
  1546. ADD RCX, 64 ;
  1547. SUBPD XMM0, XMM4 ;
  1548. SUBPD XMM1, XMM5 ;
  1549. SUBPD XMM2, XMM6 ;
  1550. SUBPD XMM3, XMM7 ;
  1551. MOVUPD [RDX], XMM0 ;
  1552. MOVUPD [RDX+16], XMM1 ;
  1553. MOVUPD [RDX+32], XMM2 ;
  1554. MOVUPD [RDX+48], XMM3 ;
  1555. ADD RDX, 64 ;
  1556. SUB RAX, 8 ;
  1557. JMP unaligned8 ;
1558. ; LOOP FOR 2 pieces unaligned
  1559. unaligned2: ;
  1560. CMP RAX, 2 ;
  1561. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1562. MOVUPD XMM0, [RBX] ;
  1563. ADD RBX, 16 ;
  1564. MOVUPD XMM1, [RCX] ;
  1565. ADD RCX, 16 ;
  1566. SUBPD XMM0, XMM1 ;
  1567. MOVUPD [RDX], XMM0 ;
  1568. ADD RDX, 16 ;
  1569. SUB RAX, 2 ;
  1570. JMP unaligned2 ;
  1571. ; one piece left OR non-contiguous data
  1572. single:
  1573. singlepieces: ;
  1574. CMP RAX, 0 ;
  1575. JLE endL ; len <= 0- > EXIT
  1576. MOVSD XMM0, [RBX]
  1577. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  1578. MOVSD XMM1, [RCX]
1579. ADD RCX, [RBP+rinc] ; INC(radr, incr)
  1580. SUBSD XMM0, XMM1 ;
  1581. MOVSD [RDX], XMM0
1582. ADD RDX, [RBP+dinc] ; INC(dadr, incd)
  1583. DEC RAX ; DEC(len)
  1584. JMP singlepieces ;
  1585. endL:
  1586. END SubAXAXLoopSSE;
  1587. PROCEDURE SubARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1588. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  1589. MOV RAX, [RBP+len] ;
  1590. CMP RAX, 0 ;
  1591. JLE endL ; nothing TO be done, RAX > 0 guaranteed from here on
  1592. MOV RBX, [RBP+ladr] ;
  1593. MOV RCX, [RBP+radr] ;
  1594. MOV RDX, [RBP+dadr] ;
  1595. ; check IF data are contiguous IN memory
  1596. CMP [RBP+linc], 4 ; check left FOR contiunuity
  1597. JNE single ; not continuous- > simplest method
  1598. CMP [RBP+rinc], 4 ; check right FOR contiunuity
  1599. JNE single ; not continuous- > simplest method
  1600. CMP [RBP+dinc], 4 ; check destination FOR contiunuity
  1601. JNE single ; not continuous- > simplest method
  1602. ; check FOR alignment
  1603. MOV RSI, RBX ;
  1604. AND RSI, 3 ; ladr MOD 4
  1605. CMP RSI, 0 ; = 0- > 32 Bit alignment
  1606. JNE unaligned ; not 32 bit aligned
  1607. MOV RSI, RCX ;
  1608. AND RSI, 3 ; radr MOD 4
  1609. CMP RSI, 0 ; = 0- > 32 Bit alignment
  1610. JNE unaligned ; not 32 bit aligned
  1611. MOV RSI, RDX ;
  1612. AND RSI, 3 ; dadr MOD 4
  1613. CMP RSI, 0 ; = 0- > 32 Bit alignment
  1614. JNE unaligned ; not 32 bit aligned
  1615. MOV RSI, RBX ;
  1616. AND RSI, 8+4 ; 16 byte alignment?
  1617. MOV RDI, RCX ;
  1618. AND RDI, 8+4 ; 16 byte alignment?
  1619. CMP RSI, RDI ;
  1620. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and radr
  1621. MOV RDI, RDX ;
  1622. AND RDI, 8+4 ; 16 byte alignment
  1623. CMP RSI, RDI ;
  1624. JNE unaligned ; different 16 byte = 128 bit alignment OF dadr and radr
  1625. CMP RSI, 0 ;
  1626. JE aligned ; already aligned
  1627. align:
  1628. ; one single element processing UNTIL 128 bt alignment achieved
  1629. MOVSS XMM1, [RBX] ;
  1630. MOVSS XMM0, [RCX] ;
  1631. SUBSS XMM0, XMM1 ;
  1632. MOVSS [RDX], XMM0 ;
  1633. ADD RBX, 4 ;
  1634. ADD RCX, 4 ;
  1635. ADD RDX, 4 ;
  1636. DEC RAX ; one element has been processed ;
  1637. CMP RAX, 0 ; all elements already processed?
  1638. JLE single ;
  1639. MOV RSI, RBX ;
  1640. AND RSI, 8+4 ;
  1641. CMP RSI, 0 ;
  1642. JNE align ;
  1643. aligned:
  1644. aligned16:
  1645. CMP RAX, 16 ;
1646. JL aligned4 ; len < 16 -> EXIT TO aligned4
  1647. MOVAPS XMM0, [RBX] ;
  1648. MOVAPS XMM1, [RBX+16] ;
  1649. MOVAPS XMM2, [RBX+32] ;
  1650. MOVAPS XMM3, [RBX+48] ;
  1651. ADD RBX, 64 ;
  1652. MOVAPS XMM4, [RCX] ;
  1653. MOVAPS XMM5, [RCX+16] ;
  1654. MOVAPS XMM6, [RCX+32] ;
  1655. MOVAPS XMM7, [RCX+48] ;
  1656. ADD RCX, 64 ;
  1657. SUBPS XMM0, XMM4 ;
  1658. SUBPS XMM1, XMM5 ;
  1659. SUBPS XMM2, XMM6 ;
  1660. SUBPS XMM3, XMM7 ;
  1661. MOVAPS [RDX], XMM0 ;
  1662. MOVAPS [RDX+16], XMM1 ;
  1663. MOVAPS [RDX+32], XMM2 ;
  1664. MOVAPS [RDX+48], XMM3 ;
  1665. ADD RDX, 64 ;
  1666. SUB RAX, 16 ;
  1667. JMP aligned16 ;
1668. ; LOOP FOR 4 pieces aligned
1669. aligned4: ;
1670. CMP RAX, 4 ;
1671. JL singlepieces ; len < 4 -> EXIT TO singlepieces
  1672. MOVAPS XMM0, [RBX] ;
  1673. ADD RBX, 16 ;
  1674. MOVAPS XMM1, [RCX] ;
  1675. ADD RCX, 16 ;
  1676. SUBPS XMM0, XMM1 ;
  1677. MOVAPS [RDX], XMM0 ;
  1678. ADD RDX, 16 ;
  1679. SUB RAX, 4 ;
  1680. JMP aligned4 ;
1681. ; LOOP FOR 16 unaligned pieces
  1682. unaligned: ;
  1683. unaligned16: ;
  1684. CMP RAX, 16 ;
1685. JL unaligned4 ; len < 16 -> EXIT TO unaligned4
  1686. MOVUPS XMM0, [RBX] ;
  1687. MOVUPS XMM1, [RBX+16] ;
  1688. MOVUPS XMM2, [RBX+32] ;
  1689. MOVUPS XMM3, [RBX+48] ;
  1690. ADD RBX, 64 ;
  1691. MOVUPS XMM4, [RCX] ;
  1692. MOVUPS XMM5, [RCX+16] ;
  1693. MOVUPS XMM6, [RCX+32] ;
  1694. MOVUPS XMM7, [RCX+48] ;
  1695. ADD RCX, 64 ;
  1696. SUBPS XMM0, XMM4 ;
  1697. SUBPS XMM1, XMM5 ;
  1698. SUBPS XMM2, XMM6 ;
  1699. SUBPS XMM3, XMM7 ;
  1700. MOVUPS [RDX], XMM0 ;
  1701. MOVUPS [RDX+16], XMM1 ;
  1702. MOVUPS [RDX+32], XMM2 ;
  1703. MOVUPS [RDX+48], XMM3 ;
  1704. ADD RDX, 64 ;
  1705. SUB RAX, 16 ;
  1706. JMP unaligned16 ;
1707. ; LOOP FOR 4 pieces unaligned
1708. unaligned4: ;
1709. CMP RAX, 4 ;
1710. JL singlepieces ; len < 4 -> EXIT TO singlepieces
  1711. MOVUPS XMM0, [RBX] ;
  1712. ADD RBX, 16 ;
  1713. MOVUPS XMM1, [RCX] ;
  1714. ADD RCX, 16 ;
  1715. SUBPS XMM0, XMM1 ;
  1716. MOVUPS [RDX], XMM0 ;
  1717. ADD RDX, 16 ;
  1718. SUB RAX, 4 ;
  1719. JMP unaligned4 ;
  1720. ; one piece left OR non-contiguous data
  1721. single:
  1722. singlepieces: ;
  1723. CMP RAX, 0 ;
  1724. JLE endL ; len <= 0- > EXIT
  1725. MOVSS XMM0, [RBX]
  1726. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  1727. MOVSS XMM1, [RCX]
1728. ADD RCX, [RBP+rinc] ; INC(radr, incr)
  1729. SUBSS XMM0, XMM1 ;
  1730. MOVSS [RDX], XMM0
1731. ADD RDX, [RBP+dinc] ; INC(dadr, incd)
  1732. DEC RAX ; DEC(len)
  1733. JMP singlepieces ;
  1734. endL:
  1735. END SubARARLoopSSE;
  1736. PROCEDURE SPAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  1737. CODE {SYSTEM.AMD64, SYSTEM.FPU}
  1738. MOV RAX, [RBP+len] ; eax := len
  1739. MOV RBX, [RBP+ladr] ; ebx := ladr
  1740. MOV RCX, [RBP+radr] ; ecx := radr
  1741. MOV RDX, [RBP+dadr] ; edx := dadr
  1742. FLD QWORD [RDX] ; S.GET(dadr, x)
  1743. start:
  1744. CMP RAX, 0 ; WHILE len > 0 DO
  1745. JLE endL
  1746. FLD QWORD [RBX] ; S.GET(ladr, x)
  1747. ADD RBX, [RBP+linc] ; INC(ladr, incl)
1748. FLD QWORD [RCX] ; S.GET(radr, y)
  1749. FMULP ; x := x*y
  1750. ADD RCX, [RBP+rinc] ; INC(radr, incr)
  1751. FADDP ; z := z+x
  1752. DEC RAX ; DEC(len)
  1753. JMP start ;
  1754. endL:
  1755. FSTP QWORD [RDX] ; S.PUT(dadr, x)
  1756. FWAIT ;
  1757. END SPAXAXLoopA;
  1758. PROCEDURE SPARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  1759. CODE {SYSTEM.AMD64, SYSTEM.FPU}
  1760. MOV RAX, [RBP+len] ; eax := len
  1761. MOV RBX, [RBP+ladr] ; ebx := ladr
  1762. MOV RCX, [RBP+radr] ; ecx := radr
  1763. MOV RDX, [RBP+dadr] ; edx := dadr
  1764. FLD DWORD [RDX] ; S.GET(dadr, x)
  1765. start:
  1766. CMP RAX, 0 ; WHILE len > 0 DO
  1767. JLE endL
  1768. FLD DWORD [RBX] ; S.GET(ladr, x)
  1769. ADD RBX, [RBP+linc] ; INC(ladr, incl)
1770. FLD DWORD [RCX] ; S.GET(radr, y)
  1771. FMULP ; x := x*y
  1772. ADD RCX, [RBP+rinc] ; INC(radr, incr)
  1773. FADDP ; z := z+x
  1774. DEC RAX ; DEC(len)
  1775. JMP start ;
  1776. endL:
  1777. FSTP DWORD [RDX] ; S.PUT(dadr, x)
  1778. FWAIT ;
  1779. END SPARARLoopA;
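(* The SP loops accumulate a scalar product on top of the value already stored at dadr, i.e.
dest := dest + sum( left[i]*right[i] ); rough Oberon equivalent of the REAL variant (sketch only, never called):
PROCEDURE SPARARSketch( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
VAR l, r, d: REAL;
BEGIN
	SYSTEM.GET( dadr, d );
	WHILE len > 0 DO
		SYSTEM.GET( ladr, l ); SYSTEM.GET( radr, r ); d := d + l * r;
		INC( ladr, linc ); INC( radr, rinc ); DEC( len )
	END;
	SYSTEM.PUT( dadr, d )
END SPARARSketch;
*)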
  1780. (* sse version of scalar product *)
  1781. PROCEDURE SPAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  1782. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  1783. ; register initialization
1784. MOV RAX, [RBP+len] ; RAX reserved FOR length
  1785. CMP RAX, 0 ;
  1786. JLE endL ; nothing TO be done, RAX > 0 guaranteed from here on
  1787. MOV RBX, [RBP+ladr] ; RBX reserved FOR ladr
  1788. MOV RCX, [RBP+radr] ; RCX reserved FOR radr
  1789. MOV RDX, [RBP+dadr] ; RDX reserved FOR dadr
  1790. XORPD XMM0, XMM0 ;
  1791. MOVSD XMM0, [RDX] ; destination- > low bytes OF xmm0
  1792. CMP [RBP+linc], 8 ; check left FOR contiunuity
  1793. JNE single ; not continuous- > simplest method
1794. CMP [RBP+rinc], 8 ; check right FOR continuity
  1795. JNE single ; not continuous- > simplest method
  1796. ; check FOR alignment
  1797. MOV RSI, RBX ;
  1798. AND RSI, 7 ; ladr MOD 8
1799. CMP RSI, 0 ; = 0 -> 64 Bit alignment
  1800. JNE unaligned ; not 64 bit aligned
  1801. MOV RSI, RCX ;
  1802. AND RSI, 7 ; radr MOD 8
  1803. CMP RSI, 0 ; = 0- > 64 Bit alignment
  1804. JNE unaligned ; not 64 bit aligned
  1805. MOV RSI, RBX ;
  1806. AND RSI, 8 ; 16 byte alignment
  1807. MOV RDI, RCX ;
  1808. AND RDI, 8 ; 16 byte alignment
  1809. CMP RSI, RDI ;
  1810. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  1811. CMP RSI, 8 ;
  1812. JNE aligned ; ladr and dadr already 128 bit aligned
  1813. ; one single element processing TO achieve 128 bt alignment
  1814. MOVSD XMM1, [RBX] ;
  1815. MOVSD XMM2, [RCX] ;
  1816. MULSD XMM1, XMM2 ;
  1817. ADDSD XMM0, XMM1 ;
  1818. ADD RBX, 8 ; now RBX IS 16 byte aligned
1819. ADD RCX, 8 ; now RCX IS 16 byte aligned ;
  1820. DEC RAX ; one element has been processed
1821. ; LOOP FOR 6 pieces aligned
  1822. aligned:
  1823. aligned6:
  1824. CMP RAX, 6 ;
1825. JL aligned2 ; len < 6 -> EXIT TO aligned2
  1826. MOVAPD XMM1, [RBX] ;
  1827. MOVAPD XMM2, [RBX+16] ;
  1828. MOVAPD XMM3, [RBX+32] ;
  1829. MOVAPD XMM4, [RCX] ;
  1830. MOVAPD XMM5, [RCX+16] ;
  1831. MOVAPD XMM6, [RCX+32] ;
  1832. MULPD XMM1, XMM4 ;
  1833. ADDPD XMM0, XMM1 ;
  1834. MULPD XMM2, XMM5 ;
  1835. ADDPD XMM0, XMM2 ;
  1836. MULPD XMM3, XMM6 ;
  1837. ADDPD XMM0, XMM3 ;
  1838. ADD RBX, 48 ;
  1839. ADD RCX, 48 ;
  1840. SUB RAX, 6 ;
  1841. JMP aligned6 ;
  1842. ; LOOP FOR 2 pieces aligned
  1843. aligned2:
  1844. CMP RAX, 2 ;
1845. JL horizontaladd ; len < 2 -> EXIT TO horizontaladd
  1846. MOVAPD XMM1, [RBX] ;
  1847. MOVAPD XMM2, [RCX] ;
  1848. MULPD XMM1, XMM2 ;
  1849. ADDPD XMM0, XMM1 ;
  1850. ADD RBX, 16 ;
  1851. ADD RCX, 16 ;
  1852. SUB RAX, 2 ;
  1853. JMP aligned2 ;
  1854. unaligned:
  1855. unaligned6:
  1856. CMP RAX, 6 ;
1857. JL unaligned2 ; len < 6 -> EXIT TO unaligned2
  1858. MOVUPD XMM1, [RBX] ;
  1859. MOVUPD XMM2, [RBX+16] ;
  1860. MOVUPD XMM3, [RBX+32] ;
  1861. MOVUPD XMM4, [RCX] ;
  1862. MOVUPD XMM5, [RCX+16] ;
  1863. MOVUPD XMM6, [RCX+32] ;
  1864. MULPD XMM1, XMM4 ;
  1865. ADDPD XMM0, XMM1 ;
  1866. MULPD XMM2, XMM5 ;
  1867. ADDPD XMM0, XMM2 ;
  1868. MULPD XMM3, XMM6 ;
  1869. ADDPD XMM0, XMM3 ;
  1870. ADD RBX, 48 ;
  1871. ADD RCX, 48 ;
  1872. SUB RAX, 6 ;
  1873. JMP unaligned6 ;
1874. ; LOOP FOR 2 pieces unaligned
1875. unaligned2:
1876. CMP RAX, 2 ;
1877. JL horizontaladd ; len < 2 -> EXIT TO horizontaladd
  1878. MOVUPD XMM1, [RBX] ;
  1879. MOVUPD XMM2, [RCX] ;
  1880. MULPD XMM1, XMM2 ;
  1881. ADDPD XMM0, XMM1 ;
  1882. ADD RBX, 16 ;
  1883. ADD RCX, 16 ;
  1884. SUB RAX, 2 ;
  1885. JMP unaligned2 ;
  1886. horizontaladd: ;
  1887. MOVAPD XMM1, XMM0 ;
  1888. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  1889. ADDPD XMM0, XMM1 ;
  1890. JMP singlepieces ;
  1891. single:
  1892. singlepieces: ;
  1893. CMP RAX, 0 ;
  1894. JLE store ; len <= 0- > EXIT
  1895. MOVSD XMM1, [RBX]
  1896. MOVSD XMM2, [RCX]
  1897. MULSD XMM1, XMM2
  1898. ADDSD XMM0, XMM1
  1899. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  1900. ADD RCX, [RBP+rinc] ; INC(radr, incr)
  1901. DEC RAX ; DEC(len)
  1902. JMP singlepieces ;
  1903. store:
  1904. MOVSD [RDX], XMM0 ;
  1905. endL:
  1906. END SPAXAXLoopSSE;
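(* Note on horizontaladd above: XMM0 accumulates two partial sums in its two 64 bit lanes; SHUFPD with
immediate 1 swaps the lanes into XMM1 and the following ADDPD leaves the complete sum in the low lane,
which the scalar tail and the final MOVSD store then use. *)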
  1907. (* sse version of scalar product *)
  1908. PROCEDURE SPARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  1909. CODE {SYSTEM.AMD64, SYSTEM.SSE}
  1910. ; register initialization
1911. MOV RAX, [RBP+len] ; RAX reserved FOR length
  1912. CMP RAX, 0 ;
  1913. JLE endL ; nothing TO be done, RAX > 0 guaranteed from here on
  1914. MOV RBX, [RBP+ladr] ; RBX reserved FOR ladr
  1915. MOV RCX, [RBP+radr] ; RCX reserved FOR radr
  1916. MOV RDX, [RBP+dadr] ; RDX reserved FOR dadr
  1917. XORPS XMM0, XMM0 ;
  1918. MOVSS XMM0, [RDX] ; destination- > low bytes OF xmm0
  1919. CMP [RBP+linc], 4 ; check left FOR contiunuity
  1920. JNE single ; not continuous- > simplest method
1921. CMP [RBP+rinc], 4 ; check right FOR continuity
  1922. JNE single ; not continuous- > simplest method
  1923. ; check FOR alignment
  1924. MOV RSI, RBX ;
  1925. AND RSI, 3 ; ladr MOD 4
1926. CMP RSI, 0 ; = 0 -> 32 Bit alignment
  1927. JNE unaligned ; not 32 bit aligned
  1928. MOV RSI, RCX ;
  1929. AND RSI, 3 ; radr MOD 4
  1930. CMP RSI, 0 ; = 0- > 32 Bit alignment
  1931. JNE unaligned ; not 32 bit aligned
  1932. MOV RSI, RBX ;
  1933. AND RSI, 8+4 ; 16 byte alignment
  1934. MOV RDI, RCX ;
  1935. AND RDI, 8+4 ; 16 byte alignment
  1936. CMP RSI, RDI ;
  1937. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  1938. CMP RSI, 0 ;
  1939. JE aligned ; already aligned
  1940. align:
  1941. ; one single element processing UNTIL 128 bt alignment achieved
  1942. MOVSS XMM1, [RBX] ;
  1943. MOVSS XMM2, [RCX] ;
  1944. MULSS XMM1, XMM2 ;
  1945. ADDSS XMM0, XMM1 ;
  1946. ADD RBX, 4 ;
  1947. ADD RCX, 4 ;
  1948. DEC RAX ; one element has been processed ;
  1949. CMP RAX, 0 ; all elements already processed?
  1950. JLE single ;
  1951. MOV RSI, RBX ;
  1952. AND RSI, 8+4 ;
  1953. CMP RSI, 0 ;
  1954. JNE align ;
  1955. aligned:
  1956. aligned12:
  1957. CMP RAX, 12 ;
1958. JL aligned4 ; len < 12 -> EXIT TO aligned4
  1959. MOVAPS XMM1, [RBX] ;
  1960. MOVAPS XMM2, [RBX+16] ;
  1961. MOVAPS XMM3, [RBX+32] ;
  1962. MOVAPS XMM4, [RCX] ;
  1963. MOVAPS XMM5, [RCX+16] ;
  1964. MOVAPS XMM6, [RCX+32] ;
  1965. MULPS XMM1, XMM4 ;
  1966. ADDPS XMM0, XMM1 ;
  1967. MULPS XMM2, XMM5 ;
  1968. ADDPS XMM0, XMM2 ;
  1969. MULPS XMM3, XMM6 ;
  1970. ADDPS XMM0, XMM3 ;
  1971. ADD RBX, 48 ;
  1972. ADD RCX, 48 ;
  1973. SUB RAX, 12 ;
  1974. JMP aligned12 ;
1975. ; LOOP FOR 4 pieces aligned
1976. aligned4:
1977. CMP RAX, 4 ;
1978. JL horizontaladd ; len < 4 -> EXIT TO horizontaladd
  1979. MOVAPS XMM1, [RBX] ;
  1980. MOVAPS XMM2, [RCX] ;
  1981. MULPS XMM1, XMM2 ;
  1982. ADDPS XMM0, XMM1 ;
  1983. ADD RBX, 16 ;
  1984. ADD RCX, 16 ;
  1985. SUB RAX, 4 ;
  1986. JMP aligned4 ;
  1987. unaligned:
  1988. unaligned12:
  1989. CMP RAX, 12 ;
1990. JL unaligned4 ; len < 12 -> EXIT TO unaligned4
  1991. MOVUPS XMM1, [RBX] ;
  1992. MOVUPS XMM2, [RBX+16] ;
  1993. MOVUPS XMM3, [RBX+32] ;
  1994. MOVUPS XMM4, [RCX] ;
  1995. MOVUPS XMM5, [RCX+16] ;
  1996. MOVUPS XMM6, [RCX+32] ;
  1997. MULPS XMM1, XMM4 ;
  1998. ADDPS XMM0, XMM1 ;
  1999. MULPS XMM2, XMM5 ;
  2000. ADDPS XMM0, XMM2 ;
  2001. MULPS XMM3, XMM6 ;
  2002. ADDPS XMM0, XMM3 ;
  2003. ADD RBX, 48 ;
  2004. ADD RCX, 48 ;
  2005. SUB RAX, 12 ;
  2006. JMP unaligned12 ;
2007. ; LOOP FOR 4 pieces unaligned
2008. unaligned4:
2009. CMP RAX, 4 ;
2010. JL horizontaladd ; len < 4 -> EXIT TO horizontaladd
  2011. MOVUPS XMM1, [RBX] ;
  2012. MOVUPS XMM2, [RCX] ;
  2013. MULPS XMM1, XMM2 ;
  2014. ADDPS XMM0, XMM1 ;
  2015. ADD RBX, 16 ;
  2016. ADD RCX, 16 ;
  2017. SUB RAX, 4 ;
  2018. JMP unaligned4 ;
  2019. horizontaladd: ;
  2020. MOVAPS XMM1, XMM0 ;
  2021. ; 1*0 (* dest 0 -> dest 0 *) +4*1 (* dest 1 -> dest 1 *) +16*0 (* src 0 -> dest 2 *) +64*1 (* src 1 -> dest 3 *)
  2022. SHUFPS XMM1, XMM1, 1*0 +4*1 +16*0 +64*1
  2023. ADDPS XMM1, XMM0 ;
  2024. MOVAPS XMM0, XMM1
  2025. SHUFPS XMM0, XMM0, 16*3 ; src 3- > dest 2
  2026. ADDPS XMM0, XMM1 ;
  2027. SHUFPS XMM0, XMM0, 1*2 ; dest 2- > dest 0
  2028. JMP singlepieces ;
  2029. single:
  2030. singlepieces: ;
  2031. CMP RAX, 0 ;
  2032. JLE store ; len <= 0- > EXIT
  2033. MOVSS XMM1, [RBX]
  2034. MOVSS XMM2, [RCX]
  2035. MULSS XMM1, XMM2
  2036. ADDSS XMM0, XMM1
  2037. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  2038. ADD RCX, [RBP+rinc] ; INC(radr, incr)
  2039. DEC RAX ; DEC(len)
  2040. JMP singlepieces ;
  2041. store:
  2042. MOVSS [RDX], XMM0 ;
  2043. endL:
  2044. END SPARARLoopSSE;
  2045. PROCEDURE MulAXSXLoopA( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2046. CODE {SYSTEM.AMD64, SYSTEM.FPU}
  2047. MOV RAX, [RBP+len] ; eax := len
  2048. MOV RBX, [RBP+ladr] ; ebx := ladr
  2049. MOV RCX, [RBP+radr] ; ecx := radr
  2050. MOV RDX, [RBP+dadr] ; edx := dadr
  2051. start:
  2052. CMP RAX, 0 ; WHILE len > 0 DO
  2053. JLE endL
  2054. FLD QWORD [RBX] ; S.GET(ladr, x)
  2055. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  2056. FLD QWORD [RCX] ; S.GET(ladr, y)
  2057. FMULP ; x := x*y
  2058. FSTP QWORD [RDX]
  2059. ADD RDX, [RBP+dinc] ; INC(radr, incr)
  2060. DEC RAX ; DEC(len)
  2061. JMP start ;
  2062. endL:
  2063. FWAIT ;
  2064. END MulAXSXLoopA;
  2065. PROCEDURE MulARSRLoopA( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2066. CODE {SYSTEM.AMD64, SYSTEM.FPU}
  2067. MOV RAX, [RBP+len] ; eax := len
  2068. MOV RBX, [RBP+ladr] ; ebx := ladr
  2069. MOV RCX, [RBP+radr] ; ecx := radr
  2070. MOV RDX, [RBP+dadr] ; edx := dadr
  2071. start:
  2072. CMP RAX, 0 ; WHILE len > 0 DO
  2073. JLE endL
  2074. FLD DWORD [RBX] ; S.GET(ladr, x)
  2075. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  2076. FLD DWORD [RCX] ; S.GET(ladr, y)
  2077. FMULP ; x := x*y
  2078. FSTP DWORD [RDX]
  2079. ADD RDX, [RBP+dinc] ; INC(radr, incr)
  2080. DEC RAX ; DEC(len)
  2081. JMP start ;
  2082. endL:
  2083. FWAIT ;
  2084. END MulARSRLoopA;
  2085. PROCEDURE IncMulAXSXLoopA( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2086. CODE {SYSTEM.AMD64, SYSTEM.FPU}
  2087. MOV RAX, [RBP+len] ; eax := len
  2088. MOV RBX, [RBP+ladr] ; ebx := ladr
  2089. MOV RCX, [RBP+radr] ; ecx := radr
  2090. MOV RDX, [RBP+dadr] ; edx := dadr
  2091. start:
  2092. CMP RAX, 0 ; WHILE len > 0 DO
  2093. JLE endL
  2094. FLD QWORD [RBX] ; S.GET(ladr, x)
  2095. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  2096. FLD QWORD [RCX] ; S.GET(ladr, y)
  2097. FMULP ; x := x*y
  2098. FLD QWORD [RDX+8] ;
  2099. FADDP ;
  2100. FSTP QWORD [RDX]
  2101. ADD RDX, [RBP+dinc] ; INC(radr, incr)
  2102. DEC RAX ; DEC(len)
  2103. JMP start ;
  2104. endL:
  2105. FWAIT ;
  2106. END IncMulAXSXLoopA;
  2107. PROCEDURE IncMulARSRLoopA( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2108. CODE {SYSTEM.AMD64, SYSTEM.FPU}
  2109. MOV RAX, [RBP+len] ; eax := len
  2110. MOV RBX, [RBP+ladr] ; ebx := ladr
  2111. MOV RCX, [RBP+radr] ; ecx := radr
  2112. MOV RDX, [RBP+dadr] ; edx := dadr
  2113. start:
  2114. CMP RAX, 0 ; WHILE len > 0 DO
  2115. JLE endL
  2116. FLD DWORD [RBX] ; S.GET(ladr, x)
  2117. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  2118. FLD DWORD [RCX] ; S.GET(ladr, y)
  2119. FMULP ; x := x*y
  2120. FLD DWORD [RDX+8] ;
  2121. FADDP ;
  2122. FSTP DWORD [RDX]
  2123. ADD RDX, [RBP+dinc] ; INC(radr, incr)
  2124. DEC RAX ; DEC(len)
  2125. JMP start ;
  2126. endL:
  2127. FWAIT ;
  2128. END IncMulARSRLoopA;
  2129. PROCEDURE MulAXSXLoopSSE( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2130. (* simple version, does not yet check for alignment, no parallel executions yet: use full 8 registers! *)
  2131. (*
  2132. 1.) check for same alignment of ladr and dadr (ladr MOD 128 = dadr MOD 128)
  2133. 2.) process starting unaligned data ( using single instructions)
  2134. 3.) process aligned data
  2135. 4.) process remaining unaligned data (using single instructions)
  2136. *)
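(* In short (sketch only, never called): dest[i] := left[i] * scalar, with the scalar taken from radr:
PROCEDURE MulAXSXSketch( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
VAR l, s, d: LONGREAL;
BEGIN
	SYSTEM.GET( radr, s );
	WHILE len > 0 DO
		SYSTEM.GET( ladr, l ); d := l * s; SYSTEM.PUT( dadr, d );
		INC( ladr, linc ); INC( dadr, dinc ); DEC( len )
	END
END MulAXSXSketch;
*)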
  2137. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  2138. ; register initialization
2139. MOV RAX, [RBP+len] ; RAX reserved FOR length
  2140. CMP RAX, 0 ;
  2141. JLE endL ; nothing TO be done, RAX > 0 guaranteed from here on
  2142. MOV RBX, [RBP+ladr] ; RBX reserved FOR ladr
  2143. MOV RDX, [RBP+dadr] ; RDX reserved FOR dadr
  2144. MOV RCX, [RBP+radr] ;
  2145. MOVSD XMM0, [RCX] ;
  2146. SHUFPD XMM0, XMM0, 0 ; high bits := low bits
  2147. ; check IF data are contiguous IN memory
  2148. CMP [RBP+linc], 8 ; check left FOR contiunuity
  2149. JNE single ; not continuous- > simplest method
  2150. CMP [RBP+dinc], 8 ; check dest FOR continuity
  2151. JNE single ; not continuous- > simplest method
  2152. ; check FOR alignment
  2153. MOV RCX, RBX ;
  2154. AND RCX, 7 ; ladr MOD 8
  2155. CMP RCX, 0 ; RCX = 0- > 64 Bit alignment
  2156. JNE unaligned ; not 64 bit aligned
  2157. MOV RCX, RDX ;
  2158. AND RCX, 7 ; dadr MOD 8
  2159. CMP RCX, 0 ; RCX = 0- > 64 Bit alignment
  2160. JNE unaligned ; not 64 bit aligned
  2161. MOV RSI, RBX ;
  2162. AND RSI, 8 ; 16 byte alignment
  2163. MOV RDI, RDX ;
  2164. AND RDI, 8 ; 16 byte alignment
  2165. CMP RSI, RDI ;
  2166. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  2167. CMP RSI, 8 ;
  2168. JNE aligned ; ladr and dadr already 128 bit aligned
  2169. ; one single element processing TO achieve 128 bt alignment
  2170. MOVSD XMM1, [RBX] ;
  2171. MULSD XMM1, XMM0 ;
  2172. MOVSD [RDX], XMM1 ;
  2173. ADD RBX, 8 ; now RBX IS 16 byte aligned
  2174. ADD RDX, 8 ; now RDX IS 16 byte aligned ;
  2175. DEC RAX ; one element has been processed
  2176. ; LOOP FOR 8 pieces aligned(no better performance with 14 pieces!)
  2177. aligned:
  2178. aligned8:
  2179. CMP RAX, 8 ;
2180. JL aligned2 ; len < 8 -> EXIT TO aligned2
  2181. MOVAPD XMM1, [RBX] ;
  2182. MOVAPD XMM2, [RBX+16] ;
  2183. MOVAPD XMM3, [RBX+32] ;
  2184. MOVAPD XMM4, [RBX+48] ;
  2185. ADD RBX, 64 ;
  2186. MULPD XMM1, XMM0 ;
  2187. MULPD XMM2, XMM0 ;
  2188. MULPD XMM3, XMM0 ;
  2189. MULPD XMM4, XMM0 ;
  2190. MOVAPD [RDX], XMM1 ;
  2191. MOVAPD [RDX+16], XMM2 ;
  2192. MOVAPD [RDX+32], XMM3 ;
  2193. MOVAPD [RDX+48], XMM4 ;
  2194. ADD RDX, 64 ;
  2195. SUB RAX, 8 ;
  2196. JMP aligned8 ;
  2197. ; LOOP FOR 2 pieces aligned
  2198. aligned2: ;
  2199. CMP RAX, 2 ;
  2200. JL singlepieces ; len < 2- > EXIT TO singlepieces
  2201. MOVAPD XMM1, [RBX] ;
  2202. ADD RBX, 16 ;
  2203. MULPD XMM1, XMM0 ;
  2204. MOVAPD [RDX], XMM1 ;
  2205. ADD RDX, 16 ;
  2206. SUB RAX, 2 ;
  2207. JMP aligned2 ;
  2208. ; LOOP FOR 8 unaligned pieces(14 pieces not better!)
  2209. unaligned: ;
  2210. unaligned8: ;
  2211. CMP RAX, 8 ;
2212. JL unaligned2 ; len < 8 -> EXIT TO unaligned2
  2213. MOVUPD XMM1, [RBX] ;
  2214. MOVUPD XMM2, [RBX+16] ;
  2215. MOVUPD XMM3, [RBX+32] ;
  2216. MOVUPD XMM4, [RBX+48] ;
  2217. ADD RBX, 64
  2218. MULPD XMM1, XMM0 ;
  2219. MULPD XMM2, XMM0 ;
  2220. MULPD XMM3, XMM0 ;
  2221. MULPD XMM4, XMM0 ;
  2222. MOVUPD [RDX], XMM1 ;
  2223. MOVUPD [RDX+16], XMM2 ;
  2224. MOVUPD [RDX+32], XMM3 ;
  2225. MOVUPD [RDX+48], XMM4 ;
  2226. ADD RDX, 64 ;
  2227. SUB RAX, 8 ;
  2228. JMP unaligned8 ;
  2229. ; LOOP FOR 2 pieces unaligned
  2230. unaligned2: ;
  2231. CMP RAX, 2 ;
  2232. JL singlepieces ; len < 2- > EXIT
  2233. MOVUPD XMM1, [RBX] ;
  2234. ADD RBX, 16 ;
  2235. MULPD XMM1, XMM0 ;
  2236. MOVUPD [RDX], XMM1 ;
  2237. ADD RDX, 16 ;
  2238. SUB RAX, 2 ;
  2239. JMP unaligned2 ;
  2240. ; one piece left OR non-contiguous data
  2241. single:
  2242. singlepieces: ;
  2243. CMP RAX, 0 ;
  2244. JLE endL ; len <= 0- > EXIT
  2245. MOVSD XMM1, [RBX]
  2246. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  2247. MULSD XMM1, XMM0
  2248. MOVSD [RDX], XMM1
2249. ADD RDX, [RBP+dinc] ; INC(dadr, incd)
  2250. DEC RAX ; DEC(len)
  2251. JMP singlepieces ;
  2252. endL:
  2253. END MulAXSXLoopSSE;
  2254. PROCEDURE MulARSRLoopSSE( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2255. (* simple version, does not yet check for alignment, no parallel executions yet: use full 8 registers! *)
  2256. (*
  2257. 1.) check for same alignment of ladr and dadr (ladr MOD 128 = dadr MOD 128)
  2258. 2.) process starting unaligned data ( using single instructions)
  2259. 3.) process aligned data
  2260. 4.) process remaining unaligned data (using single instructions)
  2261. *)
  2262. CODE {SYSTEM.AMD64, SYSTEM.SSE}
  2263. ; register initialization
2264. MOV RAX, [RBP+len] ; RAX reserved FOR length
  2265. CMP RAX, 0 ;
  2266. JLE endL ; nothing TO be done, RAX > 0 guaranteed from here on
  2267. MOV RBX, [RBP+ladr] ; RBX reserved FOR ladr
  2268. MOV RDX, [RBP+dadr] ; RDX reserved FOR dadr
  2269. MOV RCX, [RBP+radr] ;
  2270. MOVSS XMM0, [RCX] ;
2271. SHUFPS XMM0, XMM0, 0 ; all positions now carry the same value
  2272. ; check IF data are contiguous IN memory
  2273. CMP [RBP+linc], 4 ; check left FOR contiunuity
  2274. JNE single ; not continuous- > simplest method
  2275. CMP [RBP+dinc], 4 ; check dest FOR continuity
  2276. JNE single ; not continuous- > simplest method
  2277. ; check FOR alignment
  2278. MOV RCX, RBX ;
  2279. AND RCX, 3 ; ladr MOD 4
  2280. CMP RCX, 0 ; RCX = 0- > 32 Bit alignment
  2281. JNE unaligned ; not 32 bit aligned
  2282. MOV RCX, RDX ;
  2283. AND RCX, 3 ; dadr MOD 4
  2284. CMP RCX, 0 ; RCX = 0- > 32 Bit alignment
2285. JNE unaligned ; not 32 bit aligned
  2286. MOV RSI, RBX ;
  2287. AND RSI, 8+4 ; 16 byte alignment
  2288. MOV RDI, RDX ;
  2289. AND RDI, 8+4 ; 16 byte alignment
  2290. CMP RSI, RDI ;
  2291. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  2292. CMP RSI, 0 ;
  2293. JE aligned ; already aligned
  2294. align:
  2295. ; one single element processing UNTIL 128 bt alignment achieved
  2296. MOVSS XMM1, [RBX] ;
  2297. MULSS XMM1, XMM0 ;
  2298. MOVSS [RDX], XMM1 ;
  2299. ADD RBX, 4 ;
  2300. ADD RDX, 4 ;
  2301. DEC RAX ; one element has been processed ;
  2302. CMP RAX, 0 ; all elements already processed?
  2303. JLE single
  2304. MOV RSI, RBX ;
  2305. AND RSI, 8+4 ;
  2306. CMP RSI, 0 ;
  2307. JNE align ;
  2308. aligned:
  2309. aligned16:
  2310. CMP RAX, 16 ;
2311. JL aligned4 ; len < 16 -> EXIT TO aligned4
  2312. MOVAPS XMM1, [RBX] ;
  2313. MOVAPS XMM2, [RBX+16] ;
  2314. MOVAPS XMM3, [RBX+32] ;
  2315. MOVAPS XMM4, [RBX+48] ;
  2316. ADD RBX, 64 ;
  2317. MULPS XMM1, XMM0 ;
  2318. MULPS XMM2, XMM0 ;
  2319. MULPS XMM3, XMM0 ;
  2320. MULPS XMM4, XMM0 ;
  2321. MOVAPS [RDX], XMM1 ;
  2322. MOVAPS [RDX+16], XMM2 ;
  2323. MOVAPS [RDX+32], XMM3 ;
  2324. MOVAPS [RDX+48], XMM4 ;
  2325. ADD RDX, 64 ;
  2326. SUB RAX, 16 ;
  2327. JMP aligned16 ;
2328. ; LOOP FOR 4 pieces aligned
2329. aligned4: ;
2330. CMP RAX, 4 ;
2331. JL singlepieces ; len < 4 -> EXIT TO singlepieces
  2332. MOVAPS XMM1, [RBX] ;
  2333. ADD RBX, 16 ;
  2334. MULPS XMM1, XMM0 ;
  2335. MOVAPS [RDX], XMM1 ;
  2336. ADD RDX, 16 ;
  2337. SUB RAX, 4 ;
  2338. JMP aligned4 ;
  2339. ; LOOP FOR 16 unaligned pieces(20 pieces not better!)
  2340. unaligned: ;
  2341. unaligned16: ;
  2342. CMP RAX, 16 ;
2343. JL unaligned4 ; len < 16 -> EXIT TO unaligned4
  2344. MOVUPS XMM1, [RBX] ;
  2345. MOVUPS XMM2, [RBX+16] ;
  2346. MOVUPS XMM3, [RBX+32] ;
  2347. MOVUPS XMM4, [RBX+48] ;
  2348. ADD RBX, 64
  2349. MULPS XMM1, XMM0 ;
  2350. MULPS XMM2, XMM0 ;
  2351. MULPS XMM3, XMM0 ;
  2352. MULPS XMM4, XMM0 ;
  2353. MOVUPS [RDX], XMM1 ;
  2354. MOVUPS [RDX+16], XMM2 ;
  2355. MOVUPS [RDX+32], XMM3 ;
  2356. MOVUPS [RDX+48], XMM4 ;
  2357. ADD RDX, 64 ;
  2358. SUB RAX, 16 ;
  2359. JMP unaligned16 ;
2360. ; LOOP FOR 4 pieces unaligned
2361. unaligned4: ;
2362. CMP RAX, 4 ;
2363. JL singlepieces ; len < 4 -> EXIT TO singlepieces
  2364. MOVUPS XMM1, [RBX] ;
  2365. ADD RBX, 16 ;
  2366. MULPS XMM1, XMM0 ;
  2367. MOVUPS [RDX], XMM1 ;
  2368. ADD RDX, 16 ;
  2369. SUB RAX, 4 ;
  2370. JMP unaligned4 ;
  2371. ; one piece left OR non-contiguous data
  2372. single:
  2373. singlepieces: ;
  2374. CMP RAX, 0 ;
  2375. JLE endL ; len <= 0- > EXIT
  2376. MOVSS XMM1, [RBX]
  2377. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  2378. MULSS XMM1, XMM0
  2379. MOVSS [RDX], XMM1
2380. ADD RDX, [RBP+dinc] ; INC(dadr, incd)
  2381. DEC RAX ; DEC(len)
  2382. JMP singlepieces ;
  2383. endL:
  2384. END MulARSRLoopSSE;
  2385. PROCEDURE IncMulAXSXLoopSSE( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2386. (* simple version, does not yet check for alignment, no parallel executions yet: use full 8 registers! *)
  2387. (*
  2388. 1.) check for same alignment of ladr and dadr (ladr MOD 128 = dadr MOD 128)
  2389. 2.) process starting unaligned data ( using single instructions)
  2390. 3.) process aligned data
  2391. 4.) process remaining unaligned data (using single instructions)
  2392. *)
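(* In short (sketch only, never called): dest[i] := dest[i] + left[i] * scalar, the multiply-accumulate
variant of MulAXSXLoopSSE above:
PROCEDURE IncMulAXSXSketch( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
VAR l, s, d: LONGREAL;
BEGIN
	SYSTEM.GET( radr, s );
	WHILE len > 0 DO
		SYSTEM.GET( ladr, l ); SYSTEM.GET( dadr, d ); d := d + l * s; SYSTEM.PUT( dadr, d );
		INC( ladr, linc ); INC( dadr, dinc ); DEC( len )
	END
END IncMulAXSXSketch;
*)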
  2393. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  2394. ; register initialization
2395. MOV RAX, [RBP+len] ; RAX reserved FOR length
  2396. CMP RAX, 0 ;
  2397. JLE endL ; nothing TO be done, RAX > 0 guaranteed from here on
  2398. MOV RBX, [RBP+ladr] ; RBX reserved FOR ladr
  2399. MOV RDX, [RBP+dadr] ; RDX reserved FOR dadr
  2400. MOV RCX, [RBP+radr] ;
  2401. MOVSD XMM0, [RCX] ;
  2402. SHUFPD XMM0, XMM0, 0 ; high bits := low bits
  2403. ; check IF data are contiguous IN memory
  2404. CMP [RBP+linc], 8 ; check left FOR contiunuity
  2405. JNE single ; not continuous- > simplest method
  2406. CMP [RBP+dinc], 8 ; check dest FOR continuity
  2407. JNE single ; not continuous- > simplest method
  2408. ; check FOR alignment
  2409. MOV RCX, RBX ;
  2410. AND RCX, 7 ; ladr MOD 8
  2411. CMP RCX, 0 ; RCX = 0- > 64 Bit alignment
  2412. JNE unaligned ; not 64 bit aligned
  2413. MOV RCX, RDX ;
  2414. AND RCX, 7 ; dadr MOD 8
  2415. CMP RCX, 0 ; RCX = 0- > 64 Bit alignment
  2416. JNE unaligned ; not 64 bit aligned
  2417. MOV RSI, RBX ;
  2418. AND RSI, 8 ; 16 byte alignment
  2419. MOV RDI, RDX ;
  2420. AND RDI, 8 ; 16 byte alignment
  2421. CMP RSI, RDI ;
  2422. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  2423. CMP RSI, 8 ;
  2424. JNE aligned ; ladr and dadr already 128 bit aligned
  2425. ; one single element processing TO achieve 128 bt alignment
  2426. MOVSD XMM1, [RBX] ;
  2427. MULSD XMM1, XMM0 ;
  2428. MOVSD XMM2, [RDX] ;
  2429. ADDSD XMM1, XMM2 ;
  2430. MOVSD [RDX], XMM1 ;
  2431. ADD RBX, 8 ; now RBX IS 16 byte aligned
  2432. ADD RDX, 8 ; now RDX IS 16 byte aligned ;
  2433. DEC RAX ; one element has been processed
  2434. ; LOOP FOR 8 pieces aligned(no better performance with 14 pieces!)
  2435. aligned:
  2436. aligned8:
  2437. CMP RAX, 8 ;
2438. JL aligned2 ; len < 8 -> EXIT TO aligned2
  2439. MOVAPD XMM1, [RBX] ;
  2440. MOVAPD XMM2, [RBX+16] ;
  2441. MOVAPD XMM3, [RBX+32] ;
  2442. MOVAPD XMM4, [RBX+48] ;
  2443. ADD RBX, 64 ;
  2444. MULPD XMM1, XMM0 ;
  2445. MULPD XMM2, XMM0 ;
  2446. MULPD XMM3, XMM0 ;
  2447. MULPD XMM4, XMM0 ;
  2448. MOVAPD XMM5, [RDX] ;
  2449. ADDPD XMM1, XMM5
  2450. MOVAPD [RDX], XMM1 ;
  2451. MOVAPD XMM6, [RDX+16] ;
  2452. ADDPD XMM2, XMM6
  2453. MOVAPD [RDX+16], XMM2 ;
  2454. MOVAPD XMM7, [RDX+32] ;
  2455. ADDPD XMM3, XMM7
  2456. MOVAPD [RDX+32], XMM3 ;
  2457. MOVAPD XMM5, [RDX+48] ;
  2458. ADDPD XMM4, XMM5
  2459. MOVAPD [RDX+48], XMM4 ;
  2460. ADD RDX, 64 ;
  2461. SUB RAX, 8 ;
  2462. JMP aligned8 ;
  2463. ; LOOP FOR 2 pieces aligned
  2464. aligned2: ;
  2465. CMP RAX, 2 ;
  2466. JL singlepieces ; len < 2- > EXIT TO singlepieces
  2467. MOVAPD XMM1, [RBX] ;
  2468. ADD RBX, 16 ;
  2469. MULPD XMM1, XMM0 ;
  2470. MOVAPD XMM2, [RDX] ;
  2471. ADDPD XMM1, XMM2
  2472. MOVAPD [RDX], XMM1 ;
  2473. ADD RDX, 16 ;
  2474. SUB RAX, 2 ;
  2475. JMP aligned2 ;
  2476. ; LOOP FOR 8 unaligned pieces(14 pieces not better!)
  2477. unaligned: ;
  2478. unaligned8: ;
  2479. CMP RAX, 8 ;
2480. JL unaligned2 ; len < 8 -> EXIT TO unaligned2
  2481. MOVUPD XMM1, [RBX] ;
  2482. MOVUPD XMM2, [RBX+16] ;
  2483. MOVUPD XMM3, [RBX+32] ;
  2484. MOVUPD XMM4, [RBX+48] ;
  2485. ADD RBX, 64
  2486. MULPD XMM1, XMM0 ;
  2487. MULPD XMM2, XMM0 ;
  2488. MULPD XMM3, XMM0 ;
  2489. MULPD XMM4, XMM0 ;
  2490. MOVUPD XMM5, [RDX] ;
  2491. ADDPD XMM1, XMM5
  2492. MOVUPD [RDX], XMM1 ;
  2493. MOVUPD XMM6, [RDX+16] ;
  2494. ADDPD XMM2, XMM6
  2495. MOVUPD [RDX+16], XMM2 ;
  2496. MOVUPD XMM7, [RDX+32] ;
  2497. ADDPD XMM3, XMM7
  2498. MOVUPD [RDX+32], XMM3 ;
  2499. MOVUPD XMM5, [RDX+48] ;
  2500. ADDPD XMM4, XMM5
  2501. MOVUPD [RDX+48], XMM4 ;
  2502. ADD RDX, 64 ;
  2503. SUB RAX, 8 ;
  2504. JMP unaligned8 ;
  2505. ; LOOP FOR 2 pieces unaligned
  2506. unaligned2: ;
  2507. CMP RAX, 2 ;
  2508. JL singlepieces ; len < 2- > EXIT
  2509. MOVUPD XMM1, [RBX] ;
  2510. ADD RBX, 16 ;
  2511. MULPD XMM1, XMM0 ;
  2512. MOVUPD XMM2, [RDX] ;
  2513. ADDPD XMM1, XMM2
  2514. MOVUPD [RDX], XMM1 ;
  2515. ADD RDX, 16 ;
  2516. SUB RAX, 2 ;
  2517. JMP unaligned2 ;
  2518. ; one piece left OR non-contiguous data
  2519. single:
  2520. singlepieces: ;
  2521. CMP RAX, 0 ;
  2522. JLE endL ; len <= 0- > EXIT
  2523. MOVSD XMM1, [RBX]
  2524. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  2525. MULSD XMM1, XMM0
  2526. MOVSD XMM2, [RDX] ;
  2527. ADDSD XMM1, XMM2
  2528. MOVSD [RDX], XMM1
2529. ADD RDX, [RBP+dinc] ; INC(dadr, incd)
  2530. DEC RAX ; DEC(len)
  2531. JMP singlepieces ;
  2532. endL:
  2533. END IncMulAXSXLoopSSE;
  2534. PROCEDURE IncMulARSRLoopSSE( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2535. (* simple version, does not yet check for alignment, no parallel executions yet: use full 8 registers! *)
  2536. (*
  2537. 1.) check for same alignment of ladr and dadr (ladr MOD 128 = dadr MOD 128)
  2538. 2.) process starting unaligned data ( using single instructions)
  2539. 3.) process aligned data
  2540. 4.) process remaining unaligned data (using single instructions)
  2541. *)
  2542. CODE {SYSTEM.AMD64, SYSTEM.SSE}
  2543. ; register initialization
2544. MOV RAX, [RBP+len] ; RAX reserved FOR length
  2545. CMP RAX, 0 ;
  2546. JLE endL ; nothing TO be done, RAX > 0 guaranteed from here on
  2547. MOV RBX, [RBP+ladr] ; RBX reserved FOR ladr
  2548. MOV RDX, [RBP+dadr] ; RDX reserved FOR dadr
  2549. MOV RCX, [RBP+radr] ;
  2550. MOVSS XMM0, [RCX] ;
2551. SHUFPS XMM0, XMM0, 0 ; all positions now carry the same value
  2552. ; check IF data are contiguous IN memory
  2553. CMP [RBP+linc], 4 ; check left FOR contiunuity
  2554. JNE single ; not continuous- > simplest method
  2555. CMP [RBP+dinc], 4 ; check dest FOR continuity
  2556. JNE single ; not continuous- > simplest method
  2557. ; check FOR alignment
  2558. MOV RCX, RBX ;
  2559. AND RCX, 3 ; ladr MOD 4
  2560. CMP RCX, 0 ; RCX = 0- > 32 Bit alignment
  2561. JNE unaligned ; not 32 bit aligned
  2562. MOV RCX, RDX ;
  2563. AND RCX, 3 ; dadr MOD 4
  2564. CMP RCX, 0 ; RCX = 0- > 32 Bit alignment
2565. JNE unaligned ; not 32 bit aligned
  2566. MOV RSI, RBX ;
  2567. AND RSI, 8+4 ; 16 byte alignment
  2568. MOV RDI, RDX ;
  2569. AND RDI, 8+4 ; 16 byte alignment
  2570. CMP RSI, RDI ;
  2571. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  2572. CMP RSI, 0 ;
  2573. JE aligned ; already aligned
  2574. align:
  2575. ; one single element processing UNTIL 128 bt alignment achieved
  2576. MOVSS XMM1, [RBX] ;
  2577. MULSS XMM1, XMM0 ;
  2578. MOVSS XMM2, [RDX] ;
  2579. ADDSS XMM1, XMM2 ;
  2580. MOVSS [RDX], XMM1 ;
  2581. ADD RBX, 4 ;
  2582. ADD RDX, 4 ;
  2583. DEC RAX ; one element has been processed ;
  2584. CMP RAX, 0 ; all elements already processed?
  2585. JLE single
  2586. MOV RSI, RBX ;
  2587. AND RSI, 8+4 ;
  2588. CMP RSI, 0 ;
  2589. JNE align ;
  2590. aligned:
  2591. aligned16:
  2592. CMP RAX, 16 ;
2593. JL aligned4 ; len < 16 -> EXIT TO aligned4
  2594. MOVAPS XMM1, [RBX] ;
  2595. MOVAPS XMM2, [RBX+16] ;
  2596. MOVAPS XMM3, [RBX+32] ;
  2597. MOVAPS XMM4, [RBX+48] ;
  2598. ADD RBX, 64 ;
  2599. MULPS XMM1, XMM0 ;
  2600. MULPS XMM2, XMM0 ;
  2601. MULPS XMM3, XMM0 ;
  2602. MULPS XMM4, XMM0 ;
  2603. MOVAPS XMM5, [RDX] ;
  2604. ADDPS XMM1, XMM5 ;
  2605. MOVAPS [RDX], XMM1 ;
  2606. MOVAPS XMM6, [RDX+16] ;
  2607. ADDPS XMM2, XMM6 ;
  2608. MOVAPS [RDX+16], XMM2 ;
  2609. MOVAPS XMM7, [RDX+32] ;
  2610. ADDPS XMM3, XMM7 ;
  2611. MOVAPS [RDX+32], XMM3 ;
  2612. MOVAPS XMM5, [RDX+48] ;
  2613. ADDPS XMM4, XMM5 ;
  2614. MOVAPS [RDX+48], XMM4 ;
  2615. ADD RDX, 64 ;
  2616. SUB RAX, 16 ;
  2617. JMP aligned16 ;
  2618. ; LOOP FOR 2 pieces aligned
  2619. aligned4: ;
  2620. CMP RAX, 4 ;
  2621. JL singlepieces ; len < 2- > EXIT TO singlepieces
  2622. MOVAPS XMM1, [RBX] ;
  2623. ADD RBX, 16 ;
  2624. MULPS XMM1, XMM0 ;
  2625. MOVAPS XMM2, [RDX] ;
  2626. ADDPS XMM1, XMM2 ;
  2627. MOVAPS [RDX], XMM1 ;
  2628. ADD RDX, 16 ;
  2629. SUB RAX, 4 ;
  2630. JMP aligned4 ;
  2631. ; LOOP FOR 16 unaligned pieces(20 pieces not better!)
  2632. unaligned: ;
  2633. unaligned16: ;
  2634. CMP RAX, 16 ;
2635. JL unaligned4 ; len < 16 -> EXIT TO unaligned4
  2636. MOVUPS XMM1, [RBX] ;
  2637. MOVUPS XMM2, [RBX+16] ;
  2638. MOVUPS XMM3, [RBX+32] ;
  2639. MOVUPS XMM4, [RBX+48] ;
  2640. ADD RBX, 64
  2641. MULPS XMM1, XMM0 ;
  2642. MULPS XMM2, XMM0 ;
  2643. MULPS XMM3, XMM0 ;
  2644. MULPS XMM4, XMM0 ;
  2645. MOVUPS XMM5, [RDX] ;
  2646. ADDPS XMM1, XMM5 ;
  2647. MOVUPS [RDX], XMM1 ;
  2648. MOVUPS XMM6, [RDX+16] ;
  2649. ADDPS XMM2, XMM6 ;
  2650. MOVUPS [RDX+16], XMM2 ;
  2651. MOVUPS XMM7, [RDX+32] ;
  2652. ADDPS XMM3, XMM7 ;
  2653. MOVUPS [RDX+32], XMM3 ;
  2654. MOVUPS XMM5, [RDX+48] ;
  2655. ADDPS XMM4, XMM5 ;
  2656. MOVUPS [RDX+48], XMM4 ;
  2657. ADD RDX, 64 ;
  2658. SUB RAX, 16 ;
  2659. JMP unaligned16 ;
  2660. ; LOOP FOR 2 pieces unaligned
  2661. unaligned4: ;
  2662. CMP RAX, 4 ;
  2663. JL singlepieces ; len < 2- > EXIT
  2664. MOVUPS XMM1, [RBX] ;
  2665. ADD RBX, 16 ;
  2666. MULPS XMM1, XMM0 ;
  2667. MOVUPS XMM2, [RDX] ;
  2668. ADDPS XMM1, XMM2 ;
  2669. MOVUPS [RDX], XMM1 ;
  2670. ADD RDX, 16 ;
  2671. SUB RAX, 4 ;
  2672. JMP unaligned4 ;
  2673. ; one piece left OR non-contiguous data
  2674. single:
  2675. singlepieces: ;
  2676. CMP RAX, 0 ;
  2677. JLE endL ; len <= 0- > EXIT
  2678. MOVSS XMM1, [RBX]
  2679. ADD RBX, [RBP+linc] ; INC(ladr, incl)
  2680. MULSS XMM1, XMM0
  2681. MOVSS XMM2, [RDX] ;
  2682. ADDSS XMM1, XMM2 ;
  2683. MOVSS [RDX], XMM1
2684. ADD RDX, [RBP+dinc] ; INC(dadr, incd)
  2685. DEC RAX ; DEC(len)
  2686. JMP singlepieces ;
  2687. endL:
  2688. END IncMulARSRLoopSSE;
  2689. (*
  2690. PROCEDURE AlignedSPXSSE5( ladr, radr, dadr, stride, incd, len: LONGINT );
  2691. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  2692. ; ; register initialization
  2693. MOV RDX, [RBP+dadr] ; RDX reserved for dadr
  2694. MOV RBX, [RBP+ladr] ; RBX reserved for ladr
  2695. MOV RSI, [RBP+radr] ; RSI reserved for radr
  2696. MOV RAX, [RBP+len] ; RAX reserverd for length
  2697. MOV RCX, [RBP+stride] ; RCX reserved for stride
  2698. XORPD XMM2, XMM2 ;
  2699. XORPD XMM3, XMM3 ;
  2700. XORPD XMM4, XMM4 ;
  2701. XORPD XMM5, XMM5 ;
  2702. XORPD XMM6, XMM6 ;
  2703. XOR RDI, RDI ;
  2704. aligned4:
  2705. CMP RAX, 4 ;
  2706. JL aligned2 ; ; len < 4- > exit to singlepieces
  2707. MOV RSI, [RBP+radr] ;
  2708. ADD RSI, RDI ;
  2709. MOVAPD XMM7, [RBX] ;
  2710. MOVAPD XMM0, [RSI] ;
  2711. ADD RSI, RCX ;
  2712. MOVAPD XMM1, [RSI] ;
  2713. MULPD XMM0, XMM7 ;
  2714. ADDPD XMM2, XMM0 ;
  2715. ADD RSI, RCX ;
  2716. MOVAPD XMM0, [RSI] ;
  2717. MULPD XMM1, XMM7 ;
  2718. ADDPD XMM3, XMM1 ;
  2719. ADD RSI, RCX ;
  2720. MOVAPD XMM1, [RSI] ;
  2721. MULPD XMM0, XMM7 ;
  2722. ADDPD XMM4, XMM0 ;
  2723. ADD RSI, RCX ;
  2724. MOVAPD XMM0, [RSI] ;
  2725. MULPD XMM1, XMM7 ;
  2726. ADDPD XMM5, XMM1 ;
  2727. MULPD XMM0, XMM7 ;
  2728. ADDPD XMM6, XMM0 ;
  2729. ADD RBX, 16 ;
  2730. ADD RDI, 16 ;
  2731. MOV RSI, [RBP+radr] ;
  2732. ADD RSI, RDI ;
  2733. MOVAPD XMM7, [RBX] ;
  2734. MOVAPD XMM0, [RSI] ;
  2735. ADD RSI, RCX ;
  2736. MOVAPD XMM1, [RSI] ;
  2737. MULPD XMM0, XMM7 ;
  2738. ADDPD XMM2, XMM0 ;
  2739. ADD RSI, RCX ;
  2740. MOVAPD XMM0, [RSI] ;
  2741. MULPD XMM1, XMM7 ;
  2742. ADDPD XMM3, XMM1 ;
  2743. ADD RSI, RCX ;
  2744. MOVAPD XMM1, [RSI] ;
  2745. MULPD XMM0, XMM7 ;
  2746. ADDPD XMM4, XMM0 ;
  2747. ADD RSI, RCX ;
  2748. MOVAPD XMM0, [RSI] ;
  2749. MULPD XMM1, XMM7 ;
  2750. ADDPD XMM5, XMM1 ;
  2751. MULPD XMM0, XMM7 ;
  2752. ADDPD XMM6, XMM0 ;
  2753. ADD RBX, 16 ;
  2754. ADD RDI, 16 ;
  2755. SUB RAX, 4 ;
  2756. JMP aligned4 ;
  2757. aligned2:
  2758. CMP RAX, 2 ;
  2759. JL horizontaladd ; ; len < 4- > exit to singlepieces
  2760. MOV RSI, [RBP+radr] ;
  2761. ADD RSI, RDI ;
  2762. MOVAPD XMM7, [RBX] ;
  2763. MOVAPD XMM0, [RSI] ;
  2764. ADD RSI, RCX ;
  2765. MOVAPD XMM1, [RSI] ;
  2766. MULPD XMM0, XMM7 ;
  2767. ADDPD XMM2, XMM0 ;
  2768. ADD RSI, RCX ;
  2769. MOVAPD XMM0, [RSI] ;
  2770. MULPD XMM1, XMM7 ;
  2771. ADDPD XMM3, XMM1 ;
  2772. ADD RSI, RCX ;
  2773. MOVAPD XMM1, [RSI] ;
  2774. MULPD XMM0, XMM7 ;
  2775. ADDPD XMM4, XMM0 ;
  2776. ADD RSI, RCX ;
  2777. MOVAPD XMM0, [RSI] ;
  2778. MULPD XMM1, XMM7 ;
  2779. ADDPD XMM5, XMM1 ;
  2780. MULPD XMM0, XMM7 ;
  2781. ADDPD XMM6, XMM0 ;
  2782. ADD RBX, 16 ;
  2783. ADD RDI, 16 ;
  2784. SUB RAX, 2 ;
  2785. JMP aligned2 ;
  2786. horizontaladd: ;
  2787. MOVAPD XMM1, XMM2 ;
  2788. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  2789. ADDPD XMM2, XMM1 ;
  2790. MOVAPD XMM1, XMM3 ;
  2791. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  2792. ADDPD XMM3, XMM1 ;
  2793. MOVAPD XMM1, XMM4 ;
  2794. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  2795. ADDPD XMM4, XMM1 ;
  2796. MOVAPD XMM1, XMM5 ;
  2797. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  2798. ADDPD XMM5, XMM1 ;
  2799. MOVAPD XMM1, XMM6 ;
  2800. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  2801. ADDPD XMM6, XMM1 ;
  2802. singlepieces: ;
  2803. CMP RAX, 0 ;
  2804. JLE store ; len <= 0- > exit
  2805. MOV RSI, [RBP+radr] ;
  2806. MOVSD XMM7, [RBX] ;
  2807. MOVSD XMM0, [RSI+RDI] ;
  2808. ADD RSI, RCX ;
  2809. MOVSD XMM1, [RSI+RDI] ;
  2810. MULSD XMM0, XMM7 ;
  2811. ADDSD XMM2, XMM0 ;
  2812. ADD RSI, RCX ;
  2813. MOVSD XMM0, [RSI+RDI] ;
  2814. MULSD XMM1, XMM7 ;
  2815. ADDSD XMM3, XMM1 ;
  2816. ADD RSI, RCX ;
  2817. MOVSD XMM1, [RSI+RDI] ;
  2818. MULSD XMM0, XMM7 ;
  2819. ADDSD XMM4, XMM0 ;
  2820. ADD RSI, RCX ;
  2825. MOVSD XMM0, [RSI+RDI] ;
  2826. MULSD XMM1, XMM7 ;
  2827. ADDSD XMM5, XMM1 ;
  2828. MULSD XMM0, XMM7 ;
  2829. ADDSD XMM6, XMM0 ;
2830. ADD RBX, 8 (* INC(ladr,incl) *)
2831. ADD RDI, 8 (* INC(radr,incr) *)
  2832. DEC RAX ; DEC(len)
  2833. JMP singlepieces ;
  2834. store:
  2835. MOVSD [RDX], XMM2 ;
  2836. ADD RDX, [RBP+incd] ;
  2837. MOVSD [RDX], XMM3 ;
  2838. ADD RDX, [RBP+incd] ;
  2839. MOVSD [RDX], XMM4 ;
  2840. ADD RDX, [RBP+incd] ;
  2841. MOVSD [RDX], XMM5 ;
  2842. ADD RDX, [RBP+incd] ;
  2843. MOVSD [RDX], XMM6 ;
  2844. end:
  2845. END AlignedSPXSSE5;
  2846. *)
  2847. (* sse version of scalar product *)
  2848. PROCEDURE AlignedSPXSSE( ladr, radr, dadr: ADDRESS; len: SIZE;
  2849. add: BOOLEAN );
  2850. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  2851. ; register initialization
2852. MOV RAX, [RBP+len] ; RAX reserved FOR length
  2853. MOV RBX, [RBP+ladr] ; RBX reserved FOR ladr
  2854. MOV RCX, [RBP+radr] ; RCX reserved FOR radr
  2855. MOV RDX, [RBP+dadr] ; RDX reserved FOR dadr
  2856. XORPD XMM0, XMM0 ;
  2857. CMP [RBP+add], 0 ; add?
  2858. JE aligned8 ; no add
  2859. MOVSD XMM0, [RDX] ;
  2860. aligned8:
  2861. CMP RAX, 8 ;
2862. JL aligned2 ; len < 8 -> EXIT TO aligned2
  2863. MOVAPD XMM1, [RBX] ;
  2864. MOVAPD XMM2, [RBX+16] ;
  2865. MOVAPD XMM3, [RBX+32] ;
  2866. MOVAPD XMM4, [RCX] ;
  2867. MOVAPD XMM5, [RCX+16] ;
  2868. MOVAPD XMM6, [RCX+32] ;
  2869. MULPD XMM1, XMM4 ;
  2870. ADDPD XMM0, XMM1 ;
  2871. MULPD XMM2, XMM5 ;
  2872. ADDPD XMM0, XMM2 ;
  2873. MULPD XMM3, XMM6 ;
  2874. ADDPD XMM0, XMM3 ;
  2875. MOVAPD XMM7, [RBX+48] ;
  2876. MOVAPD XMM1, [RCX+48] ;
  2877. MULPD XMM1, XMM7 ;
  2878. ADDPD XMM0, XMM1 ;
  2879. ADD RBX, 64 ;
  2880. ADD RCX, 64 ;
  2881. SUB RAX, 8 ;
  2882. JMP aligned8 ;
2883. ; LOOP FOR 4 pieces aligned
  2884. aligned4:
  2885. CMP RAX, 4 ;
2886. JL aligned2 ; len < 4 -> EXIT TO aligned2
  2887. MOVAPD XMM1, [RBX] ;
  2888. MOVAPD XMM2, [RCX] ;
  2889. MOVAPD XMM3, [RBX+16] ;
  2890. MOVAPD XMM4, [RCX+16] ;
  2891. MULPD XMM1, XMM2 ;
  2892. ADDPD XMM0, XMM1 ;
  2893. MULPD XMM3, XMM4 ;
  2894. ADDPD XMM0, XMM3 ;
  2895. ADD RBX, 32 ;
  2896. ADD RCX, 32 ;
  2897. SUB RAX, 4 ;
  2898. JMP aligned4 ;
  2899. aligned2:
  2900. CMP RAX, 2 ;
2901. JL horizontaladd ; len < 2 -> EXIT TO horizontaladd
  2902. MOVAPD XMM1, [RBX] ;
  2903. MOVAPD XMM2, [RCX] ;
  2904. MULPD XMM1, XMM2 ;
  2905. ADDPD XMM0, XMM1 ;
  2906. ADD RBX, 16 ;
  2907. ADD RCX, 16 ;
  2908. SUB RAX, 2 ;
  2909. JMP aligned2 ;
  2910. horizontaladd: ;
  2911. MOVAPD XMM1, XMM0 ;
  2912. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  2913. ADDPD XMM0, XMM1 ;
  2914. singlepieces: ;
  2915. CMP RAX, 0 ;
  2916. JLE store ; len <= 0- > EXIT
  2917. MOVSD XMM1, [RBX]
  2918. MOVSD XMM2, [RCX]
  2919. MULSD XMM1, XMM2
  2920. ADDSD XMM0, XMM1
  2921. ADD RBX, 8 ; INC(ladr, incl)
  2922. ADD RCX, 8 ; INC(radr, incr)
  2923. DEC RAX ; DEC(len)
  2924. JMP singlepieces ;
  2925. store:
  2926. MOVSD [RDX], XMM0 ;
  2927. endL:
  2928. END AlignedSPXSSE;
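(* Reference sketch: AlignedSPXSSE forms the scalar product of two contiguous LONGREAL
	vectors of length len and writes it to dadr; when add is set, the previous value at
	dadr is included in the sum. A plain Oberon equivalent (hypothetical name) would be:
PROCEDURE ScalarProductXPlain( ladr, radr, dadr: ADDRESS; len: SIZE; add: BOOLEAN );
VAR sum, l, r: LONGREAL;
BEGIN
	IF add THEN SYSTEM.GET( dadr, sum ) ELSE sum := 0 END;
	WHILE len > 0 DO
		SYSTEM.GET( ladr, l ); SYSTEM.GET( radr, r );
		sum := sum + l*r;
		INC( ladr, 8 ); INC( radr, 8 ); DEC( len );
	END;
	SYSTEM.PUT( dadr, sum );
END ScalarProductXPlain;
*)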
  2929. (*
  2930. PROCEDURE AlignedSPRSSE5( ladr, radr, dadr, stride, incd, len: LONGINT );
  2931. CODE {SYSTEM.AMD64, SYSTEM.SSE}
  2932. ; register initialization
  2933. MOV RDX, [RBP+dadr] ; RDX reserved for dadr
  2934. MOV RBX, [RBP+ladr] ; RBX reserved for ladr
2935. MOV RSI, [RBP+radr] ; RSI reserved for radr
2936. MOV RAX, [RBP+len] ; RAX reserved for length
  2937. MOV RCX, [RBP+stride] ;
  2938. XORPS XMM2, XMM2 ;
  2939. XORPS XMM3, XMM3 ;
  2940. XORPS XMM4, XMM4 ;
  2941. XORPS XMM5, XMM5 ;
  2942. XORPS XMM6, XMM6 ;
  2943. XOR RDI, RDI ;
  2944. aligned8:
  2945. CMP RAX, 8 ;
2946. JL aligned4 ; len < 8 -> exit to aligned4
  2947. PREFETCH0 24[RBX] ;
  2948. ; PREFETCH0[RSI] ;
  2949. MOV RSI, [RBP+radr] ;
  2950. ADD RSI, RDI ;
  2951. MOVAPS XMM7, [RBX] ;
  2952. MOVAPS XMM0, [RSI] ;
  2953. ADD RSI, RCX ;
  2954. MOVAPS XMM1, [RSI] ;
  2955. MULPS XMM0, XMM7 ;
  2956. ADDPS XMM2, XMM0 ;
  2957. ADD RSI, RCX ;
  2958. MOVAPS XMM0, [RSI] ;
  2959. MULPS XMM1, XMM7 ;
  2960. ADDPS XMM3, XMM1 ;
  2961. ADD RSI, RCX ;
  2962. MOVAPS XMM1, [RSI] ;
  2963. MULPS XMM0, XMM7 ;
  2964. ADDPS XMM4, XMM0 ;
  2965. ADD RSI, RCX ;
  2966. MOVAPS XMM0, [RSI] ;
  2967. MULPS XMM1, XMM7 ;
  2968. ADDPS XMM5, XMM1 ;
  2969. MULPS XMM0, XMM7 ;
  2970. ADDPS XMM6, XMM0 ;
  2971. ADD RBX, 16 ;
  2972. ADD RDI, 16 ;
  2973. MOV RSI, [RBP+radr] ;
  2974. ADD RSI, RDI ;
  2975. MOVAPS XMM7, [RBX] ;
  2976. MOVAPS XMM0, [RSI] ;
  2977. ADD RSI, RCX ;
  2978. MOVAPS XMM1, [RSI] ;
  2979. MULPS XMM0, XMM7 ;
  2980. ADDPS XMM2, XMM0 ;
  2981. ADD RSI, RCX ;
  2982. MOVAPS XMM0, [RSI] ;
  2983. MULPS XMM1, XMM7 ;
  2984. ADDPS XMM3, XMM1 ;
  2985. ADD RSI, RCX ;
  2986. MOVAPS XMM1, [RSI] ;
  2987. MULPS XMM0, XMM7 ;
  2988. ADDPS XMM4, XMM0 ;
  2989. ADD RSI, RCX ;
  2990. MOVAPS XMM0, [RSI] ;
  2991. MULPS XMM1, XMM7 ;
  2992. ADDPS XMM5, XMM1 ;
  2993. MULPS XMM0, XMM7 ;
  2994. ADDPS XMM6, XMM0 ;
  2995. ADD RBX, 16 ;
  2996. ADD RDI, 16 ;
  2997. SUB RAX, 8 ;
  2998. JMP aligned8 ;
  2999. aligned4:
  3000. CMP RAX, 4 ;
3001. JL horizontaladd ; len < 4 -> exit to horizontaladd
  3002. MOV RSI, [RBP+radr] ;
  3003. ADD RSI, RDI ;
  3004. MOVAPS XMM7, [RBX] ;
  3005. MOVAPS XMM0, [RSI] ;
  3006. ADD RSI, RCX ;
  3007. MOVAPS XMM1, [RSI] ;
  3008. MULPS XMM0, XMM7 ;
  3009. ADDPS XMM2, XMM0 ;
  3010. ADD RSI, RCX ;
  3011. MOVAPS XMM0, [RSI] ;
  3012. MULPS XMM1, XMM7 ;
  3013. ADDPS XMM3, XMM1 ;
  3014. ADD RSI, RCX ;
  3015. MOVAPS XMM1, [RSI] ;
  3016. MULPS XMM0, XMM7 ;
  3017. ADDPS XMM4, XMM0 ;
  3018. ADD RSI, RCX ;
  3019. MOVAPS XMM0, [RSI] ;
  3020. MULPS XMM1, XMM7 ;
  3021. ADDPS XMM5, XMM1 ;
  3022. MULPS XMM0, XMM7 ;
  3023. ADDPS XMM6, XMM0 ;
  3024. ADD RBX, 16 ;
  3025. ADD RDI, 16 ;
  3026. SUB RAX, 4 ;
  3027. JMP aligned4 ;
  3028. horizontaladd: ;
  3029. MOVLHPS XMM1, XMM2 ;
  3030. ADDPS XMM1, XMM2 ;
  3031. SHUFPS XMM2, XMM1, 48 ;
  3032. ADDPS XMM2, XMM1 ;
  3033. MOVHLPS XMM2, XMM2 ;
  3034. MOVLHPS XMM1, XMM3 ;
  3035. ADDPS XMM1, XMM3 ;
  3036. SHUFPS XMM3, XMM1, 48 ;
  3037. ADDPS XMM3, XMM1 ;
  3038. MOVHLPS XMM3, XMM3 ;
  3039. MOVLHPS XMM1, XMM4 ;
  3040. ADDPS XMM1, XMM4 ;
  3041. SHUFPS XMM4, XMM1, 48 ;
  3042. ADDPS XMM4, XMM1 ;
  3043. MOVHLPS XMM4, XMM4 ;
  3044. MOVLHPS XMM1, XMM5 ;
  3045. ADDPS XMM1, XMM5 ;
  3046. SHUFPS XMM5, XMM1, 48 ;
  3047. ADDPS XMM5, XMM1 ;
  3048. MOVHLPS XMM5, XMM5 ;
  3049. MOVLHPS XMM1, XMM6 ;
  3050. ADDPS XMM1, XMM6 ;
  3051. SHUFPS XMM6, XMM1, 48 ;
  3052. ADDPS XMM6, XMM1 ;
  3053. MOVHLPS XMM6, XMM6 ;
  3054. singlepieces: ;
  3055. CMP RAX, 0 ;
  3056. JLE store ; len <= 0- > exit
  3057. MOV RSI, [RBP+radr] ;
  3058. MOVSS XMM7, [RBX] ;
  3059. MOVSS XMM0, [RSI+RDI] ;
  3060. ADD RSI, RCX ;
  3061. MOVSS XMM1, [RSI+RDI] ;
  3062. MULSS XMM0, XMM7 ;
  3063. ADDSS XMM2, XMM0 ;
  3064. ADD RSI, RCX ;
  3065. MOVSS XMM0, [RSI+RDI] ;
  3066. MULSS XMM1, XMM7 ;
  3067. ADDSS XMM3, XMM1 ;
  3068. ADD RSI, RCX ;
  3069. MOVSS XMM1, [RSI+RDI] ;
  3070. MULSS XMM0, XMM7 ;
  3071. ADDSS XMM4, XMM0 ;
  3072. ADD RSI, RCX ;
  3073. MOVSS XMM0, [RSI+RDI] ;
  3074. MULSS XMM1, XMM7 ;
  3075. ADDSS XMM5, XMM1 ;
  3076. MULSS XMM0, XMM7 ;
  3077. ADDSS XMM6, XMM0 ;
  3078. ADD RBX, 4 (* INC(ladr,incl) *)
  3079. ADD RDI, 4 (* INC(radr,incr) *)
  3080. DEC RAX ; DEC(len)
  3081. JMP singlepieces ;
  3082. store:
  3083. MOVSS [RDX], XMM2 ;
  3084. ADD RDX, [RBP+incd] ;
  3085. MOVSS [RDX], XMM3 ;
  3086. ADD RDX, [RBP+incd] ;
  3087. MOVSS [RDX], XMM4 ;
  3088. ADD RDX, [RBP+incd] ;
  3089. MOVSS [RDX], XMM5 ;
  3090. ADD RDX, [RBP+incd] ;
  3091. MOVSS [RDX], XMM6 ;
  3092. end:
  3093. END AlignedSPRSSE5;
  3094. *)
  3095. PROCEDURE AlignedSPRSSE( ladr, radr, dadr: ADDRESS; len: SIZE;
  3096. add: BOOLEAN );
  3097. CODE {SYSTEM.AMD64, SYSTEM.SSE}
  3098. ; register initialization
  3099. MOV RDX, [RBP+dadr] ; RDX reserved FOR dadr
  3100. MOV RBX, [RBP+ladr] ; RBX reserved FOR ladr
  3101. MOV RCX, [RBP+radr] ; RCX reserved FOR radr
3102. MOV RAX, [RBP+len] ; RAX reserved FOR length
  3103. XORPS XMM0, XMM0 ;
  3104. CMP [RBP+add], 0 ; add?
  3105. JE aligned16 ; no add
  3106. MOVSS XMM0, [RDX] ;
  3107. aligned16:
  3108. CMP RAX, 16 ;
3109. JL aligned8 ; len < 16 -> EXIT TO aligned8
  3110. MOVAPS XMM1, [RBX] ;
  3111. MOVAPS XMM4, [RCX] ;
  3112. MOVAPS XMM2, [RBX+16] ;
  3113. MOVAPS XMM5, [RCX+16] ;
  3114. MULPS XMM1, XMM4 ;
  3115. ADDPS XMM0, XMM1 ;
  3116. MOVAPS XMM3, [RBX+32] ;
  3117. MOVAPS XMM6, [RCX+32] ;
  3118. MULPS XMM2, XMM5 ;
  3119. ADDPS XMM0, XMM2 ;
  3120. MOVAPS XMM7, [RBX+48] ;
  3121. MOVAPS XMM1, [RCX+48] ;
  3122. MULPS XMM3, XMM6 ;
  3123. ADDPS XMM0, XMM3 ;
  3124. MULPS XMM1, XMM7 ;
  3125. ADDPS XMM0, XMM1 ;
  3126. ADD RBX, 64 ;
  3127. ADD RCX, 64 ;
  3128. SUB RAX, 16 ;
  3129. JMP aligned16 ;
  3130. ; LOOP FOR 8 pieces aligned
  3131. aligned8:
  3132. CMP RAX, 8 ;
3133. JL aligned4 ; len < 8 -> EXIT TO aligned4
  3134. MOVAPS XMM1, [RBX] ;
  3135. MOVAPS XMM4, [RCX] ;
  3136. MOVAPS XMM2, [RBX+16] ;
  3137. MOVAPS XMM5, [RCX+16] ;
  3138. MULPS XMM1, XMM4 ;
  3139. ADDPS XMM0, XMM1 ;
  3140. MULPS XMM2, XMM5 ;
  3141. ADDPS XMM0, XMM2 ;
  3142. ADD RBX, 32 ;
  3143. ADD RCX, 32 ;
  3144. SUB RAX, 8 ;
  3145. JMP aligned8 ;
  3146. aligned4:
  3147. CMP RAX, 4 ;
3148. JL horizontaladd ; len < 4 -> EXIT TO horizontaladd
  3149. MOVAPS XMM1, [RBX] ;
  3150. MOVAPS XMM2, [RCX] ;
  3151. MULPS XMM1, XMM2 ;
  3152. ADDPS XMM0, XMM1 ;
  3153. ADD RBX, 16 ;
  3154. ADD RCX, 16 ;
  3155. SUB RAX, 4 ;
  3156. JMP aligned4 ;
  3157. horizontaladd: ;
  3158. MOVAPS XMM1, XMM0 ;
  3159. ; 1*0 (* dest 0 -> dest 0 *) + 4*1 (* dest 1 -> dest 1 *) +16*0 (* src 0 -> dest 2 *) +64*1 (* src 1 -> dest 3 *) ;
  3160. SHUFPS XMM1, XMM1, 1*0 +4*1 +16*0 +64*1
  3161. ADDPS XMM1, XMM0 ;
  3162. MOVAPS XMM0, XMM1
  3163. SHUFPS XMM0, XMM0, 16*3 ; src 3- > dest 2
  3164. ADDPS XMM0, XMM1 ;
  3165. SHUFPS XMM0, XMM0, 1*2 ; dest 2- > dest 0
  3166. singlepieces: ;
  3167. CMP RAX, 0 ;
  3168. JLE store ; len <= 0- > EXIT
  3169. MOVSS XMM1, [RBX]
  3170. MOVSS XMM2, [RCX]
  3171. MULSS XMM1, XMM2
  3172. ADDSS XMM0, XMM1
  3173. ADD RBX, 4 ; INC(ladr, incl)
  3174. ADD RCX, 4 ; INC(radr, incr)
  3175. DEC RAX ; DEC(len)
  3176. JMP singlepieces ;
  3177. store:
  3178. MOVSS [RDX], XMM0 ;
  3179. endL:
  3180. END AlignedSPRSSE;
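(* Worked note on the horizontal add above: the SHUFPS immediate packs four 2-bit source
	selectors with weights 1, 4, 16 and 64 for destination elements 0..3, so
		1*0 + 4*1 + 16*0 + 64*1 = 68 : dest 0/1 keep elements 0/1, source element 0 -> dest 2, source element 1 -> dest 3
		16*3 = 48 : source element 3 -> dest 2 (the other selectors are 0)
		1*2 = 2 : element 2 -> dest 0
	After the two ADDPS steps the total of the four partial sums sits in element 2, and the
	final SHUFPS moves it to element 0, where MOVSS stores it to [RDX]. *)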
  3181. (*
  3182. (* sse version of scalar product *)
  3183. PROCEDURE AlignedSPRSSE( ladr, radr, dadr, rows, stride, dinc, len: LONGINT );
  3184. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  3185. ; register initialization
  3186. MOV RDI, [RBP+radr] ; radr start
  3187. MOV RDX, [RBP+dadr] ; RDX reserved for dadr
  3188. MOV RSI, [RBP+rows] ; outer loop counter
  3189. outerloop:
  3190. CMP RSI, 0 ;
  3191. JLE end ;
  3192. MOV RBX, [RBP+ladr] ; RBX reserved for ladr
  3193. MOV RCX, RDI ; RCX reserved for radr
3194. MOV RAX, [RBP+len] ; RAX reserved for length
  3195. XORPS XMM0, XMM0 ;
  3196. aligned16:
  3197. CMP RAX, 16 ;
3198. JL aligned8 ; len < 16 -> exit to aligned8
  3199. MOVAPS XMM1, [RBX] ;
  3200. MOVAPS XMM2, [RBX+16] ;
  3201. MOVAPS XMM3, [RBX+32] ;
  3202. MOVAPS XMM4, [RCX] ;
  3203. MOVAPS XMM5, [RCX+16] ;
  3204. MOVAPS XMM6, [RCX+32] ;
  3205. MULPS XMM1, XMM4 ;
  3206. ADDPS XMM0, XMM1 ;
  3207. MULPS XMM2, XMM5 ;
  3208. ADDPS XMM0, XMM2 ;
  3209. MULPS XMM3, XMM6 ;
  3210. ADDPS XMM0, XMM3 ;
  3211. MOVAPS XMM7, [RBX+48] ;
  3212. MOVAPS XMM1, [RCX+48] ;
  3213. MULPS XMM1, XMM7 ;
  3214. ADDPS XMM0, XMM1 ;
  3215. ADD RBX, 64 ;
  3216. ADD RCX, 64 ;
  3217. SUB RAX, 16 ;
  3218. JMP aligned16 ;
  3219. ; loop for 8 pieces aligned
  3220. aligned8:
  3221. CMP RAX, 8 ;
3222. JL aligned4 ; len < 8 -> exit to aligned4
  3223. MOVAPS XMM1, [RBX] ;
  3224. MOVAPS XMM2, [RBX+16] ;
  3225. MOVAPS XMM4, [RCX] ;
  3226. MOVAPS XMM5, [RCX+16] ;
  3227. MULPS XMM1, XMM4 ;
  3228. ADDPS XMM0, XMM1 ;
  3229. MULPS XMM2, XMM5 ;
  3230. ADDPS XMM0, XMM2 ;
  3231. ADD RBX, 32 ;
  3232. ADD RCX, 32 ;
  3233. SUB RAX, 8 ;
  3234. JMP aligned8 ;
  3235. aligned4:
  3236. CMP RAX, 4 ;
3237. JL horizontaladd ; len < 4 -> exit to horizontaladd
  3238. MOVAPS XMM1, [RBX] ;
  3239. MOVAPS XMM2, [RCX] ;
  3240. MULPS XMM1, XMM2 ;
  3241. ADDPS XMM0, XMM1 ;
  3242. ADD RBX, 16 ;
  3243. ADD RCX, 16 ;
  3244. SUB RAX, 4 ;
  3245. JMP aligned4 ;
  3246. horizontaladd: ;
  3247. MOVAPS XMM1, XMM0 ;
  3248. SHUFPS XMM1, XMM1, 1*0 (* dest 0 -> dest 0 *) +4*1 (* dest 1 -> dest 1 *) +16*0 (* src 0 -> dest 2 *) +64*1 (* src 1 -> dest 3 *) ;
  3249. ADDPS XMM1, XMM0 ;
  3250. MOVAPS XMM0, XMM1
  3251. SHUFPS XMM0, XMM0, 16*3 ; (* src 3-> dest 2 *)
  3252. ADDPS XMM0, XMM1 ;
  3253. SHUFPS XMM0, XMM0, 1*2 ; (* dest 2 -> dest 0 *)
  3254. singlepieces: ;
  3255. CMP RAX, 0 ;
  3256. JLE store ; len <= 0- > exit
  3257. MOVSS XMM1, [RBX]
  3258. MOVSS XMM2, [RCX]
  3259. MULSS XMM1, XMM2
  3260. ADDSS XMM0, XMM1
  3261. ADD RBX, 4 (* INC(ladr,incl) *)
  3262. ADD RCX, 4 (* INC(radr,incr) *)
  3263. DEC RAX ; DEC(len)
  3264. JMP singlepieces ;
  3265. store:
  3266. MOVSS [RDX], XMM0 ;
  3267. ADD RDX, [RBP+dinc] ;
  3268. ADD RDI, [RBP+stride] ;
  3269. DEC RSI ;
  3270. JMP outerloop ;
  3271. end:
  3272. END AlignedSPRSSE;
  3273. *)
  3274. PROCEDURE Copy4( ladr, dadr: ADDRESS; linc, dinc, len: SIZE);
  3275. CODE {SYSTEM.AMD64}
3276. MOV RSI, [RBP+ladr] ; RSI := ladr
3277. MOV RDI, [RBP+dadr] ; RDI := dadr
3278. MOV RCX, [RBP+len] ; RCX := len
  3279. MOV RAX, [RBP+linc] ;
  3280. CMP RAX, 4 ;
  3281. JNE loopL ;
  3282. MOV RAX, [RBP+dinc] ;
  3283. CMP RAX, 4 ;
  3284. JNE loopL ;
  3285. fastmove:
  3286. CLD ; incremental
  3287. REP ;
3288. MOVSD ; copy len doublewords (4 bytes each)
  3289. JMP endL ;
  3290. loopL:
  3291. CMP RCX, 0 ;
  3292. JLE endL ; WHILE RCX > 0 DO
3293. MOV EAX, [RSI] ; EAX := SYSTEM.GET32(RSI)
3294. MOV [RDI], EAX ; SYSTEM.PUT32(RDI, EAX)
3295. ADD RSI, [RBP+linc] ; INC(RSI, linc)
3296. ADD RDI, [RBP+dinc] ; INC(RDI, dinc)
  3297. DEC RCX ; DEC(RCX)
  3298. JMP loopL
  3299. endL:
  3300. END Copy4;
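(* Reference sketch: Copy4 moves len 4-byte elements from ladr to dadr; when both increments
	are exactly 4 the data is contiguous and a single REP MOVSD does the whole copy, otherwise
	the element-wise loop is used. A plain Oberon equivalent of the slow path (hypothetical
	name, using REAL as a 4-byte carrier) would be:
PROCEDURE Copy4Plain( ladr, dadr: ADDRESS; linc, dinc, len: SIZE );
VAR v: REAL;
BEGIN
	WHILE len > 0 DO
		SYSTEM.GET( ladr, v ); SYSTEM.PUT( dadr, v );
		INC( ladr, linc ); INC( dadr, dinc ); DEC( len );
	END;
END Copy4Plain;
*)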
  3301. PROCEDURE Copy8( ladr, dadr: ADDRESS; linc, dinc, len: SIZE );
  3302. CODE {SYSTEM.AMD64}
3303. MOV RSI, [RBP+ladr] ; RSI := ladr
3304. MOV RDI, [RBP+dadr] ; RDI := dadr
3305. MOV RCX, [RBP+len] ; RCX := len
  3306. MOV RAX, [RBP+linc] ;
  3307. CMP RAX, 8 ;
  3308. JNE loopL ;
  3309. MOV RAX, [RBP+dinc] ;
  3310. CMP RAX, 8 ;
  3311. JNE loopL ;
  3312. fastmove:
3313. SHL RCX, 1 ; 2 doublewords per 8-byte element
3314. CLD ; incremental
3315. REP ;
3316. MOVSD ; copy 2*len doublewords (len 8-byte elements)
  3317. JMP endL ;
  3318. loopL:
  3319. CMP RCX, 0 ;
3320. JLE endL ; WHILE RCX > 0 DO
3321. MOV RAX, [RSI] ; RAX := SYSTEM.GET64(RSI)
3322. MOV [RDI], RAX ; SYSTEM.PUT64(RDI, RAX)
3323. ADD RSI, [RBP+linc] ; INC(RSI, linc)
3324. ADD RDI, [RBP+dinc] ; INC(RDI, dinc)
3325. DEC RCX ; DEC(RCX)
  3326. JMP loopL
  3327. endL:
  3328. END Copy8;
  3329. PROCEDURE Transpose4A( ladr, dadr: ADDRESS; lstride, linc, dstride, dinc, rows, cols: SIZE );
  3330. CODE {SYSTEM.AMD64}
  3331. startrows:
  3332. MOV RAX, [RBP+rows] ;
  3333. startouter:
  3334. CMP RAX, 0 ;
  3335. JLE endL ;
  3336. MOV RSI, [RBP+ladr] ;
  3337. MOV RDI, [RBP+dadr] ;
  3338. MOV RBX, [RBP+linc] ;
  3339. MOV RCX, [RBP+dstride] ;
  3340. MOV RAX, [RBP+cols] ;
  3341. startinner:
  3342. CMP RAX, 0 ;
  3343. JLE endinner ;
  3344. MOV RDX, [RSI] ;
  3345. MOV [RDI], RDX ;
  3346. ADD RSI, RBX ;
  3347. ADD RDI, RCX ;
  3348. DEC RAX ;
  3349. JMP startinner ;
  3350. endinner:
  3351. MOV RSI, [RBP+ladr] ;
  3352. ADD RSI, [RBP+lstride] ;
  3353. MOV [RBP+ladr], RSI
  3354. MOV RDI, [RBP+dadr] ;
  3355. ADD RDI, [RBP+dinc] ;
  3356. MOV [RBP+dadr], RDI ;
  3357. MOV RAX, [RBP+rows] ;
  3358. DEC RAX ;
  3359. MOV [RBP+rows], RAX ;
  3360. JMP startouter ;
  3361. endL:
  3362. END Transpose4A;
  3363. PROCEDURE Transpose4( ladr, dadr: ADDRESS; lstride, linc, dstride, dinc, rows, cols: SIZE );
  3364. VAR l, d, c: SIZE; BlockSize: SIZE;
  3365. BEGIN
  3366. BlockSize :=
3367. MIN( L2BlockSize DIV lstride, L2BlockSize DIV dstride );
  3368. BlockSize := MIN( BlockSize, L2BlockSize DIV linc );
  3369. BlockSize := MIN( BlockSize, L2BlockSize DIV dinc );
  3370. BlockSize := MAX( 8, BlockSize );
  3371. (* KernelLog.String("Blocksize = "); KernelLog.Int(BlockSize,10); KernelLog.Ln; *)
  3372. WHILE (rows >= BlockSize) DO
  3373. c := cols; l := ladr; d := dadr;
  3374. WHILE (c >= BlockSize) DO
  3375. Transpose4A( l, d, lstride, linc, dstride, dinc, BlockSize,
  3376. BlockSize );
  3377. DEC( c, BlockSize ); INC( l, BlockSize * linc );
  3378. INC( d, BlockSize * dstride );
  3379. END;
  3380. IF c > 0 THEN
  3381. Transpose4A( l, d, lstride, linc, dstride, dinc, BlockSize, c );
  3382. END;
  3383. DEC( rows, BlockSize ); INC( ladr, BlockSize * lstride );
  3384. INC( dadr, BlockSize * dinc );
  3385. END;
  3386. IF (rows > 0) THEN
  3387. c := cols; l := ladr; d := dadr;
  3388. WHILE (c >= BlockSize) DO
  3389. Transpose4A( l, d, lstride, linc, dstride, dinc, rows,
  3390. BlockSize );
  3391. DEC( c, BlockSize ); INC( l, BlockSize * linc );
  3392. INC( d, BlockSize * dstride );
  3393. END;
  3394. IF c > 0 THEN
  3395. Transpose4A( l, d, lstride, linc, dstride, dinc, rows, c );
  3396. END;
  3397. END;
  3398. END Transpose4;
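(* Blocking note: Transpose4 cuts the matrix into BlockSize x BlockSize tiles so that the
	source elements read and the destination elements written by Transpose4A stay cache
	resident while one tile is processed; with the hypothetical values L2BlockSize = 512*1024
	and lstride = linc = dinc = 4096 every MIN term yields 128, so 128 x 128 tiles are
	transposed one after the other, and the final MAX keeps the tile at least 8 wide for
	very large strides. *)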
  3399. PROCEDURE Transpose8( ladr, dadr: ADDRESS; lstride, linc, dstride, dinc, rows, cols: SIZE );
  3400. VAR l, d, c: SIZE; BlockSize: SIZE;
  3401. BEGIN
  3402. BlockSize :=
3403. MIN( L2BlockSize DIV lstride, L2BlockSize DIV dstride );
  3404. BlockSize := MIN( BlockSize, L2BlockSize DIV linc );
  3405. BlockSize := MIN( BlockSize, L2BlockSize DIV dinc );
  3406. BlockSize := MAX( 8, BlockSize );
  3407. (* KernelLog.String("Blocksize = "); KernelLog.Int(BlockSize,10); KernelLog.Ln; *)
  3408. WHILE (rows >= BlockSize) DO
  3409. c := cols; l := ladr; d := dadr;
  3410. WHILE (c >= BlockSize) DO
  3411. Transpose8A( l, d, lstride, linc, dstride, dinc, BlockSize,
  3412. BlockSize );
  3413. DEC( c, BlockSize ); INC( l, BlockSize * linc );
  3414. INC( d, BlockSize * dstride );
  3415. END;
  3416. IF c > 0 THEN
  3417. Transpose8A( l, d, lstride, linc, dstride, dinc, BlockSize, c );
  3418. END;
  3419. DEC( rows, BlockSize ); INC( ladr, lstride * BlockSize );
  3420. INC( dadr, dinc * BlockSize );
  3421. END;
  3422. IF (rows > 0) THEN
  3423. c := cols; l := ladr; d := dadr;
  3424. WHILE (c >= BlockSize) DO
  3425. Transpose8A( l, d, lstride, linc, dstride, dinc, rows,
  3426. BlockSize );
  3427. DEC( c, BlockSize ); INC( l, BlockSize * linc );
  3428. INC( d, BlockSize * dstride );
  3429. END;
  3430. IF c > 0 THEN
  3431. Transpose8A( l, d, lstride, linc, dstride, dinc, rows, c );
  3432. END;
  3433. END;
  3434. END Transpose8;
  3435. PROCEDURE Transpose8A( ladr, dadr: ADDRESS; lstride, linc, dstride, dinc, rows, cols: SIZE );
  3436. CODE {SYSTEM.AMD64}
  3437. startrows:
  3438. MOV RAX, [RBP+rows] ;
  3439. startouter:
  3440. CMP RAX, 0 ;
  3441. JLE endL ;
  3442. MOV RSI, [RBP+ladr] ;
  3443. MOV RDI, [RBP+dadr] ;
  3444. MOV RBX, [RBP+linc] ;
  3445. MOV RCX, [RBP+dstride] ;
  3446. MOV RAX, [RBP+cols] ;
  3447. startinner:
  3448. CMP RAX, 0 ;
  3449. JLE endinner ;
  3450. MOV RDX, [RSI] ;
  3451. MOV [RDI], RDX ;
  3452. MOV RDX, [RSI+4] ;
  3453. MOV [RDI+4], RDX ;
  3454. ADD RSI, RBX ;
  3455. ADD RDI, RCX ;
  3456. DEC RAX ;
  3457. JMP startinner ;
  3458. endinner:
  3459. MOV RSI, [RBP+ladr] ;
  3460. ADD RSI, [RBP+lstride] ;
  3461. MOV [RBP+ladr], RSI
  3462. MOV RDI, [RBP+dadr] ;
  3463. ADD RDI, [RBP+dinc] ;
  3464. MOV [RBP+dadr], RDI ;
  3465. MOV RAX, [RBP+rows] ;
  3466. DEC RAX ;
  3467. MOV [RBP+rows], RAX ;
  3468. JMP startouter ;
  3469. endL:
  3470. END Transpose8A;
  3471. PROCEDURE SSEMul24BlockR( VAR CbFirst: SIZE;
  3472. StrideA, StrideB, StrideC, Ca, Ra, Cb, Rb: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3473. add: BOOLEAN );
  3474. CODE {SYSTEM.AMD64, SYSTEM.SSE}
  3475. MatrixOfResultsSetup:
  3476. MOV RCX, 0 ; counter FOR rows IN A-Ra
  3477. RowOfResultsLoop:
  3478. MOV RBX, 0 ; counter FOR columns IN B-Cb
  3479. DotProductSetup:
  3480. MOV RSI, [RBP+matrixA] ; matrixA
  3481. MOV RDI, [RBP+matrixB] ; matrixB
  3482. LEA RDI, [RDI+RBX*4] ; current position IN matrixB
  3483. XORPS XMM2, XMM2
  3484. XORPS XMM3, XMM3
  3485. XORPS XMM4, XMM4
  3486. XORPS XMM5, XMM5
  3487. XORPS XMM6, XMM6
  3488. XORPS XMM7, XMM7
  3489. MOV RAX, 0 ;
  3490. MOV AL, [RBP+add] ;
  3491. CMP AL, 0 ; add?
  3492. JE DotProductLoop ;
  3493. MOV RAX, [RBP+matrixC] ; matrixC
  3494. LEA RAX, [RAX+RBX*4] ; adjust POINTER horizontally TO correct batch OF 24
  3495. MOVUPS XMM2, [RAX]
  3496. MOVUPS XMM3, [RAX+16]
  3497. MOVUPS XMM4, [RAX+32]
  3498. MOVUPS XMM5, [RAX+48]
  3499. MOVUPS XMM6, [RAX+64]
  3500. MOVUPS XMM7, [RAX+80]
  3501. MOV RAX, 0
  3502. DotProductLoop:
3503. MOV RDX, [RSI+RAX*4] ; load 8 bytes starting at the current entry OF A
3504. SHL RDX, 1 ; shift out the topmost sign bit
3505. CMP RDX, 0
3506. JE SparseEntryEscape ; remaining bits all zero -> entry IS zero, skip the update (sparse shortcut)
  3507. MOVSS XMM0, [RSI+RAX*4]
  3508. SHUFPS XMM0, XMM0, 0H
  3509. MOVUPS XMM1, [RDI]
  3510. MULPS XMM1, XMM0
  3511. ADDPS XMM2, XMM1
  3512. MOVUPS XMM1, [RDI+16]
  3513. MULPS XMM1, XMM0
  3514. ADDPS XMM3, XMM1
  3515. MOVUPS XMM1, [RDI+32]
  3516. MULPS XMM1, XMM0
  3517. ADDPS XMM4, XMM1
  3518. MOVUPS XMM1, [RDI+48]
  3519. MULPS XMM1, XMM0
  3520. ADDPS XMM5, XMM1
  3521. MOVUPS XMM1, [RDI+64]
  3522. MULPS XMM1, XMM0
  3523. ADDPS XMM6, XMM1
  3524. MOVUPS XMM1, [RDI+80]
  3525. MULPS XMM1, XMM0
  3526. ADDPS XMM7, XMM1
  3527. SparseEntryEscape:
  3528. ADD RDI, [RBP+StrideB] ; StrideB
  3529. INC RAX
  3530. CMP RAX, [RBP+Ca] ; Ca, could also compare TO Rb since they must be equal
  3531. JL DotProductLoop
3532. ; end DotProductLoop
  3533. MOV RAX, [RBP+matrixC] ; matrixC
  3534. LEA RAX, [RAX+RBX*4] ; adjust POINTER horizontally TO correct batch OF 24
  3535. MOVUPS [RAX], XMM2
  3536. MOVUPS [RAX+16], XMM3
  3537. MOVUPS [RAX+32], XMM4
  3538. MOVUPS [RAX+48], XMM5
  3539. MOVUPS [RAX+64], XMM6
  3540. MOVUPS [RAX+80], XMM7
  3541. ADD RBX, 24 ; move over TO next batch OF 24
  3542. MOV RDX, RBX
  3543. ADD RDX, 24
  3544. CMP RDX, [RBP+Cb] ; Cb, check TO see IF row IS complete
  3545. JLE DotProductSetup
3546. ; end RowOfResultsLoop
  3547. MOV RAX, [RBP+matrixA] ; matrixA
  3548. ADD RAX, [RBP+StrideA] ; StrideA
  3549. MOV [RBP+matrixA], RAX ; matrixA
  3550. MOV RAX, [RBP+matrixC] ; matrixC
  3551. ADD RAX, [RBP+StrideC] ; StrideC
  3552. MOV [RBP+matrixC], RAX ; matrixC
  3553. INC RCX
  3554. CMP RCX, [RBP+Ra] ; Ra
  3555. JL RowOfResultsLoop
  3556. Done:
  3557. MOV RAX, [RBP+CbFirst] ; CbFirst
  3558. MOV [RAX], RBX ;
  3559. END SSEMul24BlockR;
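(* Reference sketch: one pass of SSEMul24BlockR updates a batch of 24 consecutive columns
	of C per row of A, i.e. for each row i
		C[i, j .. j+23] := (C[i, j .. j+23] IF add) + SUM over k = 0 .. Ca-1 of A[i, k] * B[k, j .. j+23]
	with the six accumulators XMM2..XMM7 holding 4 columns each; on return CbFirst receives
	the first column index that was not covered by a full batch of 24. *)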
3560. (*! might be better to make a 10Block operation and utilize 2 registers for temporary calculations, see article about Emmerald*)
  3561. PROCEDURE SSEMul12BlockX( VAR CbFirst: SIZE;
  3562. StrideA, StrideB, StrideC, Ca, Ra, Cb, Rb: SIZE; matrixA, matrixB, matrixC :ADDRESS;
  3563. add: BOOLEAN );
  3564. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  3565. MatrixOfResultsSetup:
  3566. MOV RCX, 0 ; counter FOR rows IN A-Ra
  3567. RowOfResultsLoop:
  3568. MOV RBX, 0 ; counter FOR columns IN B-Cb
  3569. DotProductSetup:
  3570. MOV RSI, [RBP+matrixA] ; matrixA
  3571. MOV RDI, [RBP+matrixB] ; matrixB
  3572. LEA RDI, [RDI+RBX*8]
  3573. XORPD XMM2, XMM2
  3574. XORPD XMM3, XMM3
  3575. XORPD XMM4, XMM4
  3576. XORPD XMM5, XMM5
  3577. XORPD XMM6, XMM6
  3578. XORPD XMM7, XMM7
  3579. MOV RAX, 0 ;
  3580. MOV AL, [RBP+add] ;
  3581. CMP AL, 0 ; add?
  3582. JE DotProductLoop ;
  3583. MOV RAX, [RBP+matrixC] ; matrixC
  3584. LEA RAX, [RAX+RBX*8] ; adjust POINTER horizontally TO correct batch OF 12
  3585. MOVUPD XMM2, [RAX]
  3586. MOVUPD XMM3, [RAX+16]
  3587. MOVUPD XMM4, [RAX+32]
  3588. MOVUPD XMM5, [RAX+48]
  3589. MOVUPD XMM6, [RAX+64]
  3590. MOVUPD XMM7, [RAX+80]
  3591. MOV RAX, 0
  3592. DotProductLoop:
  3593. ; MOV RDX, [RSI+RAX*8]
  3594. ; SHL RDX, 1
  3595. ; CMP RDX, 0
  3596. ; JE SparseEntryEscape
  3597. MOVSD XMM0, [RSI+RAX*8]
  3598. SHUFPD XMM0, XMM0, 0H
  3599. MOVUPD XMM1, [RDI]
  3600. MULPD XMM1, XMM0
  3601. ADDPD XMM2, XMM1
  3602. MOVUPD XMM1, [RDI+16]
  3603. MULPD XMM1, XMM0
  3604. ADDPD XMM3, XMM1
  3605. MOVUPD XMM1, [RDI+32]
  3606. MULPD XMM1, XMM0
  3607. ADDPD XMM4, XMM1
  3608. MOVUPD XMM1, [RDI+48]
  3609. MULPD XMM1, XMM0
  3610. ADDPD XMM5, XMM1
  3611. MOVUPD XMM1, [RDI+64]
  3612. MULPD XMM1, XMM0
  3613. ADDPD XMM6, XMM1
  3614. MOVUPD XMM1, [RDI+80]
  3615. MULPD XMM1, XMM0
  3616. ADDPD XMM7, XMM1
  3617. SparseEntryEscape:
  3618. ADD RDI, [RBP+StrideB] ; StrideB
  3619. INC RAX
  3620. CMP RAX, [RBP+Ca] ; Ca, could also compare TO Rb since they must be equal
3621. JL DotProductLoop ; end DotProductLoop
  3622. MOV RAX , [RBP+matrixC] ; matrixC
3623. LEA RAX, [RAX+RBX*8] ; adjust POINTER horizontally TO correct batch OF 12
  3624. MOVUPD [RAX], XMM2
  3625. MOVUPD [RAX+16], XMM3
  3626. MOVUPD [RAX+32], XMM4
  3627. MOVUPD [RAX+48], XMM5
  3628. MOVUPD [RAX+64], XMM6
  3629. MOVUPD [RAX+80], XMM7
  3630. ADD RBX, 12 ; move over TO next batch OF 12
  3631. MOV RDX, RBX
  3632. ADD RDX, 12
  3633. CMP RDX, [RBP+Cb] ; Cb, check TO see IF row IS complete
  3634. JLE DotProductSetup ; end RowOfResultsLoop
  3635. MOV RAX , [RBP+matrixA] ; matrixA
  3636. ADD RAX, [RBP+StrideA] ; StrideA
  3637. MOV [RBP+matrixA], RAX ; matrixA
  3638. MOV RAX, [RBP+matrixC] ; matrixC
  3639. ADD RAX, [RBP+StrideC] ; StrideC
  3640. MOV [RBP+matrixC], RAX ; matrixC
  3641. INC RCX
  3642. CMP RCX, [RBP+Ra] ; Ra
  3643. JL RowOfResultsLoop
  3644. Done:
  3645. MOV RAX, [RBP+CbFirst] ; CbFirst
  3646. MOV [RAX], RBX ;
  3647. END SSEMul12BlockX;
  3648. PROCEDURE SSEMul16BlockR( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3649. add: BOOLEAN );
  3650. CODE {SYSTEM.AMD64, SYSTEM.SSE}
  3651. MOV RCX, 0 ; counter FOR rows IN A-Ra
  3652. DotProductSetup:
  3653. MOV RSI, [RBP+matrixA] ; matrixA
  3654. MOV RDI, [RBP+matrixB] ; matrixB
  3655. MOV RDX, [RBP+CbFrom] ; CbFrom
  3656. LEA RDI, [RDI+RDX*4]
  3657. XORPS XMM2, XMM2
  3658. XORPS XMM3, XMM3
  3659. XORPS XMM4, XMM4
  3660. XORPS XMM5, XMM5
  3661. MOV RAX, 0 ;
  3662. MOV AL, [RBP+add] ;
  3663. CMP AL, 0 ; add?
  3664. JE DotProductLoop ;
  3665. MOV RAX, [RBP+matrixC] ; matrixC
  3666. LEA RAX, [RAX+RDX*4] ; adjust POINTER horizontally
  3667. MOVUPS XMM2, [RAX]
  3668. MOVUPS XMM3, [RAX+16]
  3669. MOVUPS XMM4, [RAX+32]
  3670. MOVUPS XMM5, [RAX+48]
  3671. MOV RAX, 0
  3672. DotProductLoop:
  3673. MOV RDX, [RSI+RAX*4]
  3674. SHL RDX, 1
  3675. CMP RDX, 0
  3676. JE SparseEntryEscape
  3677. MOVSS XMM0, [RSI+RAX*4]
  3678. SHUFPS XMM0, XMM0, 0H
  3679. MOVUPS XMM1, [RDI]
  3680. MULPS XMM1, XMM0
  3681. ADDPS XMM2, XMM1
  3682. MOVUPS XMM1, [RDI+16]
  3683. MULPS XMM1, XMM0
  3684. ADDPS XMM3, XMM1
  3685. MOVUPS XMM1, [RDI+32]
  3686. MULPS XMM1, XMM0
  3687. ADDPS XMM4, XMM1
  3688. MOVUPS XMM1, [RDI+48]
  3689. MULPS XMM1, XMM0
  3690. ADDPS XMM5, XMM1
  3691. SparseEntryEscape:
  3692. ADD RDI, [RBP+StrideB] ; StrideB
  3693. INC RAX
  3694. CMP RAX, [RBP+Ca] ; Ca
  3695. JL DotProductLoop ; end DotProductLoop
  3696. MOV RAX , [RBP+matrixC] ; matrixC
3697. MOV RDX, [RBP+CbFrom] ; CbFrom
3698. LEA RAX, [RAX+RDX*4] ; adjust POINTER horizontally TO correct batch OF 16
  3699. MOVUPS [RAX], XMM2
  3700. MOVUPS [RAX+16], XMM3
  3701. MOVUPS [RAX+32], XMM4
  3702. MOVUPS [RAX+48], XMM5
  3703. MOV RAX, [RBP+matrixA] ; matrixA
  3704. ADD RAX, [RBP+StrideA] ; StrideA
  3705. MOV [RBP+matrixA], RAX ; matrixA
  3706. MOV RAX, [RBP+matrixC] ; matrixC
  3707. ADD RAX, [RBP+StrideC] ; StrideC
  3708. MOV [RBP+matrixC], RAX ; matrixC
  3709. INC RCX
  3710. CMP RCX, [RBP+Ra] ; Ra
  3711. JL DotProductSetup ;
  3712. END SSEMul16BlockR;
  3713. PROCEDURE SSEMul8BlockX( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3714. add: BOOLEAN );
  3715. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  3716. MOV RCX, 0 ; counter FOR rows IN A-Ra
  3717. DotProductSetup:
  3718. MOV RSI, [RBP+matrixA] ; matrixA
  3719. MOV RDI, [RBP+matrixB] ; matrixB
  3720. MOV RDX, [RBP+CbFrom] ; CbFrom
  3721. LEA RDI, [RDI+RDX*8]
  3722. XORPD XMM2, XMM2
  3723. XORPD XMM3, XMM3
  3724. XORPD XMM4, XMM4
  3725. XORPD XMM5, XMM5
  3726. MOV RAX, 0 ;
  3727. MOV AL, [RBP+add] ;
  3728. CMP AL, 0 ; add?
  3729. JE DotProductLoop ;
  3730. MOV RAX, [RBP+matrixC] ; matrixC
3731. LEA RAX, [RAX+RDX*8] ; adjust POINTER horizontally TO correct batch OF 8
  3732. MOVUPD XMM2, [RAX]
  3733. MOVUPD XMM3, [RAX+16]
  3734. MOVUPD XMM4, [RAX+32]
  3735. MOVUPD XMM5, [RAX+48]
  3736. MOV RAX, 0
  3737. DotProductLoop:
  3738. ; MOV RDX, [RSI+RAX*8]
  3739. ; SHL RDX, 1
  3740. ; CMP RDX, 0
  3741. ; JE SparseEntryEscape
  3742. MOVSD XMM0, [RSI+RAX*8]
  3743. SHUFPD XMM0, XMM0, 0H
  3744. MOVUPD XMM1, [RDI]
  3745. MULPD XMM1, XMM0
  3746. ADDPD XMM2, XMM1
  3747. MOVUPD XMM1, [RDI+16]
  3748. MULPD XMM1, XMM0
  3749. ADDPD XMM3, XMM1
  3750. MOVUPD XMM1, [RDI+32]
  3751. MULPD XMM1, XMM0
  3752. ADDPD XMM4, XMM1
  3753. MOVUPD XMM1, [RDI+48]
  3754. MULPD XMM1, XMM0
  3755. ADDPD XMM5, XMM1
  3756. SparseEntryEscape:
  3757. ADD RDI, [RBP+StrideB] ; StrideB
  3758. INC RAX
  3759. CMP RAX, [RBP+Ca] ; Ca
  3760. JL DotProductLoop ; end DotProductLoop
  3761. MOV RAX , [RBP+matrixC] ; matrixC
3762. MOV RDX, [RBP+CbFrom] ; CbFrom
3763. LEA RAX, [RAX+RDX*8] ; adjust POINTER horizontally TO correct batch OF 8
  3764. MOVUPD [RAX], XMM2
  3765. MOVUPD [RAX+16], XMM3
  3766. MOVUPD [RAX+32], XMM4
  3767. MOVUPD [RAX+48], XMM5
  3768. MOV RAX, [RBP+matrixA] ; matrixA
  3769. ADD RAX, [RBP+StrideA] ; StrideA
  3770. MOV [RBP+matrixA], RAX ; matrixA
  3771. MOV RAX, [RBP+matrixC] ; matrixC
  3772. ADD RAX, [RBP+StrideC] ; StrideC
  3773. MOV [RBP+matrixC], RAX ; matrixC
  3774. INC RCX
  3775. CMP RCX, [RBP+Ra] ; Ra
  3776. JL DotProductSetup ;
  3777. END SSEMul8BlockX;
  3778. PROCEDURE SSEMul8BlockR( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3779. add: BOOLEAN );
  3780. CODE {SYSTEM.AMD64, SYSTEM.SSE}
  3781. MOV RCX, 0 ; counter FOR rows IN A-Ra
  3782. DotProductSetup:
  3783. MOV RSI, [RBP+matrixA] ; matrixA
  3784. MOV RDI, [RBP+matrixB] ; matrixB
  3785. MOV RDX, [RBP+CbFrom] ; CbFrom
  3786. LEA RDI, [RDI+RDX*4]
  3787. XORPS XMM2, XMM2
  3788. XORPS XMM3, XMM3
  3789. MOV RAX, 0 ;
  3790. MOV AL, [RBP+add] ;
  3791. CMP AL, 0 ; add?
  3792. JE DotProductLoop ;
  3793. MOV RAX, [RBP+matrixC] ; matrixC
  3794. LEA RAX, [RAX+RDX*4] ; adjust POINTER horizontally TO correct batch OF 8
  3795. MOVUPS XMM2, [RAX]
  3796. MOVUPS XMM3, [RAX+16]
  3797. MOV RAX, 0
  3798. DotProductLoop:
  3799. MOV RDX, [RSI+RAX*4]
  3800. SHL RDX, 1
  3801. CMP RDX, 0
  3802. JE SparseEntryEscape
  3803. MOVSS XMM0, [RSI+RAX*4]
  3804. SHUFPS XMM0, XMM0, 0H
  3805. MOVUPS XMM1, [RDI]
  3806. MULPS XMM1, XMM0
  3807. ADDPS XMM2, XMM1
  3808. MOVUPS XMM1, [RDI+16]
  3809. MULPS XMM1, XMM0
  3810. ADDPS XMM3, XMM1
  3811. SparseEntryEscape:
  3812. ADD RDI, [RBP+StrideB] ; StrideB
  3813. INC RAX
  3814. CMP RAX, [RBP+Ca] ; Ca
  3815. JL DotProductLoop ; end DotProductLoop
  3816. MOV RAX , [RBP+matrixC] ; matrixC
  3817. MOV RDX, [RBP+CbFrom] ; CbFrom
  3818. LEA RAX, [RAX+RDX*4] ; adjust POINTER horizontally TO correct batch OF 8
  3819. MOVUPS [RAX], XMM2
  3820. MOVUPS [RAX+16], XMM3
  3821. MOV RAX, [RBP+matrixA] ; matrixA
  3822. ADD RAX, [RBP+StrideA] ; StrideA
  3823. MOV [RBP+matrixA], RAX ; matrixA
  3824. MOV RAX, [RBP+matrixC] ; matrixC
  3825. ADD RAX, [RBP+StrideC] ; StrideC
  3826. MOV [RBP+matrixC], RAX ; matrixC
  3827. INC RCX
  3828. CMP RCX, [RBP+Ra] ; Ra
  3829. JL DotProductSetup ;
  3830. END SSEMul8BlockR;
  3831. PROCEDURE SSEMul4BlockX( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3832. add: BOOLEAN );
  3833. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  3834. MOV RCX, 0 ; counter FOR rows IN A-Ra
  3835. DotProductSetup:
  3836. MOV RAX, 0 ; cols IN A
  3837. MOV RSI, [RBP+matrixA] ; matrixA
  3838. MOV RDI, [RBP+matrixB] ; matrixB
  3839. MOV RDX, [RBP+CbFrom] ; CbFrom
  3840. LEA RDI, [RDI+RDX*8]
  3841. XORPS XMM2, XMM2
  3842. XORPS XMM3, XMM3
  3843. MOV RAX, 0 ;
  3844. MOV AL, [RBP+add] ;
  3845. CMP AL, 0 ; add?
  3846. JE DotProductLoop ;
  3847. MOV RAX, [RBP+matrixC] ; matrixC
3848. LEA RAX, [RAX+RDX*8] ; adjust POINTER horizontally TO correct batch OF 4
  3849. MOVUPD XMM2, [RAX]
  3850. MOVUPD XMM3, [RAX+16]
  3851. MOV RAX, 0
  3852. DotProductLoop:
  3853. ; MOV RDX, [RSI+RAX*8]
  3854. ; SHL RDX, 1
  3855. ; CMP RDX, 0
  3856. ; JE SparseEntryEscape
  3857. MOVSD XMM0, [RSI+RAX*8]
  3858. SHUFPD XMM0, XMM0, 0H
  3859. MOVUPD XMM1, [RDI]
  3860. MULPD XMM1, XMM0
  3861. ADDPD XMM2, XMM1
  3862. MOVUPD XMM1, [RDI+16]
  3863. MULPD XMM1, XMM0
  3864. ADDPD XMM3, XMM1
  3865. SparseEntryEscape:
  3866. ADD RDI, [RBP+StrideB] ; StrideB
  3867. INC RAX
  3868. CMP RAX, [RBP+Ca] ; Ca
  3869. JL DotProductLoop ; end DotProductLoop
  3870. MOV RAX , [RBP+matrixC] ; matrixC
  3871. MOV RDX, [RBP+CbFrom] ; CbFrom
3872. LEA RAX, [RAX+RDX*8] ; adjust POINTER horizontally TO correct batch OF 4
  3873. MOVUPD [RAX], XMM2
  3874. MOVUPD [RAX+16], XMM3
  3875. MOV RAX, [RBP+matrixA] ; matrixA
  3876. ADD RAX, [RBP+StrideA] ; StrideA
  3877. MOV [RBP+matrixA], RAX ; matrixA
  3878. MOV RAX, [RBP+matrixC] ; matrixC
  3879. ADD RAX, [RBP+StrideC] ; StrideC
  3880. MOV [RBP+matrixC], RAX ; matrixC
  3881. INC RCX
  3882. CMP RCX, [RBP+Ra] ; Ra
  3883. JL DotProductSetup ;
  3884. END SSEMul4BlockX;
  3885. PROCEDURE SSEMul4BlockR( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3886. add: BOOLEAN );
  3887. CODE {SYSTEM.AMD64, SYSTEM.SSE}
  3888. MOV RCX, 0 ; counter FOR rows IN A-Ra
  3889. DotProductSetup:
  3890. MOV RAX, 0 ; cols IN A
  3891. MOV RSI, [RBP+matrixA] ; matrixA
  3892. MOV RDI, [RBP+matrixB] ; matrixB
  3893. MOV RDX, [RBP+CbFrom] ; CbFrom
  3894. LEA RDI, [RDI+RDX*4]
  3895. XORPS XMM2, XMM2
  3896. MOV RAX, 0 ;
  3897. MOV AL, [RBP+add] ;
  3898. CMP AL, 0 ; add?
  3899. JE DotProductLoop ;
  3900. MOV RAX, [RBP+matrixC] ; matrixC
  3901. LEA RAX, [RAX+RDX*4] ; adjust POINTER horizontally TO correct batch OF 4
  3902. MOVUPS XMM2, [RAX]
  3903. MOV RAX, 0
  3904. DotProductLoop:
  3905. MOV RDX, [RSI+RAX*4]
  3906. SHL RDX, 1
  3907. CMP RDX, 0
  3908. JE SparseEntryEscape
  3909. MOVSS XMM0, [RSI+RAX*4]
  3910. SHUFPS XMM0, XMM0, 0H
  3911. MOVUPS XMM1, [RDI]
  3912. MULPS XMM1, XMM0
  3913. ADDPS XMM2, XMM1
  3914. SparseEntryEscape:
  3915. ADD RDI, [RBP+StrideB] ; StrideB
  3916. INC RAX
  3917. CMP RAX, [RBP+Ca] ; Ca
3918. JL DotProductLoop ; end DotProductLoop
  3919. MOV RAX, [RBP+matrixC] ; matrixC
  3920. MOV RDX, [RBP+CbFrom] ; CbFrom
  3921. LEA RAX, [RAX+RDX*4] ; adjust POINTER horizontally TO correct batch OF 4
  3922. MOVUPS [RAX], XMM2
  3923. MOV RAX, [RBP+matrixA] ; matrixA
  3924. ADD RAX, [RBP+StrideA] ; StrideA
  3925. MOV [RBP+matrixA], RAX ; matrixA
  3926. MOV RAX, [RBP+matrixC] ; matrixC
  3927. ADD RAX, [RBP+StrideC] ; StrideC
  3928. MOV [RBP+matrixC], RAX ; matrixC
  3929. INC RCX
  3930. CMP RCX, [RBP+Ra] ; Ra
  3931. JL DotProductSetup ;
  3932. END SSEMul4BlockR;
  3933. PROCEDURE SSEMul2BlockX( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3934. add: BOOLEAN );
  3935. CODE {SYSTEM.AMD64, SYSTEM.SSE2}
  3936. MOV RCX, 0 ; counter FOR rows IN A-Ra
  3937. DotProductSetup:
  3938. MOV RAX, 0 ; cols IN A
  3939. MOV RSI, [RBP+matrixA] ; matrixA
  3940. MOV RDI, [RBP+matrixB] ; matrixB
  3941. MOV RDX, [RBP+CbFrom] ; CbFrom
  3942. LEA RDI, [RDI+RDX*8]
  3943. XORPD XMM2, XMM2
  3944. MOV RAX, 0 ;
  3945. MOV AL, [RBP+add] ;
  3946. CMP AL, 0 ; add?
  3947. JE DotProductLoop ;
  3948. MOV RAX, [RBP+matrixC] ; matrixC
3949. LEA RAX, [RAX+RDX*8] ; adjust POINTER horizontally TO correct batch OF 2
  3950. MOVUPD XMM2, [RAX]
  3951. MOV RAX, 0
  3952. DotProductLoop:
  3953. ; MOV RDX, [RSI+RAX*4] ;
  3954. ; SHL RDX, 1 ;
  3955. ; CMP RDX, 0
  3956. ; JE SparseEntryEscape
  3957. MOVSD XMM0, [RSI+RAX*8]
  3958. SHUFPD XMM0, XMM0, 0H
  3959. MOVUPD XMM1, [RDI]
  3960. MULPD XMM1, XMM0
  3961. ADDPD XMM2, XMM1
  3962. SparseEntryEscape:
  3963. ADD RDI, [RBP+StrideB] ; StrideB
  3964. INC RAX
  3965. CMP RAX, [RBP+Ca] ; Ca
  3966. JL DotProductLoop ; end DotProductLoop
  3967. MOV RAX , [RBP+matrixC] ; matrixC
  3968. MOV RDX, [RBP+CbFrom] ; CbFrom
3969. LEA RAX, [RAX+RDX*8] ; adjust POINTER horizontally TO correct batch OF 2
  3970. MOVUPD [RAX], XMM2
  3971. MOV RAX, [RBP+matrixA] ; matrixA
  3972. ADD RAX, [RBP+StrideA] ; StrideA
  3973. MOV [RBP+matrixA], RAX ; matrixA
  3974. MOV RAX, [RBP+matrixC] ; matrixC
  3975. ADD RAX, [RBP+StrideC] ; StrideC
  3976. MOV [RBP+matrixC], RAX ; matrixC
  3977. INC RCX
  3978. CMP RCX, [RBP+Ra] ; Ra
  3979. JL DotProductSetup ;
  3980. END SSEMul2BlockX;
  3981. (****** blocking matrix multiplication with copy of data ******)
  3982. PROCEDURE MagicBlockR( M, N, K: SIZE;
  3983. VAR L2BlockM, L2BlockN, L2BlockK: SIZE );
  3984. BEGIN
  3985. K := (K DIV L0BlockKR) * L0BlockKR;
  3986. N := (N DIV L1BlockN) * L1BlockN;
  3987. IF M = 0 THEN M := 1 END;
  3988. IF N = 0 THEN N := 1 END;
  3989. IF K = 0 THEN K := 1 END;
  3990. L2BlockK :=
  3991. K DIV ((K + L1MaxBlockKR - 1) DIV L1MaxBlockKR);
  3992. (* Round up to next multiple of 16 *)
  3993. L2BlockK := L2BlockK + (-L2BlockK) MOD 16;
  3994. L2BlockN :=
  3995. L2BlockSize DIV SIZEOF( REAL ) DIV
  3996. (L2BlockK * (L2BARatio + 1));
  3997. IF L2BlockN > N THEN L2BlockN := N
  3998. ELSIF L2BlockN < 1 THEN L2BlockN := 1;
  3999. END;
  4000. L2BlockM :=
  4001. (L2BlockSize DIV SIZEOF( REAL ) - L2BlockN * L2BlockK) DIV
  4002. L2BlockK;
4003. (* Clamp L2BlockM to the range 1..M *)
  4004. IF L2BlockM > M THEN L2BlockM := M
  4005. ELSIF L2BlockM < 1 THEN L2BlockM := 1
  4006. END;
4007. L2BlockN := L2BlockN + (-L2BlockN) MOD L1BlockN; (* round up to next multiple of L1BlockN *)
  4008. END MagicBlockR;
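(* Derivation sketch for the block sizes above (Cap := L2BlockSize DIV SIZEOF(REAL)):
	L2BlockK divides K into about (K + L1MaxBlockKR - 1) DIV L1MaxBlockKR equal chunks and is
	then rounded up to a multiple of 16; L2BlockN := Cap DIV (L2BlockK*(L2BARatio + 1)), so the
	B block of L2BlockN*L2BlockK elements takes 1/(L2BARatio + 1) of the cache capacity;
	L2BlockM then fills the remaining capacity with rows of A; finally L2BlockN is rounded up
	to a multiple of L1BlockN so that the L1 kernel always works on full groups of columns. *)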
  4009. PROCEDURE MagicBlockX( M, N, K: SIZE;
  4010. VAR L2BlockM, L2BlockN, L2BlockK:SIZE );
  4011. BEGIN
  4012. K := (K DIV L0BlockKX) * L0BlockKX;
  4013. N := (N DIV L1BlockN) * L1BlockN;
  4014. IF M = 0 THEN M := 1 END;
  4015. IF N = 0 THEN N := 1 END;
  4016. IF K = 0 THEN K := 1 END;
  4017. L2BlockK :=
  4018. K DIV ((K + L1MaxBlockKX - 1) DIV L1MaxBlockKX);
  4019. (* Round up to next multiple of 16 *)
  4020. L2BlockK := L2BlockK + (-L2BlockK) MOD 16;
  4021. L2BlockN :=
  4022. L2BlockSize DIV SIZEOF( LONGREAL ) DIV
  4023. (L2BlockK * (L2BARatio + 1));
  4024. IF L2BlockN > N THEN L2BlockN := N END;
  4025. L2BlockM :=
  4026. (L2BlockSize DIV SIZEOF( LONGREAL ) - L2BlockN * L2BlockK) DIV
  4027. L2BlockK;
4028. (* Clamp L2BlockM to the range 1..M *)
  4029. IF L2BlockM > M THEN L2BlockM := M
  4030. ELSIF L2BlockM < 1 THEN L2BlockM := 1
  4031. END;
4032. L2BlockN := L2BlockN + (-L2BlockN) MOD L1BlockN; (* round up to next multiple of L1BlockN *)
  4033. END MagicBlockX;
  4034. (*
  4035. PROCEDURE L1Block1X( adrA, adrB, adrC, K: LONGINT ); (* condition: K MOD 4 = 0 *)
  4036. VAR reg: ARRAY 8 OF ARRAY 2 OF LONGREAL;
  4037. PROCEDURE null( i: LONGINT );
  4038. BEGIN
  4039. reg[i, 0] := 0; reg[i, 1] := 0;
  4040. END null;
  4041. PROCEDURE get1( adr, i: LONGINT );
  4042. BEGIN
  4043. SYSTEM.GET( adr, reg[i, 0] );
  4044. IF debug THEN KernelLog.String( "get 1: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 ); KernelLog.Ln; END;
  4045. END get1;
  4046. PROCEDURE get2( adr, i: LONGINT );
  4047. BEGIN
  4048. SYSTEM.GET( adr, reg[i, 0] ); SYSTEM.GET( adr + 8, reg[i, 1] );
  4049. IF debug THEN
  4050. KernelLog.String( "get 2: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 );
  4051. KernelLog.Int( ENTIER( reg[i, 1] + 0.5 ), 5 ); KernelLog.Ln;
  4052. END;
  4053. END get2;
  4054. PROCEDURE mul2( i, j: LONGINT );
  4055. BEGIN
  4056. reg[i, 0] := reg[i, 0] * reg[j, 0]; reg[i, 1] := reg[i, 1] * reg[j, 1];
  4057. END mul2;
  4058. PROCEDURE add2( i, j: LONGINT );
  4059. BEGIN
  4060. reg[i, 0] := reg[i, 0] + reg[j, 0]; reg[i, 1] := reg[i, 1] + reg[j, 1];
  4061. END add2;
  4062. PROCEDURE put1( adr, i: LONGINT );
  4063. BEGIN
  4064. SYSTEM.PUT( adr, reg[i, 0] );
  4065. END put1;
  4066. PROCEDURE horadd( i: LONGINT );
  4067. BEGIN
  4068. reg[i, 0] := reg[i, 0] + reg[i, 1];
  4069. END horadd;
  4070. BEGIN
  4071. IF debug THEN KernelLog.String( "L1Block1" ); KernelLog.Ln; END;
  4072. null( 2 ); get1( adrC, 2 );
  4073. WHILE (K > 0) DO (* padding guaranteed *)
  4074. get2( adrA, 7 ); get2( adrB, 0 ); mul2( 0, 7 ); add2( 2, 0 ); INC( adrB, 16 );
  4075. INC( adrA, 16 ); DEC( K, 2 );
  4076. END;
  4077. horadd( 2 ); put1( adrC, 2 );
  4078. END L1Block1X;
  4079. PROCEDURE L1Block5X( adrA, adrB, adrC, IncC, K: LONGINT ); (* condition: K MOD 4 = 0 *)
  4080. VAR reg: ARRAY 8 OF ARRAY 2 OF LONGREAL;
  4081. PROCEDURE null( i: LONGINT );
  4082. BEGIN
  4083. reg[i, 0] := 0; reg[i, 1] := 0;
  4084. END null;
  4085. PROCEDURE get1( adr, i: LONGINT );
  4086. BEGIN
  4087. SYSTEM.GET( adr, reg[i, 0] );
  4088. IF debug THEN KernelLog.String( "get 1: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 ); KernelLog.Ln; END;
  4089. END get1;
  4090. PROCEDURE get2( adr, i: LONGINT );
  4091. BEGIN
  4092. SYSTEM.GET( adr, reg[i, 0] ); SYSTEM.GET( adr + 8, reg[i, 1] );
  4093. IF debug THEN
  4094. KernelLog.String( "get 2: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 );
  4095. KernelLog.Int( ENTIER( reg[i, 1] + 0.5 ), 5 ); KernelLog.Ln;
  4096. END;
  4097. END get2;
  4098. PROCEDURE mul2( i, j: LONGINT );
  4099. BEGIN
  4100. reg[i, 0] := reg[i, 0] * reg[j, 0]; reg[i, 1] := reg[i, 1] * reg[j, 1];
  4101. END mul2;
  4102. PROCEDURE add2( i, j: LONGINT );
  4103. BEGIN
  4104. reg[i, 0] := reg[i, 0] + reg[j, 0]; reg[i, 1] := reg[i, 1] + reg[j, 1];
  4105. END add2;
  4106. PROCEDURE put1( adr, i: LONGINT );
  4107. BEGIN
  4108. SYSTEM.PUT( adr, reg[i, 0] );
  4109. END put1;
  4110. PROCEDURE horadd( i: LONGINT );
  4111. BEGIN
  4112. reg[i, 0] := reg[i, 0] + reg[i, 1];
  4113. END horadd;
  4114. BEGIN
  4115. IF debug THEN KernelLog.String( "L1Block5" ); KernelLog.Ln; END;
  4116. null( 2 ); null( 3 ); null( 4 ); null( 5 ); null( 6 ); get1( adrC, 2 );
  4117. get1( adrC + IncC, 3 ); get1( adrC + 2 * IncC, 4 ); get1( adrC + 3 * IncC, 5 );
  4118. get1( adrC + 4 * IncC, 6 );
  4119. WHILE (K > 0) DO (* padding guaranteed *)
  4120. get2( adrA, 7 ); get2( adrB, 0 ); mul2( 0, 7 ); add2( 2, 0 );
  4121. get2( adrB + 16, 0 ); mul2( 0, 7 ); add2( 3, 0 ); get2( adrB + 32, 0 );
  4122. mul2( 0, 7 ); add2( 4, 0 ); get2( adrB + 48, 0 ); mul2( 0, 7 );
  4123. add2( 5, 0 ); get2( adrB + 64, 0 ); mul2( 0, 7 ); add2( 6, 0 ); INC( adrB, 80 );
  4124. INC( adrA, 16 ); DEC( K, 2 );
  4125. END;
  4126. horadd( 2 ); horadd( 3 ); horadd( 4 ); horadd( 5 ); horadd( 6 );
  4127. put1( adrC, 2 ); put1( adrC + IncC, 3 ); put1( adrC + 2 * IncC, 4 );
  4128. put1( adrC + 3 * IncC, 5 ); put1( adrC + 4 * IncC, 6 );
  4129. END L1Block5X;
  4130. PROCEDURE L1Block1R( adrA, adrB, adrC, K: LONGINT ); (* condition: K MOD 4 = 0 *)
  4131. VAR reg: ARRAY 8 OF ARRAY 4 OF REAL;
  4132. PROCEDURE null( i: LONGINT );
  4133. BEGIN
  4134. reg[i, 0] := 0; reg[i, 1] := 0; reg[i, 2] := 0; reg[i, 3] := 0;
  4135. END null;
  4136. PROCEDURE get1( adr, i: LONGINT );
  4137. BEGIN
  4138. SYSTEM.GET( adr, reg[i, 0] );
  4139. IF debug THEN KernelLog.String( "get 1: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 ); KernelLog.Ln; END;
  4140. END get1;
  4141. PROCEDURE get4( adr, i: LONGINT );
  4142. BEGIN
  4143. SYSTEM.GET( adr, reg[i, 0] ); SYSTEM.GET( adr + 4, reg[i, 1] );
  4144. SYSTEM.GET( adr + 8, reg[i, 2] ); SYSTEM.GET( adr + 12, reg[i, 3] );
  4145. IF debug THEN
  4146. KernelLog.String( "get 4: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 );
  4147. KernelLog.Int( ENTIER( reg[i, 1] + 0.5 ), 5 ); KernelLog.Int( ENTIER( reg[i, 2] + 0.5 ), 5 );
  4148. KernelLog.Int( ENTIER( reg[i, 3] + 0.5 ), 5 ); KernelLog.Ln;
  4149. END;
  4150. END get4;
  4151. PROCEDURE mul4( i, j: LONGINT );
  4152. BEGIN
  4153. reg[i, 0] := reg[i, 0] * reg[j, 0]; reg[i, 1] := reg[i, 1] * reg[j, 1];
  4154. reg[i, 2] := reg[i, 2] * reg[j, 2]; reg[i, 3] := reg[i, 3] * reg[j, 3];
  4155. END mul4;
  4156. PROCEDURE add4( i, j: LONGINT );
  4157. BEGIN
  4158. reg[i, 0] := reg[i, 0] + reg[j, 0]; reg[i, 1] := reg[i, 1] + reg[j, 1];
  4159. reg[i, 2] := reg[i, 2] + reg[j, 2]; reg[i, 3] := reg[i, 3] + reg[j, 3];
  4160. END add4;
  4161. PROCEDURE put1( adr, i: LONGINT );
  4162. BEGIN
  4163. SYSTEM.PUT( adr, reg[i, 0] );
  4164. END put1;
  4165. PROCEDURE horadd( i: LONGINT );
  4166. BEGIN
  4167. reg[i, 0] := reg[i, 0] + reg[i, 1] + reg[i, 2] + reg[i, 3];
  4168. END horadd;
  4169. BEGIN
  4170. IF debug THEN KernelLog.String( "L1Block1" ); KernelLog.Ln; END;
  4171. null( 2 ); get1( adrC, 2 );
  4172. WHILE (K > 0) DO (* padding guaranteed *)
  4173. get4( adrA, 7 ); get4( adrB, 0 ); mul4( 0, 7 ); add4( 2, 0 ); INC( adrB, 16 );
  4174. INC( adrA, 16 ); DEC( K, 4 );
  4175. END;
  4176. horadd( 2 ); put1( adrC, 2 );
  4177. END L1Block1R;
  4178. PROCEDURE L1Block5R( adrA, adrB, adrC, IncC, K: LONGINT ); (* condition: K MOD 4 = 0 *)
  4179. VAR reg: ARRAY 8 OF ARRAY 4 OF REAL;
  4180. PROCEDURE null( i: LONGINT );
  4181. BEGIN
  4182. reg[i, 0] := 0; reg[i, 1] := 0; reg[i, 2] := 0; reg[i, 3] := 0;
  4183. END null;
  4184. PROCEDURE get1( adr, i: LONGINT );
  4185. BEGIN
  4186. SYSTEM.GET( adr, reg[i, 0] );
  4187. IF debug THEN KernelLog.String( "get 1: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 ); KernelLog.Ln; END;
  4188. END get1;
  4189. PROCEDURE get4( adr, i: LONGINT );
  4190. BEGIN
  4191. SYSTEM.GET( adr, reg[i, 0] ); SYSTEM.GET( adr + 4, reg[i, 1] );
  4192. SYSTEM.GET( adr + 8, reg[i, 2] ); SYSTEM.GET( adr + 12, reg[i, 3] );
  4193. IF debug THEN
  4194. KernelLog.String( "get 4: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 );
  4195. KernelLog.Int( ENTIER( reg[i, 1] + 0.5 ), 5 ); KernelLog.Int( ENTIER( reg[i, 2] + 0.5 ), 5 );
  4196. KernelLog.Int( ENTIER( reg[i, 3] + 0.5 ), 5 ); KernelLog.Ln;
  4197. END;
  4198. END get4;
  4199. PROCEDURE mul4( i, j: LONGINT );
  4200. BEGIN
  4201. reg[i, 0] := reg[i, 0] * reg[j, 0]; reg[i, 1] := reg[i, 1] * reg[j, 1];
  4202. reg[i, 2] := reg[i, 2] * reg[j, 2]; reg[i, 3] := reg[i, 3] * reg[j, 3];
  4203. END mul4;
  4204. PROCEDURE add4( i, j: LONGINT );
  4205. BEGIN
  4206. reg[i, 0] := reg[i, 0] + reg[j, 0]; reg[i, 1] := reg[i, 1] + reg[j, 1];
  4207. reg[i, 2] := reg[i, 2] + reg[j, 2]; reg[i, 3] := reg[i, 3] + reg[j, 3];
  4208. END add4;
  4209. PROCEDURE put1( adr, i: LONGINT );
  4210. BEGIN
  4211. SYSTEM.PUT( adr, reg[i, 0] );
  4212. END put1;
  4213. PROCEDURE horadd( i: LONGINT );
  4214. BEGIN
  4215. reg[i, 0] := reg[i, 0] + reg[i, 1] + reg[i, 2] + reg[i, 3];
  4216. END horadd;
  4217. BEGIN
  4218. IF debug THEN KernelLog.String( "L1Block5" ); KernelLog.Ln; END;
  4219. null( 2 ); null( 3 ); null( 4 ); null( 5 ); null( 6 ); get1( adrC, 2 );
  4220. get1( adrC + IncC, 3 ); get1( adrC + 2 * IncC, 4 ); get1( adrC + 3 * IncC, 5 );
  4221. get1( adrC + 4 * IncC, 6 );
  4222. WHILE (K > 0) DO (* padding guaranteed *)
  4223. get4( adrA, 7 ); get4( adrB, 0 ); mul4( 0, 7 ); add4( 2, 0 );
  4224. get4( adrB + 16, 0 ); mul4( 0, 7 ); add4( 3, 0 ); get4( adrB + 32, 0 );
  4225. mul4( 0, 7 ); add4( 4, 0 ); get4( adrB + 48, 0 ); mul4( 0, 7 );
  4226. add4( 5, 0 ); get4( adrB + 64, 0 ); mul4( 0, 7 ); add4( 6, 0 ); INC( adrB, 80 );
  4227. INC( adrA, 16 ); DEC( K, 4 );
  4228. END;
  4229. horadd( 2 ); horadd( 3 ); horadd( 4 ); horadd( 5 ); horadd( 6 );
  4230. put1( adrC, 2 ); put1( adrC + IncC, 3 ); put1( adrC + 2 * IncC, 4 );
  4231. put1( adrC + 3 * IncC, 5 ); put1( adrC + 4 * IncC, 6 );
  4232. END L1Block5R;
  4233. *)
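(* The commented-out L1Block procedures above spell out the register-blocking idea behind the
	L1 kernels used below (L1Block1RSSE/L1Block5RSSE and their X variants): the single-column
	kernel accumulates c := c + SUM over k of a[k]*b[k], and the 5-column kernel keeps five
	such accumulators live at once,
		C[j] := C[j] + SUM over k of A[k] * B[k, j],   j = 0 .. 4
	so that each element of A loaded from memory is reused five times. *)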
  4234. PROCEDURE DispCR( adrM: ADDRESS;
  4235. inc, stride, M, N: SIZE );
  4236. VAR i, j: SIZE; adr: ADDRESS; val: REAL;
  4237. BEGIN
  4238. FOR i := 0 TO M - 1 DO
  4239. adr := adrM + i * stride;
  4240. FOR j := 0 TO N - 1 DO
  4241. SYSTEM.GET( adr, val );
  4242. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); INC( adr, inc );
  4243. END;
  4244. KernelLog.Ln;
  4245. END;
  4246. END DispCR;
  4247. PROCEDURE DispCX( adrM: ADDRESS;
  4248. inc, stride, M, N: SIZE );
  4249. VAR i, j: SIZE; adr: ADDRESS; val: LONGREAL;
  4250. BEGIN
  4251. FOR i := 0 TO M - 1 DO
  4252. adr := adrM + i * stride;
  4253. FOR j := 0 TO N - 1 DO
  4254. SYSTEM.GET( adr, val );
  4255. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); INC( adr, inc );
  4256. END;
  4257. KernelLog.Ln;
  4258. END;
  4259. END DispCX;
  4260. PROCEDURE L3BlockX( matrixA, matrixB, matrixC: ADDRESS;
  4261. M, N, K, incC, strideC, L2BlockM, L2BlockN, L2BlockK: SIZE );
  4262. (*
  4263. K N
  4264. *** N *****
  4265. M *** ****** -> ***** M
  4266. *** K ****** *****
  4267. *** ****** *****
  4268. A * B -> C
  4269. *)
  4270. VAR m, n, k, a1, b1: SIZE; adrA, adrB, adrC: ADDRESS;
  4271. KAligned: SIZE;
  4272. CONST Size = SIZEOF( LONGREAL );
  4273. PROCEDURE L2Block( matrixA, matrixB, matrixC: ADDRESS; M, N, K: SIZE );
  4274. (* M,N and K arbitrary ! *)
  4275. VAR adrA, adrB, adrC: ADDRESS; aadrA, aadrB: ADDRESS;
  4276. m, k, KAligned: SIZE;
  4277. BEGIN
  4278. KAligned := Align2( K ) * 8;
  4279. IF debug THEN ASSERT( M > 0 ); ASSERT( K > 0 );
  4280. END;
  4281. adrB := matrixB;
  4282. WHILE (N >= L1BlockN) DO
  4283. IF debug THEN KernelLog.String( "LoopL2N" ); KernelLog.Ln END;
  4284. adrC := matrixC; adrA := matrixA; m := M;
  4285. WHILE (m > 0) DO
  4286. IF debug THEN KernelLog.String( "LoopL2M" ); KernelLog.Ln END;
  4287. IF SSE THEN
  4288. L1Block5XSSE( adrA, adrB, adrC, incC, K ); (* L1 Block *)
  4289. ELSE
  4290. aadrA := adrA; aadrB := adrB; k := K;
  4291. WHILE (k > 0) DO
  4292. L1Block1XA( aadrA, aadrB, adrC, 2 );
  4293. L1Block1XA( aadrA, aadrB + 16, adrC + incC, 2 );
  4294. L1Block1XA( aadrA, aadrB + 32, adrC + 2 * incC,
  4295. 2 );
  4296. L1Block1XA( aadrA, aadrB + 48, adrC + 3 * incC,
  4297. 2 );
  4298. L1Block1XA( aadrA, aadrB + 64, adrC + 4 * incC,
  4299. 2 );
  4300. DEC( k, 2 ); INC( aadrA, 16 );
  4301. INC( aadrB, 16 * L1BlockN );
  4302. END;
  4303. END;
  4304. IF debug THEN
  4305. DispCX( matrixC, incC, strideC, M, N );
  4306. END;
  4307. INC( adrA, KAligned ); INC( adrC, strideC );
  4308. DEC( m );
  4309. END;
  4310. INC( matrixC, L1BlockN * incC );
  4311. INC( adrB, L1BlockN * KAligned ); DEC( N, L1BlockN );
  4312. END;
  4313. WHILE (N > 0) DO
  4314. IF debug THEN KernelLog.String( "LoopL2N rest" ); KernelLog.Ln END;
  4315. adrC := matrixC; adrA := matrixA; m := M;
  4316. WHILE (m > 0) DO
  4317. IF debug THEN KernelLog.String( "LoopL2M" ); KernelLog.Ln END;
  4318. IF SSE THEN
  4319. L1Block1XSSE( adrA, adrB, adrC, K ); (* L1 Block *)
  4320. ELSE L1Block1XA( adrA, adrB, adrC, K );
  4321. END;
  4322. IF debug THEN
  4323. DispCX( matrixC, incC, strideC, M, N );
  4324. END;
  4325. INC( adrA, KAligned ); INC( adrC, strideC );
  4326. DEC( m );
  4327. END;
  4328. INC( matrixC, incC ); INC( adrB, KAligned ); DEC( N );
  4329. END;
  4330. END L2Block;
  4331. BEGIN
  4332. KAligned := Align2( K ) * 8;
  4333. ASSERT( L2BlockK MOD 2 = 0 );
  4334. IF SSE THEN ASSERT( L2BlockN MOD L1BlockN = 0 ); END;
  4335. m := M; n := N; k := K; a1 := matrixA; adrA := matrixA;
  4336. b1 := matrixB; adrB := matrixB; adrC := matrixC;
  4337. WHILE (n >= L2BlockN) DO
  4338. IF debug THEN KernelLog.String( "LoopL3N" ); KernelLog.Ln END;
  4339. a1 := matrixA; adrC := matrixC; m := M;
  4340. WHILE (m >= L2BlockM) DO
  4341. IF debug THEN KernelLog.String( "LoopL3M" ); KernelLog.Ln END;
  4342. adrA := a1; adrB := b1; k := K;
  4343. (* core: do matching level 2 cache Blocks *)
  4344. WHILE (k >= L2BlockK) DO
  4345. IF debug THEN KernelLog.String( "LoopL3K" ); KernelLog.Ln END;
  4346. L2Block( adrA, adrB, adrC, L2BlockM, L2BlockN,
  4347. L2BlockK );
  4348. INC( adrA, L2BlockK * L2BlockM * Size );
  4349. INC( adrB, L2BlockK * L2BlockN * Size ); (* no padding required *)
  4350. DEC( k, L2BlockK );
  4351. END;
  4352. (* core: do rest of k *)
  4353. IF k > 0 THEN
  4354. L2Block( adrA, adrB, adrC, L2BlockM, L2BlockN, k );
  4355. END;
  4356. INC( a1, KAligned * L2BlockM );
  4357. INC( adrC, L2BlockM * strideC ); DEC( m, L2BlockM );
  4358. END;
  4359. IF m > 0 THEN
  4360. (* clean up M *)
  4361. adrA := a1; adrB := b1; k := K;
  4362. WHILE (k >= L2BlockK) DO
  4363. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4364. L2Block( adrA, adrB, adrC, m, L2BlockN, L2BlockK );
  4365. INC( adrA, L2BlockK * Size * m );
  4366. INC( adrB, L2BlockK * L2BlockN * Size );
  4367. DEC( k, L2BlockK );
  4368. END;
  4369. IF debug THEN KernelLog.String( "LoopL3K rest k" ); KernelLog.Ln END;
  4370. IF k > 0 THEN
  4371. L2Block( adrA, adrB, adrC, m, L2BlockN, k );
  4372. END;
  4373. END;
  4374. INC( b1, L2BlockN * KAligned );
  4375. INC( matrixC, L2BlockN * incC ); DEC( n, L2BlockN );
  4376. END;
  4377. IF (n = 0) THEN RETURN
  4378. END;
  4379. a1 := matrixA; adrC := matrixC; m := M;
  4380. WHILE (m >= L2BlockM) DO
  4381. IF debug THEN KernelLog.String( "LoopL3M rest" ); KernelLog.Ln END;
  4382. adrA := a1; adrB := b1; k := K;
  4383. WHILE (k >= L2BlockK) DO
  4384. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4385. L2Block( adrA, adrB, adrC, L2BlockM, n, L2BlockK );
  4386. INC( adrA, L2BlockM * L2BlockK * Size );
  4387. INC( adrB, L2BlockK * n * Size ); DEC( k, L2BlockK );
  4388. END;
  4389. IF k > 0 THEN
  4390. L2Block( adrA, adrB, adrC, L2BlockM, n, k );
  4391. END;
  4392. INC( a1, L2BlockM * KAligned );
  4393. INC( adrC, L2BlockM * strideC ); DEC( m, L2BlockM );
  4394. END;
  4395. IF (m = 0) THEN RETURN
  4396. END;
  4397. adrA := a1; adrB := b1; k := K;
  4398. WHILE (k >= L2BlockK) DO
  4399. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4400. L2Block( adrA, adrB, adrC, m, n, L2BlockK );
  4401. INC( adrA, L2BlockK * m * Size );
  4402. INC( adrB, L2BlockK * n * Size ); DEC( k, L2BlockK );
  4403. END;
  4404. IF k > 0 THEN L2Block( adrA, adrB, adrC, m, n, k );
  4405. END;
  4406. END L3BlockX;
  4407. PROCEDURE L3BlockR( matrixA, matrixB, matrixC: ADDRESS;
  4408. M, N, K, incC, strideC, L2BlockM, L2BlockN, L2BlockK: SIZE );
  4409. (*
  4410. K N
  4411. *** N *****
  4412. M *** ****** -> ***** M
  4413. *** K ****** *****
  4414. *** ****** *****
  4415. A * B -> C
  4416. *)
  4417. VAR m, n, k, a1, b1: SIZE; adrA, adrB, adrC: ADDRESS;
  4418. KAligned: SIZE;
  4419. CONST Size = SIZEOF( REAL );
  4420. PROCEDURE L2Block( matrixA, matrixB, matrixC: ADDRESS; M, N, K: SIZE );
  4421. (* M,N and K arbitrary ! *)
  4422. VAR adrA, adrB, adrC: ADDRESS; aadrA, aadrB: ADDRESS;
  4423. m, KAligned, k: SIZE;
  4424. BEGIN
  4425. KAligned := Align4( K ) * 4;
  4426. IF debug THEN ASSERT( M > 0 ); ASSERT( K > 0 );
  4427. END;
  4428. adrB := matrixB;
  4429. WHILE (N >= L1BlockN) DO
  4430. IF debug THEN KernelLog.String( "LoopL2N" ); KernelLog.Ln END;
  4431. adrC := matrixC; adrA := matrixA; m := M;
  4432. WHILE (m > 0) DO
  4433. IF debug THEN KernelLog.String( "LoopL2M" ); KernelLog.Ln END;
  4434. IF SSE THEN
  4435. L1Block5RSSE( adrA, adrB, adrC, incC, K ); (* L1 Block *)
  4436. ELSE
  4437. aadrA := adrA; aadrB := adrB; k := K;
  4438. WHILE (k > 0) DO
  4439. L1Block1RA( aadrA, aadrB, adrC, 4 );
  4440. L1Block1RA( aadrA, aadrB + 16, adrC + incC, 4 );
  4441. L1Block1RA( aadrA, aadrB + 32, adrC + 2 * incC,
  4442. 4 );
  4443. L1Block1RA( aadrA, aadrB + 48, adrC + 3 * incC,
  4444. 4 );
  4445. L1Block1RA( aadrA, aadrB + 64, adrC + 4 * incC,
  4446. 4 );
  4447. DEC( k, 4 ); INC( aadrA, 16 );
  4448. INC( aadrB, 16 * L1BlockN );
  4449. END;
  4450. END;
  4451. IF debug THEN
  4452. DispCR( matrixC, incC, strideC, M, N );
  4453. END;
  4454. INC( adrA, KAligned ); INC( adrC, strideC );
  4455. DEC( m );
  4456. END;
  4457. INC( matrixC, L1BlockN * incC );
  4458. INC( adrB, L1BlockN * KAligned ); DEC( N, L1BlockN );
  4459. END;
  4460. WHILE (N > 0) DO
  4461. IF debug THEN KernelLog.String( "LoopL2N rest" ); KernelLog.Ln END;
  4462. adrC := matrixC; adrA := matrixA; m := M;
  4463. WHILE (m > 0) DO
  4464. IF debug THEN KernelLog.String( "LoopL2M" ); KernelLog.Ln END;
  4465. IF SSE THEN
  4466. L1Block1RSSE( adrA, adrB, adrC, K ); (* L1 Block *)
  4467. ELSE L1Block1RA( adrA, adrB, adrC, K );
  4468. END;
  4469. IF debug THEN
  4470. DispCR( matrixC, incC, strideC, M, N );
  4471. END;
  4472. INC( adrA, KAligned ); INC( adrC, strideC );
  4473. DEC( m );
  4474. END;
  4475. INC( matrixC, incC ); INC( adrB, KAligned ); DEC( N );
  4476. END;
  4477. END L2Block;
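(* In the non-SSE branch of L2Block above, one column strip of width L1BlockN = 5 is
   processed by five L1Block1RA calls per step of four k values: adrB, adrB+16, ...,
   adrB+64 address the five packed columns (4 REALs = 16 bytes apart), while adrC,
   adrC+incC, ..., adrC+4*incC address the five destination elements of the current row
   of C. This mirrors what a single L1Block5RSSE call does when SSE is available. *)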
  4478. BEGIN
  4479. KAligned := Align4( K ) * 4;
  4480. ASSERT( L2BlockK MOD 4 = 0 );
  4481. IF SSE THEN ASSERT( L2BlockN MOD L1BlockN = 0 ); END;
  4482. m := M; n := N; k := K; a1 := matrixA; adrA := matrixA;
  4483. b1 := matrixB; adrB := matrixB; adrC := matrixC;
  4484. WHILE (n >= L2BlockN) DO
  4485. IF debug THEN KernelLog.String( "LoopL3N" ); KernelLog.Ln END;
  4486. a1 := matrixA; adrC := matrixC; m := M;
  4487. WHILE (m >= L2BlockM) DO
  4488. IF debug THEN KernelLog.String( "LoopL3M" ); KernelLog.Ln END;
  4489. adrA := a1; adrB := b1; k := K;
  4490. (* core: do matching level 2 cache Blocks *)
  4491. WHILE (k >= L2BlockK) DO
  4492. IF debug THEN KernelLog.String( "LoopL3K" ); KernelLog.Ln END;
  4493. L2Block( adrA, adrB, adrC, L2BlockM, L2BlockN,
  4494. L2BlockK );
  4495. INC( adrA, L2BlockK * L2BlockM * Size );
  4496. INC( adrB, L2BlockK * L2BlockN * Size ); (* no padding required *)
  4497. DEC( k, L2BlockK );
  4498. END;
  4499. (* core: do rest of k *)
  4500. IF k > 0 THEN
  4501. L2Block( adrA, adrB, adrC, L2BlockM, L2BlockN, k );
  4502. END;
  4503. INC( a1, KAligned * L2BlockM );
  4504. INC( adrC, L2BlockM * strideC ); DEC( m, L2BlockM );
  4505. END;
  4506. IF m > 0 THEN
  4507. (* clean up M *)
  4508. adrA := a1; adrB := b1; k := K;
  4509. WHILE (k >= L2BlockK) DO
  4510. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4511. L2Block( adrA, adrB, adrC, m, L2BlockN, L2BlockK );
  4512. INC( adrA, L2BlockK * Size * m );
  4513. INC( adrB, L2BlockK * L2BlockN * Size );
  4514. DEC( k, L2BlockK );
  4515. END;
  4516. IF debug THEN KernelLog.String( "LoopL3K rest k" ); KernelLog.Ln END;
  4517. IF k > 0 THEN
  4518. L2Block( adrA, adrB, adrC, m, L2BlockN, k );
  4519. END;
  4520. END;
  4521. INC( b1, L2BlockN * KAligned );
  4522. INC( matrixC, L2BlockN * incC ); DEC( n, L2BlockN );
  4523. END;
  4524. IF (n = 0) THEN RETURN
  4525. END;
  4526. a1 := matrixA; adrC := matrixC; m := M;
  4527. WHILE (m >= L2BlockM) DO
  4528. IF debug THEN KernelLog.String( "LoopL3M rest" ); KernelLog.Ln END;
  4529. adrA := a1; adrB := b1; k := K;
  4530. WHILE (k >= L2BlockK) DO
  4531. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4532. L2Block( adrA, adrB, adrC, L2BlockM, n, L2BlockK );
  4533. INC( adrA, L2BlockM * L2BlockK * Size );
  4534. INC( adrB, L2BlockK * n * Size ); DEC( k, L2BlockK );
  4535. END;
  4536. IF k > 0 THEN
  4537. L2Block( adrA, adrB, adrC, L2BlockM, n, k );
  4538. END;
  4539. INC( a1, L2BlockM * KAligned );
  4540. INC( adrC, L2BlockM * strideC ); DEC( m, L2BlockM );
  4541. END;
  4542. IF (m = 0) THEN RETURN
  4543. END;
  4544. adrA := a1; adrB := b1; k := K;
  4545. WHILE (k >= L2BlockK) DO
  4546. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4547. L2Block( adrA, adrB, adrC, m, n, L2BlockK );
  4548. INC( adrA, L2BlockK * m * Size );
  4549. INC( adrB, L2BlockK * n * Size ); DEC( k, L2BlockK );
  4550. END;
  4551. IF k > 0 THEN L2Block( adrA, adrB, adrC, m, n, k );
  4552. END;
  4553. END L3BlockR;
  4554. PROCEDURE Align( adr: ADDRESS; align: SIZE ): ADDRESS;
  4555. BEGIN
4556. RETURN adr + (-adr) MOD align; (* round adr up to the next multiple of align, e.g. 16 bytes = 128 bits *)
  4557. END Align;
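(* For example, Align( 13, 16 ) = 13 + (-13) MOD 16 = 13 + 3 = 16, while Align( 32, 16 ) = 32:
   addresses that already sit on the boundary are returned unchanged. *)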
  4558. PROCEDURE CopyAX( matrixA, dest: ADDRESS;
  4559. IncA, StrideA: SIZE;
  4560. K, M, L2BlockK, L2BlockM: SIZE );
  4561. VAR m, k: SIZE; adrA: ADDRESS; t: HUGEINT;
  4562. PROCEDURE CopyMK( matrixA: ADDRESS; M, K: SIZE );
  4563. VAR rest: SIZE;
  4564. BEGIN
  4565. IF debug THEN
  4566. KernelLog.String( "CopyMK:" ); KernelLog.Int( M, 10 ); KernelLog.Int( K, 10 );
  4567. KernelLog.Ln;
  4568. END;
  4569. rest := (-K) MOD 2;
  4570. WHILE (M > 0) DO
  4571. MovX( matrixA, dest, IncA, K ); INC( dest, K * 8 );
  4572. IF rest # 0 THEN
  4573. ZeroX( dest, rest ); INC( dest, 8 * rest );
  4574. END;
  4575. INC( matrixA, StrideA ); DEC( M );
  4576. END;
  4577. END CopyMK;
  4578. BEGIN
  4579. Tic( t ); m := M;
  4580. WHILE (m >= L2BlockM) DO
  4581. k := K; adrA := matrixA;
  4582. WHILE (k >= L2BlockK) DO
  4583. CopyMK( adrA, L2BlockM, L2BlockK );
  4584. INC( adrA, L2BlockK * IncA ); DEC( k, L2BlockK );
  4585. END;
  4586. IF k > 0 THEN CopyMK( adrA, L2BlockM, k ); END;
  4587. INC( matrixA, L2BlockM * StrideA ); DEC( m, L2BlockM );
  4588. END;
  4589. adrA := matrixA; k := K;
  4590. WHILE (k >= L2BlockK) DO
  4591. CopyMK( adrA, m, L2BlockK );
  4592. INC( adrA, L2BlockK * IncA ); DEC( k, L2BlockK );
  4593. END;
  4594. IF k > 0 THEN CopyMK( adrA, m, k ); END;
  4595. Toc( t, copyT );
  4596. END CopyAX;
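(* CopyAX packs rows of A into contiguous lines of Align2( K ) LONGREALs: each row of K
   values is copied and, when K is odd, padded with one zero so that every packed row
   occupies a multiple of 16 bytes. For example, K = 7 gives rest = (-7) MOD 2 = 1, i.e.
   7 copied values followed by one zero LONGREAL (8 * 8 = 64 bytes per row). *)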
  4597. PROCEDURE CopyAR( matrixA, dest: ADDRESS;
  4598. IncA, StrideA: SIZE;
  4599. K, M, L2BlockK, L2BlockM: SIZE );
  4600. VAR m, k: SIZE; adrA: ADDRESS; t: HUGEINT;
  4601. PROCEDURE CopyMK( matrixA: ADDRESS; M, K: SIZE );
  4602. VAR rest: SIZE;
  4603. BEGIN
  4604. rest := (-K) MOD 4;
  4605. WHILE (M > 0) DO
  4606. MovR( matrixA, dest, IncA, K ); INC( dest, K * 4 );
  4607. IF rest # 0 THEN
  4608. ZeroR( dest, rest ); INC( dest, 4 * rest );
  4609. END;
  4610. INC( matrixA, StrideA ); DEC( M );
  4611. END;
  4612. END CopyMK;
  4613. BEGIN
  4614. Tic( t ); m := M;
  4615. WHILE (m >= L2BlockM) DO
  4616. k := K; adrA := matrixA;
  4617. WHILE (k >= L2BlockK) DO
  4618. CopyMK( adrA, L2BlockM, L2BlockK );
  4619. INC( adrA, L2BlockK * IncA ); DEC( k, L2BlockK );
  4620. END;
  4621. IF k > 0 THEN CopyMK( adrA, L2BlockM, k ); END;
  4622. INC( matrixA, L2BlockM * StrideA ); DEC( m, L2BlockM );
  4623. END;
  4624. adrA := matrixA; k := K;
  4625. WHILE (k >= L2BlockK) DO
  4626. CopyMK( adrA, m, L2BlockK );
  4627. INC( adrA, L2BlockK * IncA ); DEC( k, L2BlockK );
  4628. END;
  4629. IF k > 0 THEN CopyMK( adrA, m, k ); END;
  4630. Toc( t, copyT );
  4631. END CopyAR;
  4632. PROCEDURE CopyBX( matrixB, dest: ADDRESS;
  4633. IncB, StrideB: SIZE;
  4634. N, K, L2BlockN, L2BlockK: SIZE );
  4635. VAR n, k: SIZE; adrB: ADDRESS; t: HUGEINT;
  4636. PROCEDURE Copy5x2k( matrixB: ADDRESS; k: SIZE );
  4637. VAR i: SIZE; adrB: ADDRESS; rest: SIZE;
  4638. BEGIN
  4639. rest := (-k) MOD 2;
4640. WHILE (k >= 2) DO (* store 5x2 Block in line *)
  4641. adrB := matrixB;
  4642. FOR i := 1 TO L1BlockN DO
  4643. MovX( adrB, dest, StrideB, 2 ); INC( dest, 16 );
  4644. INC( adrB, IncB );
  4645. END;
  4646. INC( matrixB, 2 * StrideB ); DEC( k, 2 );
  4647. END;
  4648. IF k > 0 THEN
  4649. adrB := matrixB;
  4650. FOR i := 1 TO L1BlockN DO
  4651. MovX( adrB, dest, StrideB, k ); INC( dest, 8 * k );
  4652. IF rest # 0 THEN
  4653. ZeroX( dest, rest ); INC( dest, rest * 8 );
  4654. END;
  4655. INC( adrB, IncB );
  4656. END;
  4657. END;
  4658. END Copy5x2k;
  4659. PROCEDURE Copy1( matrixB: ADDRESS; K, N:SIZE );
  4660. VAR n, rest: SIZE;
  4661. BEGIN
  4662. rest := (-K) MOD 2;
  4663. IF debug THEN
  4664. KernelLog.String( ">>Copy1" ); KernelLog.Int( K, 10 ); KernelLog.Int( N, 10 ); KernelLog.Ln;
  4665. END;
  4666. n := N;
  4667. WHILE (n >= L1BlockN) DO (* store Kx5 Blocks as one line *)
  4668. Copy5x2k( matrixB, K );
  4669. IF debug THEN ASSERT( dest MOD 16 = 0 ); END;
  4670. INC( matrixB, L1BlockN * IncB ); DEC( n, L1BlockN );
  4671. END;
  4672. IF debug THEN KernelLog.String( "Copy1, n=" ); KernelLog.Int( n, 10 ); KernelLog.Ln;
  4673. END;
  4674. WHILE (n > 0) DO (* store remaining rectangle in separate lines *)
  4675. MovX( matrixB, dest, StrideB, K ); INC( dest, K * 8 );
4676. ZeroX( dest, rest ); INC( dest, rest * 8 ); (* pad with zero LONGREALs *)
  4677. INC( matrixB, IncB ); DEC( n );
  4678. END;
  4679. END Copy1;
  4680. BEGIN
  4681. Tic( t ); ASSERT( L2BlockN MOD L1BlockN = 0 );
  4682. ASSERT( L2BlockK MOD 2 = 0 ); n := N;
  4683. WHILE (n >= L2BlockN) DO
  4684. k := K; adrB := matrixB;
  4685. WHILE (k >= L2BlockK) DO
  4686. Copy1( adrB, L2BlockK, L2BlockN );
  4687. INC( adrB, L2BlockK * StrideB ); DEC( k, L2BlockK );
  4688. END;
  4689. IF k > 0 THEN Copy1( adrB, k, L2BlockN ) END;
  4690. INC( matrixB, L2BlockN * IncB ); DEC( n, L2BlockN );
  4691. END;
  4692. IF (n = 0) THEN RETURN
  4693. END;
  4694. k := K; adrB := matrixB;
  4695. WHILE (k >= L2BlockK) DO
  4696. Copy1( adrB, L2BlockK, n );
  4697. INC( adrB, L2BlockK * StrideB ); DEC( k, L2BlockK );
  4698. END;
  4699. Copy1( adrB, k, n ); Toc( t, copyT );
  4700. END CopyBX;
  4701. PROCEDURE CopyBR( matrixB, dest: ADDRESS;
  4702. IncB, StrideB: SIZE;
  4703. N, K, L2BlockN, L2BlockK: SIZE );
  4704. VAR n, k: SIZE; adrB: ADDRESS; t: HUGEINT;
  4705. PROCEDURE Copy5x4k( matrixB: ADDRESS; k: SIZE );
  4706. VAR i: SIZE; adrB: ADDRESS; rest, k4: SIZE;
  4707. BEGIN
  4708. k4 := k - k MOD 4; rest := (-k) MOD 4;
  4709. IF k4 > 0 THEN
  4710. MovR5( matrixB, IncB, StrideB, dest, k4 );
  4711. INC( matrixB, k4 * StrideB ); INC( dest, k4 * 80 DIV 4 );
  4712. DEC( k, k4 );
  4713. END;
  4714. (*
  4715. WHILE (k >= 4) DO (* store 5x4 Block in line *)
  4716. adrB := matrixB;
  4717. FOR i := 1 TO L1BlockN DO
  4718. MovR( adrB, dest, StrideB, 4 ); INC( dest, 16 ); INC( adrB, IncB );
  4719. END;
  4720. INC( matrixB, 4 * StrideB ); DEC( k, 4 );
  4721. END;
  4722. *)
  4723. IF k > 0 THEN
  4724. adrB := matrixB;
  4725. FOR i := 1 TO L1BlockN DO
  4726. MovR( adrB, dest, StrideB, k ); INC( dest, 4 * k );
  4727. IF rest # 0 THEN
  4728. ZeroR( dest, rest ); INC( dest, rest * 4 );
  4729. END;
  4730. INC( adrB, IncB );
  4731. END;
  4732. END;
  4733. END Copy5x4k;
  4734. PROCEDURE Copy1( matrixB: ADDRESS; K, N:SIZE );
  4735. VAR n, rest: SIZE;
  4736. BEGIN
  4737. rest := (-K) MOD 4;
  4738. IF debug THEN
  4739. KernelLog.String( ">>Copy1" ); KernelLog.Int( K, 10 ); KernelLog.Int( N, 10 ); KernelLog.Ln;
  4740. END;
  4741. n := N;
  4742. WHILE (n >= L1BlockN) DO (* store Kx5 Blocks as one line *)
  4743. Copy5x4k( matrixB, K );
  4744. IF debug THEN ASSERT( dest MOD 16 = 0 ); END;
  4745. INC( matrixB, L1BlockN * IncB ); DEC( n, L1BlockN );
  4746. END;
  4747. WHILE (n > 0) DO (* store remaining rectangle in separate lines *)
  4748. MovR( matrixB, dest, StrideB, K ); INC( dest, K * 4 );
  4749. ZeroR( dest, rest ); INC( dest, rest * 4 );
  4750. INC( matrixB, IncB ); DEC( n );
  4751. END;
  4752. END Copy1;
  4753. BEGIN
  4754. Tic( t ); ASSERT( L2BlockN MOD L1BlockN = 0 );
  4755. ASSERT( L2BlockK MOD 4 = 0 ); n := N;
  4756. WHILE (n >= L2BlockN) DO
  4757. k := K; adrB := matrixB;
  4758. WHILE (k >= L2BlockK) DO
  4759. Copy1( adrB, L2BlockK, L2BlockN );
  4760. INC( adrB, L2BlockK * StrideB ); DEC( k, L2BlockK );
  4761. END;
  4762. IF k > 0 THEN Copy1( adrB, k, L2BlockN ) END;
  4763. INC( matrixB, L2BlockN * IncB ); DEC( n, L2BlockN );
  4764. END;
  4765. IF (n = 0) THEN RETURN
  4766. END;
  4767. k := K; adrB := matrixB;
  4768. WHILE (k >= L2BlockK) DO
  4769. Copy1( adrB, L2BlockK, n );
  4770. INC( adrB, L2BlockK * StrideB ); DEC( k, L2BlockK );
  4771. END;
  4772. Copy1( adrB, k, n ); Toc( t, copyT );
  4773. END CopyBR;
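(* CopyBR packs B for the REAL kernels: full groups of L1BlockN = 5 columns are stored as
   one contiguous K x 5 panel (MovR5 handles the multiples of 4 rows in one call), K is
   padded up to Align4( K ) with zeros, and any leftover columns (< 5) are stored as
   separate padded lines by the second loop of Copy1. As an illustration, N = 12, K = 10
   yields two 5-column panels of 12 * 5 REALs each (10 copied rows plus 2 zero rows) and
   two single padded columns of 12 REALs each. *)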
  4774. (*
  4775. PROCEDURE FillMR( VAR A: ARRAY [ .. , .. ] OF REAL );
  4776. VAR i, j: LONGINT;
  4777. BEGIN
  4778. FOR i := 0 TO LEN( A, 0 ) - 1 DO
  4779. FOR j := 0 TO LEN( A, 1 ) - 1 DO
  4780. A[i, j] := ran.Dice( 10 );
  4781. IF debug THEN A[i, j] := 10 * i + j; END;
  4782. END;
  4783. END;
  4784. END FillMR;
  4785. PROCEDURE DispMR( VAR A: ARRAY [ .. , .. ] OF REAL );
  4786. VAR i, j: LONGINT;
  4787. BEGIN
  4788. FOR i := 0 TO LEN( A, 0 ) - 1 DO
  4789. FOR j := 0 TO LEN( A, 1 ) - 1 DO KernelLog.Int( ENTIER( A[i, j] + 0.5 ), 5 ); END;
  4790. KernelLog.Ln;
  4791. END;
  4792. END DispMR;
  4793. PROCEDURE FillMX( VAR A: ARRAY [ .. , .. ] OF LONGREAL );
  4794. VAR i, j: LONGINT;
  4795. BEGIN
  4796. FOR i := 0 TO LEN( A, 0 ) - 1 DO
  4797. FOR j := 0 TO LEN( A, 1 ) - 1 DO
  4798. A[i, j] := ran.Dice( 10 );
  4799. IF debug THEN A[i, j] := 10 * i + j; END;
  4800. END;
  4801. END;
  4802. END FillMX;
  4803. PROCEDURE DispMX( VAR A: ARRAY [ .. , .. ] OF LONGREAL );
  4804. VAR i, j: LONGINT;
  4805. BEGIN
  4806. FOR i := 0 TO LEN( A, 0 ) - 1 DO
  4807. FOR j := 0 TO LEN( A, 1 ) - 1 DO KernelLog.Int( ENTIER( A[i, j] + 0.5 ), 5 ); END;
  4808. KernelLog.Ln;
  4809. END;
  4810. END DispMX;
  4811. *)
  4812. PROCEDURE Tic( VAR t: HUGEINT );
  4813. BEGIN
  4814. t := Machine.GetTimer();
  4815. END Tic;
  4816. PROCEDURE Toc( VAR t, addto: HUGEINT );
  4817. BEGIN
  4818. INC( addto, Machine.GetTimer() - t ); t := Machine.GetTimer();
  4819. END Toc;
  4820. PROCEDURE MultiplyX( A, B, C: ADDRESS;
  4821. M, N, K, L2BlockM, L2BlockN, L2BlockK:SIZE;
  4822. IncA, StrideA, IncB, StrideB, IncC, StrideC: SIZE;
  4823. add: BOOLEAN );
  4824. VAR lenA, lenB: SIZE; adrA, adrB, adrC: ADDRESS; m: SIZE;
  4825. M1, M2, i: SIZE; val: LONGREAL; t: HUGEINT;
  4826. inc: SIZE;
  4827. obj: POINTER TO ARRAY OF MultiplyObjectX;
  4828. cache: Cache;
  4829. BEGIN
  4830. NEW(obj,nrProcesses+1);
  4831. lenA := M * Align2( K ) * 8; lenB := N * Align2( K ) * 8;
  4832. cache := cachePool.Acquire( lenA + lenB );
  4833. adrA := cache.adr; adrB := adrA + lenA;
  4834. CopyAX( A, adrA, IncA, StrideA, K, M, L2BlockK, L2BlockM );
  4835. CopyBX( B, adrB, IncB, StrideB, N, K, L2BlockN, L2BlockK );
  4836. Tic( t ); m := M; adrC := C;
  4837. IF ~add THEN
  4838. WHILE (m > 0) DO
  4839. ZeroXI( adrC, IncC, N ); INC( adrC, StrideC ); DEC( m );
  4840. END;
  4841. END;
  4842. Toc( t, zeroT );
  4843. IF debug THEN
  4844. KernelLog.String( "copy of A: " ); KernelLog.Ln;
  4845. FOR i := 0 TO M * Align2( K ) - 1 DO
  4846. SYSTEM.GET( adrA + i * 8, val );
  4847. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); KernelLog.Ln;
  4848. END;
  4849. END;
  4850. IF debug THEN
  4851. KernelLog.String( "copy of B: " ); KernelLog.Ln;
  4852. FOR i := 0 TO N * Align2( K ) - 1 DO
  4853. SYSTEM.GET( adrB + i * 8, val );
  4854. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); KernelLog.Ln;
  4855. END;
  4856. END;
  4857. IF parallel & (M > L2BlockM) THEN
  4858. inc := Align( MAX(M DIV nrProcesses,L2BlockM), L2BlockM ); M1 := 0;
  4859. i := 0;
  4860. WHILE (M1 < M) DO
  4861. M2 := M1 + inc;
  4862. IF M2 > M THEN M2 := M END;
  4863. NEW( obj[i], adrA + M1 * Align2( K ) * 8, adrB,
  4864. C + StrideC * M1, M2 - M1, N, K, IncC, StrideC,
  4865. L2BlockM, L2BlockN, L2BlockK );
  4866. M1 := M2; INC( i );
  4867. END;
  4868. WHILE (i > 0) DO DEC( i ); obj[i].Wait; END;
  4869. ELSE
  4870. L3BlockX( adrA, adrB, C, M, N, K, IncC, StrideC, L2BlockM,
  4871. L2BlockN, L2BlockK );
  4872. END;
  4873. Toc( t, compT ); cachePool.Release( cache );
  4874. END MultiplyX;
  4875. PROCEDURE MultiplyR( A, B, C: ADDRESS;
  4876. M, N, K, L2BlockM, L2BlockN, L2BlockK: SIZE;
  4877. IncA, StrideA, IncB, StrideB, IncC, StrideC: SIZE;
  4878. add: BOOLEAN );
  4879. VAR lenA, lenB: SIZE; adrA, adrB, adrC: ADDRESS; m: SIZE;
  4880. M1, M2, i: SIZE; val: REAL; inc: SIZE;
  4881. obj: POINTER TO ARRAY OF MultiplyObjectR;
  4882. t: HUGEINT; cache: Cache;
  4883. BEGIN
  4884. NEW(obj,nrProcesses+1);
  4885. lenA := M * Align4( K ) * 4; lenB := N * Align4( K ) * 4;
  4886. cache := cachePool.Acquire( lenA + lenB );
  4887. adrA := cache.adr; adrB := adrA + lenA;
  4888. CopyAR( A, adrA, IncA, StrideA, K, M, L2BlockK, L2BlockM );
  4889. CopyBR( B, adrB, IncB, StrideB, N, K, L2BlockN, L2BlockK );
  4890. Tic( t ); m := M; adrC := C;
  4891. IF ~add THEN
  4892. WHILE (m > 0) DO
  4893. ZeroRI( adrC, IncC, N ); INC( adrC, StrideC );
  4894. DEC( m );
  4895. END;
  4896. END;
  4897. Toc( t, zeroT );
  4898. IF debug THEN
  4899. KernelLog.String( "copy of A: " ); KernelLog.Ln;
  4900. FOR i := 0 TO M * Align4( K ) - 1 DO
  4901. SYSTEM.GET( adrA + i * 4, val );
  4902. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); KernelLog.Ln;
  4903. END;
  4904. END;
  4905. IF debug THEN
  4906. KernelLog.String( "copy of B: " ); KernelLog.Ln;
  4907. FOR i := 0 TO N * Align4( K ) - 1 DO
  4908. SYSTEM.GET( adrB + i * 4, val );
  4909. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); KernelLog.Ln;
  4910. END;
  4911. END;
  4912. IF parallel & (M > L2BlockM) THEN
  4913. inc := Align( MAX(M DIV nrProcesses,L2BlockM), L2BlockM ); M1 := 0;
  4914. i := 0;
  4915. WHILE (M1 < M) DO
  4916. M2 := M1 + inc;
  4917. IF M2 > M THEN M2 := M END;
  4918. NEW( obj[i], adrA + M1 * Align4( K ) * 4, adrB,
  4919. C + StrideC * M1, M2 - M1, N, K, IncC, StrideC,
  4920. L2BlockM, L2BlockN, L2BlockK );
  4921. M1 := M2; INC( i );
  4922. END;
  4923. WHILE (i > 0) DO DEC( i ); obj[i].Wait; END;
  4924. ELSE
  4925. L3BlockR( adrA, adrB, C, M, N, K, IncC, StrideC, L2BlockM,
  4926. L2BlockN, L2BlockK );
  4927. END;
  4928. Toc( t, compT ); cachePool.Release( cache );
  4929. END MultiplyR;
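(* Parallel splitting in MultiplyR/MultiplyX, with illustrative numbers: for M = 1000,
   nrProcesses = 4 and L2BlockM = 128, inc = Align( MAX( 250, 128 ), 128 ) = 256, so the
   rows are divided into the chunks [0,256), [256,512), [512,768), [768,1000) and each
   chunk is handed to one MultiplyObjectR/MultiplyObjectX; the last chunk simply carries
   the remainder rows. *)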
  4930. (*
  4931. PROCEDURE DoTestX( M, N, K, L2BlockM, L2BlockN, L2BlockK: LONGINT; check: BOOLEAN; iter: LONGINT );
  4932. VAR (* A, B, C, D: ARRAY [ .. , .. ] OF REAL; *)
  4933. A, B, C, D: ARRAY [ .. , .. ] OF LONGREAL;
  4934. p, q: ANY; l1, l2: LONGINT; adrA, adrB, len, lenA, lenB: LONGINT;
  4935. matrixA, matrixB, matrixC: LONGINT; i: LONGINT; val: LONGREAL; atime, time: LONGINT;
  4936. BEGIN
  4937. KernelLog.String( "LONGREAL" ); KernelLog.Ln; KernelLog.String( "M=" ); KernelLog.Int( M, 0 ); KernelLog.Ln;
  4938. KernelLog.String( "N=" ); KernelLog.Int( N, 0 ); KernelLog.Ln; KernelLog.String( "K=" ); KernelLog.Int( K, 0 );
  4939. KernelLog.Ln; KernelLog.String( "L2BlockM=" ); KernelLog.Int( L2BlockM, 0 ); KernelLog.Ln; KernelLog.String( "L2BlockN=" );
  4940. KernelLog.Int( L2BlockN, 0 ); KernelLog.Ln; KernelLog.String( "L2BlockK=" ); KernelLog.Int( L2BlockK, 0 );
  4941. KernelLog.Ln; NEW( A, M, K ); NEW( B, K, N ); NEW( C, M, N ); FillMX( A ); FillMX( B );
  4942. IF debug THEN DispMX( A ); KernelLog.String( "*" ); KernelLog.Ln; DispMX( B );
  4943. END;
  4944. atime := Input.Time(); (* C := 0; *)
  4945. WHILE (iter > 0) DO
  4946. MultiplyX( ADDRESSOF( A[0, 0] ), ADDRESSOF( B[0, 0] ),
  4947. ADDRESSOF( C[0, 0] ), M, N, K, L2BlockM, L2BlockN, L2BlockK,
  4948. (*
  4949. 8,
  4950. LEN( A, 1 ) * 8, 8, LEN( B, 1 ) * 8, 8, LEN( C, 1 ) * 8
  4951. *)
  4952. SYSTEM.INCR( A, 1 ), SYSTEM.INCR( A, 0 ), SYSTEM.INCR( B, 1 ),
  4953. SYSTEM.INCR( B, 0 ), SYSTEM.INCR( C, 1 ), SYSTEM.INCR( C, 0 )
  4954. );
  4955. DEC( iter );
  4956. END;
  4957. atime := Input.Time() - atime; KernelLog.String( "overall time: " ); KernelLog.Int( atime, 10 ); KernelLog.Ln;
  4958. IF debug THEN
  4959. DispMX( A ); KernelLog.String( " * " ); KernelLog.Ln; DispMX( B ); KernelLog.String( " = " );
  4960. KernelLog.Ln; DispMX( C ); KernelLog.String( " ------------" ); KernelLog.Ln;
  4961. END;
  4962. IF check THEN
  4963. (*
  4964. NEW(D,M,N);
  4965. MatMulAXAXNaiive(ADDRESSOF( A[0, 0] ), ADDRESSOF( B[0, 0] ),
  4966. ADDRESSOF( D[0, 0] ), M, N, K, SYSTEM.INCR( A, 1 ), SYSTEM.INCR( A, 0 ), SYSTEM.INCR( B, 1 ),
  4967. SYSTEM.INCR( B, 0 ), SYSTEM.INCR( D, 1 ), SYSTEM.INCR( D, 0 ));
  4968. *)
  4969. D := A * B;
  4970. ASSERT ( ENTIER( D + 0.5 ) = ENTIER( C + 0.5 ) );
  4971. END;
  4972. END DoTestX;
  4973. PROCEDURE DoTestR( M, N, K, L2BlockM, L2BlockN, L2BlockK: LONGINT; check: BOOLEAN; iter: LONGINT );
  4974. VAR (* A, B, C, D: ARRAY [ .. , .. ] OF REAL; *)
  4975. A, B, C, D: ARRAY [ .. , .. ] OF REAL;
  4976. p, q: ANY; l1, l2: LONGINT; adrA, adrB, len, lenA, lenB: LONGINT;
  4977. matrixA, matrixB, matrixC: LONGINT; i: LONGINT; val: REAL; atime, time: LONGINT;
  4978. BEGIN
  4979. KernelLog.String( "REAL" ); KernelLog.Ln; KernelLog.String( "M=" ); KernelLog.Int( M, 0 ); KernelLog.Ln;
  4980. KernelLog.String( "N=" ); KernelLog.Int( N, 0 ); KernelLog.Ln; KernelLog.String( "K=" ); KernelLog.Int( K, 0 );
  4981. KernelLog.Ln; KernelLog.String( "L2BlockM=" ); KernelLog.Int( L2BlockM, 0 ); KernelLog.Ln; KernelLog.String( "L2BlockN=" );
  4982. KernelLog.Int( L2BlockN, 0 ); KernelLog.Ln; KernelLog.String( "L2BlockK=" ); KernelLog.Int( L2BlockK, 0 );
  4983. KernelLog.Ln; NEW( A, M, K ); NEW( B, K, N ); NEW( C, M, N ); FillMR( A ); FillMR( B );
  4984. IF debug THEN DispMR( A ); KernelLog.String( "*" ); KernelLog.Ln; DispMR( B );
  4985. END;
  4986. atime := Input.Time(); (* C := 0; *)
  4987. FOR i := 1 TO iter DO
  4988. MultiplyR( ADDRESSOF( A[0, 0] ), ADDRESSOF( B[0, 0] ),
  4989. ADDRESSOF( C[0, 0] ), M, N, K, L2BlockM, L2BlockN, L2BlockK,
  4990. (* 4,
  4991. LEN( A, 1 ) * 4, 4, LEN( B, 1 ) * 4, 4, LEN( C, 1 ) * 4 *)
  4992. SYSTEM.INCR( A, 1 ), SYSTEM.INCR( A, 0 ), SYSTEM.INCR( B, 1 ),
  4993. SYSTEM.INCR( B, 0 ), SYSTEM.INCR( C, 1 ), SYSTEM.INCR( C, 0 )
  4994. );
  4995. END;
  4996. atime := Input.Time() - atime; KernelLog.String( "overall time: " ); KernelLog.Int( atime, 10 ); KernelLog.Ln;
  4997. IF debug THEN
  4998. DispMR( A ); KernelLog.String( " * " ); KernelLog.Ln; DispMR( B ); KernelLog.String( " = " );
  4999. KernelLog.Ln; DispMR( C ); KernelLog.String( " ------------" ); KernelLog.Ln;
  5000. END;
  5001. IF check THEN
  5002. (*
  5003. NEW(D,M,N);
  5004. MatMulARARNaiive(ADDRESSOF( A[0, 0] ), ADDRESSOF( B[0, 0] ),
  5005. ADDRESSOF( D[0, 0] ), M, N, K, SYSTEM.INCR( A, 1 ), SYSTEM.INCR( A, 0 ), SYSTEM.INCR( B, 1 ),
  5006. SYSTEM.INCR( B, 0 ), SYSTEM.INCR( D, 1 ), SYSTEM.INCR( D, 0 ));
  5007. *)
  5008. D := A * B;
  5009. ASSERT ( ENTIER( D + 0.5 ) = ENTIER( C + 0.5 ) );
  5010. END;
  5011. END DoTestR;
  5012. PROCEDURE RandTestR*;
  5013. VAR iter, i, time: LONGINT;
  5014. MinM, MaxM, MinN, MaxN, MinK, MaxK, M, N, K, BM, BN, BK: LONGINT;
  5015. PROCEDURE Ran( Min, Max: LONGINT ): LONGINT;
  5016. BEGIN
  5017. IF Min = Max THEN RETURN Min
  5018. ELSE RETURN ran.Dice( Max - Min ) + Min
  5019. END;
  5020. END Ran;
  5021. BEGIN
  5022. In.Open(); In.LongInt( iter );
  5023. In.LongInt( MinM ); In.LongInt( MaxM ); In.LongInt( MinN );
  5024. In.LongInt( MaxN ); In.LongInt( MinK ); In.LongInt( MaxK );
  5025. FOR i := 1 TO iter DO KernelLog.Int( i, 10 ); KernelLog.Ln; DEC( iter );
  5026. M := Ran( MinM, MaxM ); N := Ran( MinN, MaxN );
  5027. K := Ran( MinK, MaxK );
  5028. IF N < 5 THEN N := 5 END;
  5029. IF K < 4 THEN K := 4 END;
  5030. BM := ran.Dice( M ) + 1; BN := ran.Dice( N ) + 1; BK := ran.Dice( K ) + 1;
  5031. BN := Align( BN, 5 );
  5032. IF BN > N THEN DEC( BN, 5 ) END;
  5033. BK := Align( BK, 4 );
  5034. IF BK > K THEN DEC( BK, 4 ) END;
  5035. DoTestR( M, N, K, BM, BN, BK, TRUE , 1 );
  5036. END;
  5037. END RandTestR;
  5038. PROCEDURE RandTestX*;
  5039. VAR iter, i, time: LONGINT;
  5040. MinM, MaxM, MinN, MaxN, MinK, MaxK, M, N, K, BM, BN, BK: LONGINT;
  5041. PROCEDURE Ran( Min, Max: LONGINT ): LONGINT;
  5042. BEGIN
  5043. IF Min = Max THEN RETURN Min
  5044. ELSE RETURN ran.Dice( Max - Min ) + Min
  5045. END;
  5046. END Ran;
  5047. BEGIN
  5048. In.Open(); In.LongInt( iter );
  5049. In.LongInt( MinM ); In.LongInt( MaxM ); In.LongInt( MinN );
  5050. In.LongInt( MaxN ); In.LongInt( MinK ); In.LongInt( MaxK );
  5051. FOR i := 1 TO iter DO KernelLog.Int( i, 10 ); KernelLog.Ln; DEC( iter );
  5052. M := Ran( MinM, MaxM ); N := Ran( MinN, MaxN );
  5053. K := Ran( MinK, MaxK );
  5054. IF N < 5 THEN N := 5 END;
  5055. IF K < 4 THEN K := 4 END;
  5056. BM := ran.Dice( M ) + 1; BN := ran.Dice( N ) + 1; BK := ran.Dice( K ) + 1;
  5057. BN := Align( BN, 5 );
  5058. IF BN > N THEN DEC( BN, 5 ) END;
  5059. BK := Align( BK, 4 );
  5060. IF BK > K THEN DEC( BK, 4 ) END;
  5061. DoTestX( M, N, K, BM, BN, BK, TRUE , 1 );
  5062. END;
  5063. END RandTestX;
  5064. *)
  5065. (*
  5066. PROCEDURE Times*;
  5067. VAR all: HUGEINT;
  5068. BEGIN
  5069. all := allocT + copyT + zeroT + compT; KernelLog.String( "alloc=" );
  5070. KernelLog.LongRealFix( allocT / 1000000, 0, 20 ); KernelLog.String( "[" );
  5071. KernelLog.Int( ENTIER( 100 * allocT / all + 0.5 ), 5 ); KernelLog.String( "%]" );
  5072. KernelLog.Ln; KernelLog.String( "copy=" );
  5073. KernelLog.LongRealFix( copyT / 1000000, 0, 20 ); KernelLog.String( "[" );
  5074. KernelLog.Int( ENTIER( 100 * copyT / all + 0.5 ), 5 ); KernelLog.String( "%]" );
  5075. KernelLog.Ln; KernelLog.String( "zero=" );
  5076. KernelLog.LongRealFix( zeroT / 1000000, 0, 20 ); KernelLog.String( "[" );
  5077. KernelLog.Int( ENTIER( 100 * zeroT / all + 0.5 ), 5 ); KernelLog.String( "%]" );
  5078. KernelLog.Ln; KernelLog.String( "comp=" );
  5079. KernelLog.LongRealFix( compT / 1000000, 0, 20 ); KernelLog.String( "[" );
  5080. KernelLog.Int( ENTIER( 100 * compT / all + 0.5 ), 5 ); KernelLog.String( "%]" );
  5081. KernelLog.Ln;
  5082. END Times;
  5083. *)
  5084. (*
  5085. PROCEDURE TestRMM*;
  5086. VAR M, N, K, L2BlockM, L2BlockN, L2BlockK: LONGINT; matrixA, matrixB, matrixC: LONGINT;
  5087. check, iter: LONGINT;
  5088. BEGIN
  5089. In.Open; In.LongInt( M ); In.LongInt( N ); In.LongInt( K );
  5090. In.LongInt( L2BlockM ); In.LongInt( L2BlockN ); In.LongInt( L2BlockK );
  5091. In.LongInt( iter ); In.LongInt( check );
  5092. IF L2BlockM = 0 THEN
  5093. MagicBlockR( M DIV nrProcesses, N, K, L2BlockM, L2BlockN, L2BlockK );
  5094. END;
  5095. DoTestR( M, N, K, L2BlockM, L2BlockN, L2BlockK, check = 1, iter ); Times();
  5096. END TestRMM;
  5097. PROCEDURE TestXMM*;
  5098. VAR M, N, K, L2BlockM, L2BlockN, L2BlockK: LONGINT; matrixA, matrixB, matrixC: LONGINT;
  5099. iter, check: LONGINT;
  5100. BEGIN
  5101. In.Open; In.LongInt( M ); In.LongInt( N ); In.LongInt( K );
  5102. In.LongInt( L2BlockM ); In.LongInt( L2BlockN ); In.LongInt( L2BlockK );
  5103. In.LongInt( iter ); In.LongInt( check );
  5104. IF L2BlockM = 0 THEN
  5105. MagicBlockX( M DIV nrProcesses, N, K, L2BlockM, L2BlockN, L2BlockK );
  5106. END;
  5107. DoTestX( M, N, K, L2BlockM, L2BlockN, L2BlockK, check = 1, iter ); Times();
  5108. END TestXMM;
  5109. *)
  5110. (****** matrix multiplication using fast scalar product ******)
  5111. PROCEDURE MatMulAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  5112. BEGIN
  5113. SYSTEM.PUT( dadr, 0.0D0 ); (* initialization of scalar product to 0 *)
  5114. SPAXAXLoopA( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  5115. END MatMulAXAXLoopA;
  5116. PROCEDURE MatMulAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  5117. BEGIN
  5118. SYSTEM.PUT( dadr, 0.0D0 ); (* initialization of scalar product to 0 *)
  5119. SPAXAXLoopSSE( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  5120. END MatMulAXAXLoopSSE;
  5121. PROCEDURE MatMulARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  5122. BEGIN
  5123. SYSTEM.PUT( dadr, 0.0E0 ); (* initialization of scalar product to 0 *)
  5124. SPARARLoopA( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  5125. END MatMulARARLoopA;
  5126. PROCEDURE MatMulARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  5127. BEGIN
  5128. SYSTEM.PUT( dadr, 0.0E0 ); (* initialization of scalar product to 0 *)
  5129. SPARARLoopSSE( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  5130. END MatMulARARLoopSSE;
  5131. PROCEDURE MatMulIncAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  5132. BEGIN
  5133. SPAXAXLoopA( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  5134. END MatMulIncAXAXLoopA;
  5135. PROCEDURE MatMulIncAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  5136. BEGIN
  5137. SPAXAXLoopSSE( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  5138. END MatMulIncAXAXLoopSSE;
  5139. PROCEDURE MatMulIncARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  5140. BEGIN
  5141. SPARARLoopA( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  5142. END MatMulIncARARLoopA;
  5143. PROCEDURE MatMulIncARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  5144. BEGIN
  5145. SPARARLoopSSE( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  5146. END MatMulIncARARLoopSSE;
5147. (****** matrix multiplication over rows with transposition of B ******)
  5148. PROCEDURE MatMulHBlockR( MatrixA, MatrixB, MatrixC: ADDRESS;
  5149. (*inc=4*) Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
  5150. add: BOOLEAN );
  5151. VAR fromA, toA, fromB, toB: SIZE; BlockSize: SIZE;
  5152. (*
  5153. computation of C[i,j] = Sum{k=0..Cols-1} A[i,k]*B[j,k], i.e. A*B`
  5154. *)
  5155. PROCEDURE Block( fromA, toA, fromB, toB: SIZE );
  5156. VAR i, j: SIZE; adrA, adrB, adrC: ADDRESS;
  5157. BEGIN
  5158. FOR i := fromA TO toA - 1 DO
  5159. adrA := MatrixA + i * Stride;
  5160. FOR j := fromB TO toB - 1 DO
  5161. adrB := MatrixB + j * Stride;
  5162. adrC := MatrixC + i * StrideC + j * IncC;
  5163. AlignedSPRSSE( adrA, adrB, adrC, Cols, add );
  5164. END;
  5165. END;
  5166. END Block;
  5167. BEGIN
  5168. IF cBlockSize = 0 THEN
  5169. BlockSize := L2CacheSize DIV Stride DIV 4;
  5170. ELSE BlockSize := cBlockSize;
  5171. END;
  5172. lastUsedBlockSize := BlockSize;
  5173. fromA := 0;
  5174. REPEAT
  5175. toA := fromA + BlockSize;
  5176. IF toA > RowsA THEN toA := RowsA END;
  5177. fromB := 0;
  5178. REPEAT
  5179. toB := fromB + BlockSize;
  5180. IF toB > RowsB THEN toB := RowsB END;
  5181. Block( fromA, toA, fromB, toB ); fromB := toB;
  5182. UNTIL toB = RowsB;
  5183. fromA := toA;
  5184. UNTIL toA = RowsA;
  5185. END MatMulHBlockR;
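(* Block size choice above, as a numeric sketch (the cache size is configuration
   dependent; assuming L2CacheSize = 512 * 1024 for the arithmetic): with Stride = 1024
   bytes, i.e. 256 REALs per aligned row, BlockSize = 512 * 1024 DIV 1024 DIV 4 = 128, so
   roughly 128 rows of A and 128 rows of B` are paired per Block call and both stripes
   stay resident in the L2 cache together. The LONGREAL variant below divides by 8
   instead of 4. *)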
  5186. PROCEDURE MatMulHBlockX( MatrixA, MatrixB, MatrixC: ADDRESS;
  5187. (*inc=4*) Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
  5188. add: BOOLEAN );
  5189. VAR fromA, toA, fromB, toB: SIZE; BlockSize: SIZE;
  5190. (*
  5191. computation of C[i,j] = Sum{k=0..Cols-1} A[i,k]*B[j,k], i.e. A*B`
  5192. *)
  5193. PROCEDURE Block( fromA, toA, fromB, toB: SIZE );
  5194. VAR adrA, adrB, adrC: ADDRESS; i, j: SIZE;
  5195. BEGIN
  5196. FOR i := fromA TO toA - 1 DO
  5197. adrA := MatrixA + i * Stride;
  5198. FOR j := fromB TO toB - 1 DO
  5199. adrB := MatrixB + j * Stride;
  5200. adrC := MatrixC + i * StrideC + j * IncC;
  5201. AlignedSPXSSE( adrA, adrB, adrC, Cols, add );
  5202. END;
  5203. END;
  5204. END Block;
  5205. BEGIN
  5206. IF cBlockSize = 0 THEN
  5207. BlockSize := L2CacheSize DIV Stride DIV 8;
  5208. ELSE BlockSize := cBlockSize;
  5209. END;
  5210. lastUsedBlockSize := BlockSize;
  5211. fromA := 0;
  5212. REPEAT
  5213. toA := fromA + BlockSize;
  5214. IF toA > RowsA THEN toA := RowsA END;
  5215. fromB := 0;
  5216. REPEAT
  5217. toB := fromB + BlockSize;
  5218. IF toB > RowsB THEN toB := RowsB END;
  5219. Block( fromA, toA, fromB, toB ); fromB := toB;
  5220. UNTIL toB = RowsB;
  5221. fromA := toA;
  5222. UNTIL toA = RowsA;
  5223. END MatMulHBlockX;
  5224. PROCEDURE CopyDataR( src, dest: ADDRESS; incSrc, strideSrc, incDest, strideDest, rows, cols:SIZE ); (*! optimize *)
  5225. VAR i: SIZE; t: HUGEINT;
  5226. BEGIN
  5227. Tic( t );
  5228. FOR i := 0 TO rows - 1 DO
  5229. Copy4( src, dest, incSrc, incDest, cols );
  5230. INC( src, strideSrc ); INC( dest, strideDest );
  5231. END;
  5232. Toc( t, copyT );
  5233. END CopyDataR;
  5234. PROCEDURE CopyDataX( src, dest: ADDRESS; incSrc, strideSrc, incDest, strideDest, rows, cols:SIZE ); (*! optimize *)
  5235. VAR i: SIZE; t: HUGEINT;
  5236. BEGIN
  5237. Tic( t );
  5238. FOR i := 0 TO rows - 1 DO
  5239. Copy8( src, dest, incSrc, incDest, cols );
  5240. INC( src, strideSrc ); INC( dest, strideDest );
  5241. END;
  5242. Toc( t, copyT );
  5243. END CopyDataX;
  5244. PROCEDURE MatMulARARTransposed( matrixA, matrixB, matrixC: ADDRESS;
  5245. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  5246. add: BOOLEAN ): BOOLEAN;
  5247. VAR stride: SIZE; adrB, adrC: ADDRESS;
  5248. proc: POINTER TO ARRAY OF MatMulHObjR;
  5249. from, to0, i: SIZE; cacheA, cacheB: Cache;
  5250. t: HUGEINT;
  5251. BEGIN
  5252. NEW(proc,nrProcesses);
  5253. ASSERT( ColsA = RowsB );
  5254. (* allocate 128 bit = 16 byte aligned matrix *)
  5255. stride := Align( ColsA * SIZEOF( REAL ), 16 );
  5256. IF (IncA # SIZEOF( REAL )) OR (StrideA # stride) OR
  5257. (matrixA MOD 16 # 0) THEN
  5258. cacheA := cachePool.Acquire( stride * RowsA );
  5259. CopyDataR( matrixA, cacheA.adr, IncA, StrideA,
  5260. SIZEOF( REAL ), stride, RowsA, ColsA ); (* copy to array *)
  5261. matrixA := cacheA.adr;
  5262. ELSE cacheA := NIL;
  5263. END;
  5264. IF (StrideB # SIZEOF( REAL )) OR (IncB # stride) OR
  5265. (matrixB MOD 16 # 0) THEN
  5266. cacheB := cachePool.Acquire( stride * ColsB );
  5267. CopyDataR( matrixB, cacheB.adr, StrideB, IncB,
  5268. SIZEOF( REAL ), stride, ColsB, RowsB ); (* copy to transposed array *)
  5269. matrixB := cacheB.adr;
  5270. ELSE cacheB := NIL;
  5271. END;
  5272. Tic( t );
5273. (*! needs a decision rule whether to split by rows or by columns *)
  5274. IF nrProcesses > 1 THEN
  5275. from := 0;
  5276. FOR i := 0 TO nrProcesses - 1 DO
  5277. (*
  5278. to := RowsA * (i + 1) DIV nrProcesses; adrA := matrixA + from * stride;
  5279. adrC := matrixC + from * StrideC;
  5280. *)
  5281. to0 := ColsB * (i + 1) DIV nrProcesses; (*! move to rows ? *)
  5282. adrB := matrixB + from * stride;
  5283. adrC := matrixC + from * IncC;
  5284. NEW( proc[i], matrixA, adrB, adrC, stride, IncC, StrideC,
  5285. RowsA, to0 - from, RowsB, add );
  5286. from := to0;
  5287. END;
  5288. FOR i := 0 TO nrProcesses - 1 DO proc[i].Wait(); END;
  5289. ELSE
  5290. MatMulHBlockR( matrixA, matrixB, matrixC, stride, IncC,
  5291. StrideC, RowsA, ColsB, RowsB, add );
  5292. END;
  5293. Toc( t, compT ); cachePool.Release( cacheA );
  5294. cachePool.Release( cacheB ); RETURN TRUE;
  5295. END MatMulARARTransposed;
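(* The copies above are only made when an operand is not already in the required
   16-byte aligned layout. For example, ColsA = 101 REALs gives
   stride = Align( 101 * 4, 16 ) = Align( 404, 16 ) = 416 bytes, so a matrix stored with
   StrideA = 404 (or with IncA # 4) is first repacked into the cache buffer. *)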
  5296. PROCEDURE MatMulAXAXTransposed( matrixA, matrixB, matrixC: ADDRESS;
  5297. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  5298. add: BOOLEAN ): BOOLEAN;
  5299. VAR stride: SIZE; adrB, adrC: ADDRESS;
  5300. proc: POINTER TO ARRAY OF MatMulHObjX;
  5301. from, to0, i: SIZE; cacheA, cacheB: Cache;
  5302. t: HUGEINT;
  5303. BEGIN
  5304. NEW(proc,nrProcesses);
  5305. ASSERT( ColsA = RowsB );
  5306. stride := Align( ColsA * SIZEOF( LONGREAL ), 16 );
  5307. IF (IncA # SIZEOF( LONGREAL )) OR (StrideA # stride) OR
  5308. (matrixA MOD 16 # 0) THEN
  5309. cacheA := cachePool.Acquire( stride * RowsA );
  5310. CopyDataX( matrixA, cacheA.adr, IncA, StrideA,
  5311. SIZEOF( LONGREAL ), stride, RowsA, ColsA ); (* copy to array *)
  5312. matrixA := cacheA.adr;
  5313. ELSE cacheA := NIL;
  5314. END;
  5315. IF (StrideB # SIZEOF( LONGREAL )) OR (IncB # stride) OR
  5316. (matrixB MOD 16 # 0) THEN
  5317. cacheB := cachePool.Acquire( stride * ColsB );
  5318. CopyDataX( matrixB, cacheB.adr, StrideB, IncB,
  5319. SIZEOF( LONGREAL ), stride, ColsB, RowsB ); (* copy to transposed array *)
  5320. matrixB := cacheB.adr;
  5321. ELSE cacheB := NIL;
  5322. END;
  5323. Tic( t );
  5324. IF nrProcesses > 1 THEN
  5325. from := 0;
  5326. FOR i := 0 TO nrProcesses - 1 DO
  5327. to0 := ColsB * (i + 1) DIV nrProcesses; (*! move to rows ? *)
  5328. adrB := matrixB + from * stride;
  5329. adrC := matrixC + from * IncC;
  5330. NEW( proc[i], matrixA, adrB, adrC, stride, IncC, StrideC,
  5331. RowsA, to0 - from, RowsB, add );
  5332. from := to0;
  5333. END;
  5334. FOR i := 0 TO nrProcesses - 1 DO proc[i].Wait(); END;
  5335. ELSE
  5336. MatMulHBlockX( matrixA, matrixB, matrixC, stride, IncC,
  5337. StrideC, RowsA, ColsB, RowsB, add );
  5338. END;
  5339. Toc( t, compT ); cachePool.Release( cacheA );
  5340. cachePool.Release( cacheB ); RETURN TRUE;
  5341. END MatMulAXAXTransposed;
5342. (****** strided matrix multiplication with restrictions on the increments ******)
  5343. PROCEDURE MatMulARARSSEStride( matrixA, matrixB, matrixC: ADDRESS;
  5344. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  5345. add: BOOLEAN ): BOOLEAN;
  5346. VAR sum: REAL; CbFrom, i, j, k: SIZE; valA, valB: REAL;
  5347. adrA, adrB, adrC: ADDRESS;
  5348. cacheA, cacheB, cacheC: Cache;
  5349. matrixCO, StrideCO, IncCO: SIZE; t: HUGEINT;
  5350. (*VAR fromA, toA: LONGINT; *)
  5351. BEGIN
  5352. IF (IncA # SIZEOF( REAL )) THEN
  5353. cacheA :=
  5354. cachePool.Acquire( RowsA * ColsA * SIZEOF( REAL ) );
  5355. CopyDataR( matrixA, cacheA.adr, IncA, StrideA,
  5356. SIZEOF( REAL ), SIZEOF( REAL ) * ColsA, RowsA,
  5357. ColsA );
  5358. matrixA := cacheA.adr; IncA := SIZEOF( REAL );
  5359. StrideA := SIZEOF( REAL ) * ColsA;
  5360. END;
  5361. IF (IncB # SIZEOF( REAL )) THEN
  5362. cacheB :=
  5363. cachePool.Acquire( RowsB * ColsB * SIZEOF( REAL ) );
  5364. CopyDataR( matrixB, cacheB.adr, IncB, StrideB,
  5365. SIZEOF( REAL ), SIZEOF( REAL ) * ColsB, RowsB,
  5366. ColsB );
  5367. matrixB := cacheB.adr; IncB := SIZEOF( REAL );
  5368. StrideB := SIZEOF( REAL ) * ColsB;
  5369. END;
  5370. IF (IncC # SIZEOF( REAL )) THEN
  5371. cacheC :=
  5372. cachePool.Acquire( RowsA * ColsB * SIZEOF( REAL ) );
  5373. CopyDataR( matrixC, cacheC.adr, IncC, StrideC,
  5374. SIZEOF( REAL ), SIZEOF( REAL ) * ColsB, RowsA,
  5375. ColsB );
  5376. matrixCO := matrixC; StrideCO := StrideC;
  5377. IncCO := IncC; matrixC := cacheC.adr;
  5378. IncC := SIZEOF( REAL ); StrideC := SIZEOF( REAL ) * ColsB;
  5379. END;
  5380. Tic( t );
  5381. CbFrom := 0;
  5382. IF ColsB >= 24 THEN
  5383. SSEMul24BlockR( CbFrom, StrideA, StrideB, StrideC,
  5384. ColsA, RowsA, ColsB, RowsB, matrixA,
  5385. matrixB, matrixC, add );
  5386. END;
  5387. IF ColsB - CbFrom >= 16 THEN
  5388. SSEMul16BlockR( StrideA, StrideB, StrideC, ColsA, RowsA,
  5389. CbFrom, matrixA, matrixB, matrixC, add );
  5390. INC( CbFrom, 16 );
  5391. END;
  5392. IF ColsB - CbFrom >= 8 THEN
  5393. SSEMul8BlockR( StrideA, StrideB, StrideC, ColsA, RowsA,
  5394. CbFrom, matrixA, matrixB, matrixC, add );
  5395. INC( CbFrom, 8 );
  5396. END;
  5397. IF ColsB - CbFrom >= 4 THEN
  5398. SSEMul4BlockR( StrideA, StrideB, StrideC, ColsA, RowsA,
  5399. CbFrom, matrixA, matrixB, matrixC, add );
  5400. INC( CbFrom, 4 );
  5401. END;
  5402. IF ColsB - CbFrom > 0 THEN
  5403. (* do it in Oberon *)
  5404. FOR i := 0 TO RowsA - 1 DO
  5405. adrC := matrixC + i * StrideC + CbFrom * IncC;
  5406. FOR j := CbFrom TO ColsB - 1 DO
  5407. adrA := matrixA + i * StrideA;
  5408. adrB := matrixB + j * IncB;
  5409. IF add THEN SYSTEM.GET( adrC, sum )
  5410. ELSE sum := 0
  5411. END;
  5412. FOR k := 0 TO RowsB - 1 DO
  5413. SYSTEM.GET( adrA, valA );
  5414. SYSTEM.GET( adrB, valB );
  5415. sum := sum + (* A[i, k] * B[k, j] *) valA * valB;
  5416. INC( adrA, IncA ); INC( adrB, StrideB );
  5417. END;
  5418. SYSTEM.PUT( adrC, sum ); INC( adrC, IncC );
  5419. (* C[i, j] := sum; *)
  5420. END;
  5421. END;
  5422. END;
  5423. Toc( t, compT );
  5424. IF cacheC # NIL THEN
  5425. CopyDataR( matrixC, matrixCO, IncC, StrideC, IncCO,
  5426. StrideCO, RowsA, ColsB );
  5427. END;
  5428. cachePool.Release( cacheA );
  5429. cachePool.Release( cacheB );
  5430. cachePool.Release( cacheC );
  5431. RETURN TRUE;
  5432. END MatMulARARSSEStride;
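(* Column decomposition in MatMulARARSSEStride, e.g. for ColsB = 30 (assuming
   SSEMul24BlockR advances CbFrom past the 24-wide blocks it handles): columns 0..23 are
   covered in a 24-wide block, the remaining 6 columns are narrower than 16 and 8, so
   SSEMul4BlockR covers columns 24..27, and the last two columns fall through to the
   scalar Oberon loop. The LONGREAL variant below proceeds the same way with widths
   12/8/4/2. *)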
  5433. PROCEDURE MatMulAXAXSSEStride( matrixA, matrixB, matrixC: ADDRESS;
  5434. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  5435. add: BOOLEAN ): BOOLEAN;
  5436. VAR sum: LONGREAL; CbFrom, i, j, k: SIZE;
  5437. valA, valB: LONGREAL; adrA, adrB, adrC: ADDRESS;
  5438. cacheA, cacheB, cacheC: Cache;
  5439. matrixCO, StrideCO, IncCO:SIZE; t: HUGEINT;
  5440. BEGIN
  5441. IF (IncA # SIZEOF( LONGREAL )) THEN
  5442. cacheA :=
  5443. cachePool.Acquire( RowsA * ColsA * SIZEOF( LONGREAL ) );
  5444. CopyDataX( matrixA, cacheA.adr, IncA, StrideA,
  5445. SIZEOF( LONGREAL ),
  5446. SIZEOF( LONGREAL ) * ColsA, RowsA, ColsA );
  5447. matrixA := cacheA.adr;
  5448. StrideA := SIZEOF( LONGREAL ) * ColsA;
  5449. IncA := SIZEOF( LONGREAL );
  5450. END;
  5451. IF (IncB # SIZEOF( LONGREAL )) THEN
  5452. cacheB :=
  5453. cachePool.Acquire( RowsB * ColsB * SIZEOF( LONGREAL ) );
  5454. CopyDataX( matrixB, cacheB.adr, IncB, StrideB,
  5455. SIZEOF( LONGREAL ),
  5456. SIZEOF( LONGREAL ) * ColsB, RowsB, ColsB );
  5457. matrixB := cacheB.adr;
  5458. StrideB := SIZEOF( LONGREAL ) * ColsB;
  5459. IncB := SIZEOF( LONGREAL );
  5460. END;
  5461. IF (IncC # SIZEOF( LONGREAL )) THEN
  5462. cacheC :=
  5463. cachePool.Acquire( RowsA * ColsB * SIZEOF( LONGREAL ) );
  5464. CopyDataX( matrixC, cacheC.adr, IncC, StrideC,
  5465. SIZEOF( LONGREAL ),
  5466. SIZEOF( LONGREAL ) * ColsB, RowsA, ColsB );
  5467. matrixCO := matrixC; StrideCO := StrideC;
  5468. IncCO := IncC; StrideC := SIZEOF( LONGREAL ) * ColsB;
  5469. IncC := SIZEOF( LONGREAL ); matrixC := cacheC.adr;
  5470. END;
  5471. Tic( t );
  5472. CbFrom := 0;
  5473. IF ColsB >= 12 THEN
  5474. SSEMul12BlockX( CbFrom, StrideA, StrideB, StrideC,
  5475. ColsA, RowsA, ColsB, RowsB, matrixA,
  5476. matrixB, matrixC, add );
  5477. END;
  5478. IF ColsB - CbFrom >= 8 THEN
  5479. SSEMul8BlockX( StrideA, StrideB, StrideC, ColsA, RowsA,
  5480. CbFrom, matrixA, matrixB, matrixC, add );
  5481. INC( CbFrom, 8 );
  5482. END;
  5483. IF ColsB - CbFrom >= 4 THEN
  5484. SSEMul4BlockX( StrideA, StrideB, StrideC, ColsA, RowsA,
  5485. CbFrom, matrixA, matrixB, matrixC, add );
  5486. INC( CbFrom, 4 );
  5487. END;
  5488. IF ColsB - CbFrom >= 2 THEN
  5489. SSEMul2BlockX( StrideA, StrideB, StrideC, ColsA, RowsA,
  5490. CbFrom, matrixA, matrixB, matrixC, add );
  5491. INC( CbFrom, 2 );
  5492. END;
  5493. IF ColsB - CbFrom > 0 THEN
  5494. (* do it in Oberon *)
  5495. FOR i := 0 TO RowsA - 1 DO
  5496. adrC := matrixC + i * StrideC + CbFrom * IncC;
  5497. FOR j := CbFrom TO ColsB - 1 DO
  5498. adrA := matrixA + i * StrideA;
  5499. adrB := matrixB + j * IncB;
  5500. IF add THEN SYSTEM.GET( adrC, sum )
  5501. ELSE sum := 0
  5502. END;
  5503. FOR k := 0 TO RowsB - 1 DO
  5504. SYSTEM.GET( adrA, valA );
  5505. SYSTEM.GET( adrB, valB );
  5506. sum := sum + (* A[i, k] * B[k, j] *) valA * valB;
  5507. INC( adrA, IncA ); INC( adrB, StrideB );
  5508. END;
  5509. SYSTEM.PUT( adrC, sum ); INC( adrC, IncC );
  5510. (* C[i, j] := sum; *)
  5511. END;
  5512. END;
  5513. END;
  5514. Toc( t, compT );
  5515. IF cacheC # NIL THEN
  5516. CopyDataX( matrixC, matrixCO, IncC, StrideC, IncCO,
  5517. StrideCO, RowsA, ColsB );
  5518. END;
  5519. cachePool.Release( cacheA );
  5520. cachePool.Release( cacheB );
  5521. cachePool.Release( cacheC );
  5522. RETURN TRUE;
  5523. END MatMulAXAXSSEStride;
5524. (****** naive Oberon matrix multiplication ******)
  5525. PROCEDURE MatMulARARNaiive( matrixA, matrixB, matrixC: ADDRESS;
  5526. IncA, StrideA, IncB, StrideB, IncC, StrideC, M, N, K: SIZE;
  5527. add: BOOLEAN );
  5528. (*
  5529. A is M x K matrix, M=rows (A); K=cols(A);
  5530. B is K x N matrix; K=rows(B); N = cols(B);
  5531. C is M x N matrix; M=rows(C); N=cols(C);
  5532. *)
  5533. VAR adrA, adrB, innerB, adrC: ADDRESS; i, j, k: SIZE;
  5534. val1, val2, sum: REAL; t: HUGEINT;
  5535. BEGIN
  5536. Tic( t );
  5537. FOR i := 1 TO M DO
  5538. adrC := matrixC; adrB := matrixB;
  5539. FOR j := 1 TO N DO
  5540. adrA := matrixA; innerB := adrB;
  5541. IF add THEN SYSTEM.GET( adrC, sum ) ELSE sum := 0 END;
  5542. FOR k := 1 TO K DO
  5543. SYSTEM.GET( adrA, val1 );
  5544. SYSTEM.GET( innerB, val2 );
  5545. sum := sum + val1 * val2; INC( adrA, IncA );
  5546. INC( innerB, StrideB );
  5547. END;
  5548. SYSTEM.PUT( adrC, sum ); INC( adrB, IncB );
  5549. INC( adrC, IncC );
  5550. END;
  5551. INC( matrixA, StrideA ); INC( matrixC, StrideC );
  5552. END;
  5553. Toc( t, compT );
  5554. END MatMulARARNaiive;
  5555. PROCEDURE MatMulAXAXNaiive( matrixA, matrixB, matrixC: ADDRESS;
  5556. IncA, StrideA, IncB, StrideB, IncC, StrideC, M, N, K: SIZE;
  5557. add: BOOLEAN );
  5558. (*
  5559. A is M x K matrix, M=rows (A); K=cols(A);
  5560. B is K x N matrix; K=rows(B); N = cols(B);
  5561. C is M x N matrix; M=rows(C); N=cols(C);
  5562. *)
  5563. VAR adrA, adrB, innerB, adrC: ADDRESS; i, j, k: SIZE;
  5564. val1, val2, sum: LONGREAL; t: HUGEINT;
  5565. BEGIN
  5566. Tic( t );
  5567. FOR i := 1 TO M DO
  5568. adrC := matrixC; adrB := matrixB;
  5569. FOR j := 1 TO N DO
  5570. adrA := matrixA; innerB := adrB;
  5571. IF add THEN SYSTEM.GET( adrC, sum ) ELSE sum := 0 END;
  5572. FOR k := 1 TO K DO
  5573. SYSTEM.GET( adrA, val1 );
  5574. SYSTEM.GET( innerB, val2 );
  5575. sum := sum + val1 * val2; INC( adrA, IncA );
  5576. INC( innerB, StrideB );
  5577. END;
  5578. SYSTEM.PUT( adrC, sum ); INC( adrB, IncB );
  5579. INC( adrC, IncC );
  5580. END;
  5581. INC( matrixA, StrideA ); INC( matrixC, StrideC );
  5582. END;
  5583. Toc( t, compT );
  5584. END MatMulAXAXNaiive;
  5585. (*
  5586. PROCEDURE Toggle( VAR A, B: LONGINT );
  5587. VAR temp: LONGINT;
  5588. BEGIN
  5589. temp := A; A := B; B := temp;
  5590. END Toggle;
  5591. PROCEDURE Transpose( VAR matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, M, N, K: LONGINT );
  5592. (*
  5593. prepare computation of C=A*B via C = (B` * A`)`
  5594. *)
  5595. BEGIN
  5596. Toggle( matrixA, matrixB ); Toggle( IncA, StrideB ); Toggle( StrideA, IncB );
  5597. Toggle( IncC, StrideC ); Toggle( M, N );
  5598. END Transpose;
  5599. *)
  5600. (*
  5601. *)
  5602. PROCEDURE BestMethod( M, N, K: SIZE ): LONGINT;
  5603. BEGIN
  5604. IF M = 1 THEN
  5605. IF N < 32 THEN RETURN cMatMulScalarProduct
  5606. ELSIF N < 256 THEN
  5607. IF K < 256 THEN RETURN cMatMulScalarProduct
  5608. ELSE RETURN cMatMulStride
  5609. END;
  5610. ELSE RETURN cMatMulStride
  5611. END;
  5612. ELSIF N = 1 THEN
  5613. IF (M > 1024) & (K > 1024) THEN
  5614. RETURN cMatMulTransposed
  5615. ELSE RETURN cMatMulScalarProduct
  5616. END;
  5617. ELSIF K = 1 THEN
  5618. IF N < 32 THEN
  5619. IF M < 256 THEN RETURN cMatMulNaive
  5620. ELSE RETURN cMatMulStride
  5621. END;
  5622. ELSIF N < 256 THEN
  5623. IF M < 32 THEN RETURN cMatMulNaive
  5624. ELSE RETURN cMatMulStride
  5625. END;
  5626. ELSE RETURN cMatMulStride
  5627. END;
  5628. ELSIF M < 32 THEN
  5629. IF N < 32 THEN RETURN cMatMulScalarProduct
  5630. ELSIF N < 256 THEN
  5631. IF K < 32 THEN RETURN cMatMulScalarProduct
  5632. ELSE RETURN cMatMulStride
  5633. END;
  5634. ELSE RETURN cMatMulStride
  5635. END;
  5636. ELSIF M < 256 THEN
  5637. IF N < 32 THEN
  5638. IF K < 32 THEN RETURN cMatMulScalarProduct
  5639. ELSE RETURN cMatMulStride
  5640. END;
  5641. ELSE
  5642. IF K < 256 THEN RETURN cMatMulStride
  5643. ELSE RETURN cMatMulBlocked
  5644. END;
  5645. END;
  5646. ELSE
  5647. IF N < 32 THEN RETURN cMatMulStride ELSE
  5648. IF K < 256 THEN RETURN cMatMulStride
  5649. ELSE RETURN cMatMulBlocked
  5650. END;
  5651. END;
  5652. END;
  5653. RETURN cMatMulStride;
  5654. END BestMethod;
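(* A few sample decisions of BestMethod, read directly from the cases above:
   BestMethod( 1, 1000, 1000 ) = cMatMulStride (single row, wide B);
   BestMethod( 100, 10, 10 ) = cMatMulScalarProduct (small N and K);
   BestMethod( 512, 512, 512 ) = cMatMulBlocked (all dimensions large). *)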
5655. (*
5656.            (N)                (K)               (N)
5657.          CCCCCC             AAAAA             BBBBB
5658.          CCCCCC             AAAAA             BBBBB
5659.    (M)   CCCCCC   =   (M)   AAAAA   *   (K)   BBBBB
5660.          CCCCCC             AAAAA             BBBBB
5661.          CCCCCC             AAAAA             BBBBB
5662. *)
  5663. PROCEDURE MatMulR( matrixA, matrixB, matrixC: ADDRESS;
  5664. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
5665. (*! the heuristics for the choice of method need improvement *)
5666. (*! transpose if that is superior *)
5667. (*! provide a special variant for small [up to 4x4] matrices *)
  5668. VAR M, N, K: SIZE;
  5669. BEGIN
  5670. ASSERT( ColsA = RowsB );
  5671. M := RowsA; N := ColsB; K := ColsA;
  5672. CASE BestMethod( M, N, K ) OF
  5673. | cMatMulScalarProduct:
  5674. RETURN FALSE;
  5675. | cMatMulNaive:
  5676. RETURN MatMulRNaive( matrixA, matrixB, matrixC, IncA,
  5677. StrideA, IncB, StrideB, IncC,
  5678. StrideC, RowsA, ColsA, RowsB,
  5679. ColsB );
  5680. | cMatMulTransposed:
  5681. RETURN MatMulARARTransposed( matrixA, matrixB,
  5682. matrixC, IncA,
  5683. StrideA, IncB,
  5684. StrideB, IncC,
  5685. StrideC, RowsA,
  5686. ColsA, RowsB,
  5687. ColsB, FALSE );
  5688. | cMatMulStride:
  5689. RETURN MatMulARARSSEStride( matrixA, matrixB,
  5690. matrixC, IncA, StrideA,
  5691. IncB, StrideB, IncC,
  5692. StrideC, RowsA,
  5693. ColsA, RowsB, ColsB,
  5694. FALSE );
  5695. | cMatMulBlocked:
  5696. RETURN MatMulARARBlocked( matrixA, matrixB,
  5697. matrixC, IncA, StrideA,
  5698. IncB, StrideB, IncC,
  5699. StrideC, RowsA, ColsA,
  5700. RowsB, ColsB, FALSE );
  5701. ELSE
  5702. RETURN FALSE (* use scalar product for each row and column *)
  5703. END;
  5704. END MatMulR;
  5705. PROCEDURE MatMulX( matrixA, matrixB, matrixC: ADDRESS;
  5706. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5707. VAR M, N, K: SIZE;
  5708. BEGIN
  5709. ASSERT( ColsA = RowsB ); M := RowsA; N := ColsB;
  5710. K := ColsA;
  5711. (*
  5712. KernelLog.String("MatMulX, M,N,K = "); KernelLog.Int(M,10); KernelLog.Int(N,10); KernelLog.Int(K,10); KernelLog.Ln;
  5713. KernelLog.String("Method= "); KernelLog.Int( BestMethod(M,N,K),10); KernelLog.Ln;
  5714. *)
  5715. CASE BestMethod( M, N, K ) OF
  5716. | cMatMulScalarProduct:
  5717. RETURN FALSE;
  5718. | cMatMulNaive:
  5719. RETURN MatMulXNaive( matrixA, matrixB, matrixC, IncA,
  5720. StrideA, IncB, StrideB, IncC,
  5721. StrideC, RowsA, ColsA, RowsB,
  5722. ColsB );
  5723. | cMatMulTransposed:
  5724. RETURN MatMulAXAXTransposed( matrixA, matrixB,
  5725. matrixC, IncA,
  5726. StrideA, IncB, StrideB,
  5727. IncC, StrideC, RowsA,
  5728. ColsA, RowsB, ColsB,
  5729. FALSE );
  5730. | cMatMulStride:
  5731. RETURN MatMulAXAXSSEStride( matrixA, matrixB,
  5732. matrixC, IncA, StrideA,
  5733. IncB, StrideB, IncC,
  5734. StrideC, RowsA, ColsA,
  5735. RowsB, ColsB,
  5736. FALSE );
  5737. | cMatMulBlocked:
  5738. RETURN MatMulAXAXBlocked( matrixA, matrixB,
  5739. matrixC, IncA, StrideA,
  5740. IncB, StrideB, IncC,
  5741. StrideC, RowsA, ColsA,
  5742. RowsB, ColsB, FALSE );
  5743. ELSE
  5744. RETURN FALSE (* use scalar product for each row and column *)
  5745. END;
  5746. END MatMulX;
  5747. PROCEDURE MatMulIncR( matrixA, matrixB, matrixC: ADDRESS;
  5748. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
5749. (*! the heuristics for the choice of method need improvement *)
5750. (*! transpose if that is superior *)
5751. (*! provide a special variant for small [up to 4x4] matrices *)
  5752. VAR M, N, K: SIZE;
  5753. BEGIN
  5754. ASSERT( ColsA = RowsB ); M := RowsA; N := ColsB;
  5755. K := ColsA;
  5756. CASE BestMethod( M, N, K ) OF
  5757. | cMatMulScalarProduct:
  5758. RETURN FALSE;
  5759. | cMatMulNaive:
  5760. RETURN MatMulIncRNaive( matrixA, matrixB, matrixC,
  5761. IncA, StrideA, IncB, StrideB,
  5762. IncC, StrideC, RowsA, ColsA,
  5763. RowsB, ColsB );
  5764. | cMatMulTransposed:
  5765. RETURN MatMulARARTransposed( matrixA, matrixB,
  5766. matrixC, IncA,
  5767. StrideA, IncB,
  5768. StrideB, IncC,
  5769. StrideC, RowsA,
  5770. ColsA, RowsB,
  5771. ColsB, TRUE );
  5772. | cMatMulStride:
  5773. RETURN MatMulARARSSEStride( matrixA, matrixB,
  5774. matrixC, IncA, StrideA,
  5775. IncB, StrideB, IncC,
  5776. StrideC, RowsA,
  5777. ColsA, RowsB, ColsB,
  5778. TRUE );
  5779. | cMatMulBlocked:
  5780. RETURN MatMulARARBlocked( matrixA, matrixB,
  5781. matrixC, IncA, StrideA,
  5782. IncB, StrideB, IncC,
  5783. StrideC, RowsA, ColsA,
  5784. RowsB, ColsB, TRUE );
  5785. ELSE
  5786. RETURN FALSE (* use scalar product for each row and column *)
  5787. END;
  5788. END MatMulIncR;
  5789. PROCEDURE MatMulIncX( matrixA, matrixB, matrixC: ADDRESS;
  5790. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5791. VAR M, N, K: SIZE;
  5792. BEGIN
  5793. ASSERT( ColsA = RowsB ); M := RowsA; N := ColsB;
  5794. K := ColsA;
  5795. CASE BestMethod( M, N, K ) OF
  5796. | cMatMulScalarProduct:
  5797. RETURN FALSE;
  5798. | cMatMulNaive:
  5799. RETURN MatMulIncXNaive( matrixA, matrixB, matrixC,
  5800. IncA, StrideA, IncB, StrideB,
  5801. IncC, StrideC, RowsA, ColsA,
  5802. RowsB, ColsB );
  5803. | cMatMulTransposed:
  5804. RETURN MatMulAXAXTransposed( matrixA, matrixB,
  5805. matrixC, IncA,
  5806. StrideA, IncB, StrideB,
  5807. IncC, StrideC, RowsA,
  5808. ColsA, RowsB, ColsB,
  5809. TRUE );
  5810. | cMatMulStride:
  5811. RETURN MatMulAXAXSSEStride( matrixA, matrixB,
  5812. matrixC, IncA, StrideA,
  5813. IncB, StrideB, IncC,
  5814. StrideC, RowsA, ColsA,
  5815. RowsB, ColsB, TRUE );
  5816. | cMatMulBlocked:
  5817. RETURN MatMulAXAXBlocked( matrixA, matrixB,
  5818. matrixC, IncA, StrideA,
  5819. IncB, StrideB, IncC,
  5820. StrideC, RowsA, ColsA,
  5821. RowsB, ColsB, TRUE );
  5822. ELSE
  5823. RETURN FALSE (* use scalar product for each row and column *)
  5824. END;
  5825. END MatMulIncX;
  5826. PROCEDURE MatMulARARBlocked( matrixA, matrixB, matrixC: ADDRESS;
  5827. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  5828. add: BOOLEAN ): BOOLEAN;
  5829. VAR M, N, K, L2M, L2N, L2K: SIZE;
  5830. BEGIN
  5831. ASSERT( ColsA = RowsB ); M := RowsA; N := ColsB;
  5832. K := ColsA; MagicBlockR( M, N, K, L2M, L2N, L2K );
  5833. (*
  5834. MatMulARARNaiive( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB,
  5835. IncC, StrideC, RowsA, ColsB, ColsA );
  5836. *)
  5837. MultiplyR( matrixA, matrixB, matrixC, RowsA, ColsB, ColsA,
  5838. L2M, L2N, L2K, IncA, StrideA, IncB, StrideB, IncC,
  5839. StrideC, add );
  5840. RETURN TRUE;
  5841. END MatMulARARBlocked;
  5842. PROCEDURE MatMulAXAXBlocked( matrixA, matrixB, matrixC: ADDRESS;
  5843. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  5844. add: BOOLEAN ): BOOLEAN;
  5845. VAR M, N, K, L2M, L2N, L2K: SIZE;
  5846. BEGIN
  5847. ASSERT( ColsA = RowsB ); M := RowsA; N := ColsB;
  5848. K := ColsA; MagicBlockX( M, N, K, L2M, L2N, L2K );
  5849. (*
  5850. MatMulARARNaiive( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB,
  5851. IncC, StrideC, RowsA, ColsB, ColsA );
  5852. *)
  5853. MultiplyX( matrixA, matrixB, matrixC, RowsA, ColsB, ColsA,
  5854. L2M, L2N, L2K, IncA, StrideA, IncB, StrideB, IncC,
  5855. StrideC, add );
  5856. RETURN TRUE;
  5857. END MatMulAXAXBlocked;

PROCEDURE MatMulRNaive( matrixA, matrixB, matrixC: ADDRESS;
IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
BEGIN
MatMulARARNaiive( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsB, ColsA, FALSE );
RETURN TRUE;
END MatMulRNaive;

PROCEDURE MatMulXNaive( matrixA, matrixB, matrixC: ADDRESS;
IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
BEGIN
MatMulAXAXNaiive( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsB, ColsA, FALSE );
RETURN TRUE;
END MatMulXNaive;

PROCEDURE MatMulIncRNaive( matrixA, matrixB, matrixC: ADDRESS;
IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
BEGIN
MatMulARARNaiive( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsB, ColsA, TRUE );
RETURN TRUE;
END MatMulIncRNaive;

PROCEDURE MatMulIncXNaive( matrixA, matrixB, matrixC: ADDRESS;
IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
BEGIN
MatMulAXAXNaiive( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsB, ColsA, TRUE );
RETURN TRUE;
END MatMulIncXNaive;

PROCEDURE MatMulXTransposed( matrixA, matrixB, matrixC: ADDRESS;
IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
BEGIN
RETURN MatMulAXAXTransposed( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB, FALSE );
END MatMulXTransposed;

PROCEDURE MatMulIncXTransposed( matrixA, matrixB, matrixC: ADDRESS;
IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
BEGIN
RETURN MatMulAXAXTransposed( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB, TRUE )
END MatMulIncXTransposed;

PROCEDURE MatMulRTransposed( matrixA, matrixB, matrixC: ADDRESS;
IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
BEGIN
RETURN MatMulARARTransposed( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB, FALSE );
END MatMulRTransposed;

PROCEDURE MatMulIncRTransposed( matrixA, matrixB, matrixC: ADDRESS;
IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
BEGIN
RETURN MatMulARARTransposed( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB, TRUE )
END MatMulIncRTransposed;

PROCEDURE MatMulXSSEStride( matrixA, matrixB, matrixC: ADDRESS;
IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
BEGIN
RETURN MatMulAXAXSSEStride( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB, FALSE );
END MatMulXSSEStride;

PROCEDURE MatMulIncXSSEStride( matrixA, matrixB, matrixC: ADDRESS;
IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
BEGIN
RETURN MatMulAXAXSSEStride( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB, TRUE );
END MatMulIncXSSEStride;

PROCEDURE MatMulRSSEStride( matrixA, matrixB, matrixC: ADDRESS;
IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
BEGIN
RETURN MatMulARARSSEStride( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB, FALSE );
END MatMulRSSEStride;

PROCEDURE MatMulIncRSSEStride( matrixA, matrixB, matrixC: ADDRESS;
IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
BEGIN
RETURN MatMulARARSSEStride( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB, TRUE )
END MatMulIncRSSEStride;

PROCEDURE MatMulRBlocked( matrixA, matrixB, matrixC: ADDRESS;
IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
BEGIN
RETURN MatMulARARBlocked( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB, FALSE )
END MatMulRBlocked;

PROCEDURE MatMulIncRBlocked( matrixA, matrixB, matrixC: ADDRESS;
IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
BEGIN
RETURN MatMulARARBlocked( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB, TRUE )
END MatMulIncRBlocked;

PROCEDURE MatMulXBlocked( matrixA, matrixB, matrixC: ADDRESS;
IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
BEGIN
RETURN MatMulAXAXBlocked( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB, FALSE )
END MatMulXBlocked;

PROCEDURE MatMulIncXBlocked( matrixA, matrixB, matrixC: ADDRESS;
IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
BEGIN
RETURN MatMulAXAXBlocked( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB, TRUE )
END MatMulIncXBlocked;

PROCEDURE SetMatMulMethod*( i: LONGINT );
BEGIN
KernelLog.String("ArrayBaseOptimized, method = ");
IF i = cMatMulDynamic THEN
KernelLog.String("dynamic.");
ArrayBase.matMulIncR := MatMulIncR;
ArrayBase.matMulIncX := MatMulIncX;
ArrayBase.matMulR := MatMulR;
ArrayBase.matMulX := MatMulX;
ELSIF i = cMatMulScalarProduct THEN
KernelLog.String("scalarproduct.");
ArrayBase.matMulIncR := NIL;
ArrayBase.matMulIncX := NIL;
ArrayBase.matMulR := NIL; ArrayBase.matMulX := NIL;
ELSIF i = cMatMulNaive THEN
KernelLog.String("naive.");
ArrayBase.matMulR := MatMulRNaive;
ArrayBase.matMulX := MatMulXNaive;
ArrayBase.matMulIncR := MatMulIncRNaive;
ArrayBase.matMulIncX := MatMulIncXNaive;
ELSIF i = cMatMulTransposed THEN
KernelLog.String("transposed.");
ArrayBase.matMulR := MatMulRTransposed;
ArrayBase.matMulX := MatMulXTransposed; (* KernelLog.String( "transposed" ); KernelLog.Ln; *)
ArrayBase.matMulIncR := MatMulIncRTransposed;
ArrayBase.matMulIncX := MatMulIncXTransposed; (* KernelLog.String( "transposed" ); KernelLog.Ln; *)
ELSIF i = cMatMulStride THEN
KernelLog.String("stride.");
ArrayBase.matMulR := MatMulRSSEStride;
ArrayBase.matMulX := MatMulXSSEStride; (* KernelLog.String( "stride" ); KernelLog.Ln; *)
ArrayBase.matMulIncR := MatMulIncRSSEStride;
ArrayBase.matMulIncX := MatMulIncXSSEStride; (* KernelLog.String( "stride" ); KernelLog.Ln; *)
ELSIF i = cMatMulBlocked THEN
KernelLog.String("blocked.");
ArrayBase.matMulR := MatMulRBlocked;
ArrayBase.matMulX := MatMulXBlocked; (* KernelLog.String( "blocked" ); KernelLog.Ln; *)
ArrayBase.matMulIncR := MatMulIncRBlocked;
ArrayBase.matMulIncX := MatMulIncXBlocked; (* KernelLog.String( "blocked" ); KernelLog.Ln; *)
END;
KernelLog.Ln;
END SetMatMulMethod;

(* optimizations for small arrays (Alexey Morozov) *)
(* assumes that all arrays do not overlap *)
(* FIXME: use MOVAPS when Felix fixes problems with alignment!!! *)
PROCEDURE MatMulR2x2(dadr, ladr, radr: ADDRESS);
CODE{SYSTEM.AMD64, SYSTEM.SSE2}
MOV RBX, [RBP+radr] ; RBX := ADDR(right)
MOV RAX, [RBP+ladr] ; RAX := ADDR(left)
MOV RCX, [RBP+dadr] ; RCX := ADDR(dest)
MOVUPS XMM0, [RAX] ; [a00,a01,a10,a11]
MOVUPS XMM1, [RBX] ; [b00,b01,b10,b11]
MOVAPS XMM2, XMM1
SHUFPS XMM2, XMM1, 204 ; XMM2 := [b00,b11,b00,b11]
MULPS XMM2, XMM0
SHUFPS XMM0, XMM0, 177 ; XMM0 := [a01,a00,a11,a10]
SHUFPS XMM1, XMM1, 102 ; XMM1 := [b10,b01,b10,b01]
MULPS XMM1, XMM0
ADDPS XMM1, XMM2
MOVUPS [RCX], XMM1
END MatMulR2x2;
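
(*
Reference for MatMulR2x2 above (documentation only, not installed): a scalar sketch,
assuming contiguous, row-major 2x2 REAL matrices at dadr, ladr, radr:
  c00 := a00*b00 + a01*b10;  c01 := a00*b01 + a01*b11;
  c10 := a10*b00 + a11*b10;  c11 := a10*b01 + a11*b11;
The SSE code computes exactly these four sums: the first SHUFPS/MULPS pair yields
[a00*b00, a01*b11, a10*b00, a11*b11], the second yields [a01*b10, a00*b01, a11*b10, a10*b01],
and the final ADDPS combines them into the row-major result written to [RCX].
*)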

(* based on weighted sum of rows (Alexey Morozov) *)
(* FIXME: use MOVAPS when Felix fixes problems with alignment!!! *)
PROCEDURE MatMulR3x3(dadr, ladr, radr: ADDRESS);
CODE{SYSTEM.AMD64, SYSTEM.SSE2}
MOV RBX, [RBP+radr] ; RBX := ADDR(right)
MOV RAX, [RBP+ladr] ; RAX := ADDR(left)
MOV RCX, [RBP+dadr] ; RCX := ADDR(dest)
MOVUPS XMM0, [RBX] ; XMM0 := [b00,b01,b02,-]
MOVUPS XMM1, [RBX+12] ; XMM1 := [b10,b11,b12,-]
; last element is out of range, is it still OK?
MOVUPS XMM2, [RBX+24] ; XMM2 := [b20,b21,b22,-]
;MOVLPS XMM2, [RBX+24]
;MOVSS XMM3, [RBX+32]
;MOVLHPS XMM2, XMM3
MOVSS XMM3, [RAX]
SHUFPS XMM3, XMM3, 0; XMM3 := [a00,a00,a00,-]
MOVAPS XMM4, XMM0
MULPS XMM4, XMM3
MOVSS XMM3, [RAX+4]
SHUFPS XMM3, XMM3, 0; XMM3 := [a01,a01,a01,-]
MULPS XMM3, XMM1
ADDPS XMM4, XMM3
MOVSS XMM3, [RAX+8]
SHUFPS XMM3, XMM3, 0; XMM3 := [a02,a02,a02,-]
MULPS XMM3, XMM2
ADDPS XMM4, XMM3
MOVUPS [RCX], XMM4
;***************************************************;
MOVSS XMM3, [RAX+12]
SHUFPS XMM3, XMM3, 0; XMM3 := [a10,a10,a10,-]
MOVAPS XMM4, XMM0
MULPS XMM4, XMM3
MOVSS XMM3, [RAX+16]
SHUFPS XMM3, XMM3, 0; XMM3 := [a11,a11,a11,-]
MULPS XMM3, XMM1
ADDPS XMM4, XMM3
MOVSS XMM3, [RAX+20]
SHUFPS XMM3, XMM3, 0; XMM3 := [a12,a12,a12,-]
MULPS XMM3, XMM2
ADDPS XMM4, XMM3
MOVUPS [RCX+12], XMM4
;***************************************************;
MOVSS XMM3, [RAX+24]
SHUFPS XMM3, XMM3, 0; XMM3 := [a20,a20,a20,-]
MOVAPS XMM4, XMM0
MULPS XMM4, XMM3
MOVSS XMM3, [RAX+28]
SHUFPS XMM3, XMM3, 0; XMM3 := [a21,a21,a21,-]
MULPS XMM3, XMM1
ADDPS XMM4, XMM3
MOVSS XMM3, [RAX+32]
SHUFPS XMM3, XMM3, 0; XMM3 := [a22,a22,a22,-]
MULPS XMM3, XMM2
ADDPS XMM4, XMM3
;MOVUPS [RCX+24], XMM4
MOVLPS [RCX+24], XMM4
MOVHLPS XMM4, XMM4
MOVSS [RCX+32], XMM4
END MatMulR3x3;
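
(*
Reference for MatMulR3x3 above (documentation only): the "weighted sum of rows" scheme,
assuming contiguous, row-major 3x3 REAL matrices, computes each result row as a linear
combination of the rows of B:
  row i of C = a(i,0)*row 0 of B + a(i,1)*row 1 of B + a(i,2)*row 2 of B,  i = 0..2.
Each SHUFPS xmm, xmm, 0 broadcasts one a(i,k) into all lanes, so one MULPS/ADDPS pair
accumulates a(i,k)*row k of B for a whole output row.
*)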

(* based on 2x2 block decomposition, reusing the MatMulR2x2 kernel (Alexey Morozov) *)
(* FIXME: use MOVAPS when Felix fixes issues with alignment!!! *)
PROCEDURE MatMulR4x4(dadr, ladr, radr: ADDRESS);
CODE{SYSTEM.AMD64, SYSTEM.SSE2}
MOV RBX, [RBP+radr] ; RBX := ADDR(right)
MOV RAX, [RBP+ladr] ; RAX := ADDR(left)
MOV RCX, [RBP+dadr] ; RCX := ADDR(dest)
; load A00
MOVLPS XMM0, [RAX] ; XMM0 := [a00,a01,-,-]
MOVHPS XMM0, [RAX+16] ; XMM0 := [a00,a01,a10,a11]
; load A01
MOVLPS XMM1, [RAX+8] ; XMM1 := [a02,a03,-,-]
MOVHPS XMM1, [RAX+24] ; XMM1 := [a02,a03,a12,a13]
; load B00
MOVLPS XMM2, [RBX] ; XMM2 := [b00,b01,-,-]
MOVHPS XMM2, [RBX+16] ; XMM2 := [b00,b01,b10,b11]
; load B01
MOVLPS XMM3, [RBX+8] ; XMM3 := [b02,b03,-,-]
MOVHPS XMM3, [RBX+24] ; XMM3 := [b02,b03,b12,b13]
; load B10
MOVLPS XMM4, [RBX+32] ; XMM4 := [b20,b21,-,-]
MOVHPS XMM4, [RBX+48] ; XMM4 := [b20,b21,b30,b31]
; load B11
MOVLPS XMM5, [RBX+40] ; XMM5 := [b22,b23,-,-]
MOVHPS XMM5, [RBX+56] ; XMM5 := [b22,b23,b32,b33]
;****************************************************;
; multiply A00(D)*B00(E) (use MatMulR2x2 code)
MOVAPS XMM6, XMM2
SHUFPS XMM6, XMM6, 204 ; XMM6 := [e00,e11,e00,e11]
MULPS XMM6, XMM0
SHUFPS XMM0, XMM0, 177 ; XMM0 := [d01,d00,d11,d10]
MOVAPS XMM7, XMM2
SHUFPS XMM7, XMM7, 102 ; XMM7 := [e10,e01,e10,e01]
MULPS XMM7, XMM0
ADDPS XMM7, XMM6
; multiply A01(D)*B10(E)
MOVAPS XMM0, XMM4
SHUFPS XMM0, XMM0, 204 ; XMM0 := [e00,e11,e00,e11]
MULPS XMM0, XMM1
SHUFPS XMM1, XMM1, 177 ; XMM1 := [d01,d00,d11,d10]
MOVAPS XMM6, XMM4
SHUFPS XMM6, XMM6, 102 ; XMM6 := [e10,e01,e10,e01]
MULPS XMM6, XMM1
ADDPS XMM6, XMM0
ADDPS XMM7, XMM6
MOVLPS [RCX], XMM7
MOVHPS [RCX+16], XMM7
;****************************************************;
; load A00
MOVLPS XMM0, [RAX] ; XMM0 := [a00,a01,-,-]
MOVHPS XMM0, [RAX+16] ; XMM0 := [a00,a01,a10,a11]
; load A01
MOVLPS XMM1, [RAX+8] ; XMM1 := [a02,a03,-,-]
MOVHPS XMM1, [RAX+24] ; XMM1 := [a02,a03,a12,a13]
; multiply A00(D)*B01(E) (use MatMulR2x2 code)
MOVAPS XMM6, XMM3
SHUFPS XMM6, XMM6, 204 ; XMM6 := [e00,e11,e00,e11]
MULPS XMM6, XMM0
SHUFPS XMM0, XMM0, 177 ; XMM0 := [d01,d00,d11,d10]
MOVAPS XMM7, XMM3
SHUFPS XMM7, XMM7, 102 ; XMM7 := [e10,e01,e10,e01]
MULPS XMM7, XMM0
ADDPS XMM7, XMM6
; multiply A01(D)*B11(E)
MOVAPS XMM0, XMM5
SHUFPS XMM0, XMM0, 204 ; XMM0 := [e00,e11,e00,e11]
MULPS XMM0, XMM1
SHUFPS XMM1, XMM1, 177 ; XMM1 := [d01,d00,d11,d10]
MOVAPS XMM6, XMM5
SHUFPS XMM6, XMM6, 102 ; XMM6 := [e10,e01,e10,e01]
MULPS XMM6, XMM1
ADDPS XMM6, XMM0
ADDPS XMM7, XMM6
MOVLPS [RCX+8], XMM7
MOVHPS [RCX+24], XMM7
;****************************************************;
; load A10
MOVLPS XMM0, [RAX+32] ; XMM0 := [a20,a21,-,-]
MOVHPS XMM0, [RAX+48] ; XMM0 := [a20,a21,a30,a31]
; load A11
MOVLPS XMM1, [RAX+40] ; XMM1 := [a22,a23,-,-]
MOVHPS XMM1, [RAX+56] ; XMM1 := [a22,a23,a32,a33]
; multiply A10(D)*B00(E) (use MatMulR2x2 code)
MOVAPS XMM6, XMM2
SHUFPS XMM6, XMM6, 204 ; XMM6 := [e00,e11,e00,e11]
MULPS XMM6, XMM0
SHUFPS XMM0, XMM0, 177 ; XMM0 := [d01,d00,d11,d10]
MOVAPS XMM7, XMM2
SHUFPS XMM7, XMM7, 102 ; XMM7 := [e10,e01,e10,e01]
MULPS XMM7, XMM0
ADDPS XMM7, XMM6
; multiply A11(D)*B10(E)
MOVAPS XMM0, XMM4
SHUFPS XMM0, XMM0, 204 ; XMM0 := [e00,e11,e00,e11]
MULPS XMM0, XMM1
SHUFPS XMM1, XMM1, 177 ; XMM1 := [d01,d00,d11,d10]
MOVAPS XMM6, XMM4
SHUFPS XMM6, XMM6, 102 ; XMM6 := [e10,e01,e10,e01]
MULPS XMM6, XMM1
ADDPS XMM6, XMM0
ADDPS XMM7, XMM6
MOVLPS [RCX+32], XMM7
MOVHPS [RCX+48], XMM7
;****************************************************;
; load A10
MOVLPS XMM0, [RAX+32] ; XMM0 := [a20,a21,-,-]
MOVHPS XMM0, [RAX+48] ; XMM0 := [a20,a21,a30,a31]
; load A11
MOVLPS XMM1, [RAX+40] ; XMM1 := [a22,a23,-,-]
MOVHPS XMM1, [RAX+56] ; XMM1 := [a22,a23,a32,a33]
; multiply A10(D)*B01(E) (use MatMulR2x2 code)
MOVAPS XMM6, XMM3
SHUFPS XMM6, XMM6, 204 ; XMM6 := [e00,e11,e00,e11]
MULPS XMM6, XMM0
SHUFPS XMM0, XMM0, 177 ; XMM0 := [d01,d00,d11,d10]
MOVAPS XMM7, XMM3
SHUFPS XMM7, XMM7, 102 ; XMM7 := [e10,e01,e10,e01]
MULPS XMM7, XMM0
ADDPS XMM7, XMM6
; multiply A11(D)*B11(E)
MOVAPS XMM0, XMM5
SHUFPS XMM0, XMM0, 204 ; XMM0 := [e00,e11,e00,e11]
MULPS XMM0, XMM1
SHUFPS XMM1, XMM1, 177 ; XMM1 := [d01,d00,d11,d10]
MOVAPS XMM6, XMM5
SHUFPS XMM6, XMM6, 102 ; XMM6 := [e10,e01,e10,e01]
MULPS XMM6, XMM1
ADDPS XMM6, XMM0
ADDPS XMM7, XMM6
MOVLPS [RCX+40], XMM7
MOVHPS [RCX+56], XMM7
END MatMulR4x4;
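
(*
Reference for MatMulR4x4 above (documentation only): with the 4x4 operands partitioned into
2x2 blocks (assuming contiguous, row-major REAL storage), the result is
  C00 = A00*B00 + A01*B10;  C01 = A00*B01 + A01*B11;
  C10 = A10*B00 + A11*B10;  C11 = A10*B01 + A11*B11.
Each of the eight 2x2 block products reuses the shuffle/multiply pattern of MatMulR2x2;
the blocks are gathered and scattered with MOVLPS/MOVHPS pairs.
*)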

(* FIXME: use MOVAPS when Felix fixes issues with alignment!!! *)
(* FIXME: speed it up when horizontal add is available!!! *)
PROCEDURE MatVecMulR2x2(dadr, ladr, radr: ADDRESS);
CODE{SYSTEM.AMD64, SYSTEM.SSE2}
MOV RBX, [RBP+radr] ; RBX := ADDR(right)
MOV RAX, [RBP+ladr] ; RAX := ADDR(left)
MOV RCX, [RBP+dadr] ; RCX := ADDR(dest)
; load the whole matrix
MOVUPS XMM0, [RAX] ; XMM0 := [a00,a01,a10,a11]
MOVLPS XMM1, [RBX] ; XMM1 := [b00,b10,-,-]
MOVLHPS XMM1, XMM1 ; XMM1 := [b00,b10,b00,b10]
MULPS XMM0, XMM1 ; XMM0 := [a00*b00,a01*b10,a10*b00,a11*b10]
MOVAPS XMM1, XMM0
SHUFPS XMM0, XMM0, 8; XMM0 := [a00*b00,a10*b00,-,-]
SHUFPS XMM1, XMM1, 13; XMM1 := [a01*b10,a11*b10,-,-]
ADDPS XMM0, XMM1
MOVLPS [RCX], XMM0
END MatVecMulR2x2;
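
(*
Reference for MatVecMulR2x2 above (documentation only), assuming a contiguous, row-major
2x2 REAL matrix and a 2-element vector (b00, b10):
  c0 := a00*b00 + a01*b10;
  c1 := a10*b00 + a11*b10;
All four products are formed by one MULPS; the two SHUFPS instructions separate the even
and odd lanes so that a single ADDPS delivers both dot products without an SSE3 horizontal add.
*)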

(* PH *)
(* to do: use MOVAPS when Felix fixes issues with alignment *)
PROCEDURE MatVecMulR4x4(dadr, ladr, radr: ADDRESS);
CODE{SYSTEM.AMD64, SYSTEM.SSE3}
MOV RBX, [RBP+radr] ; RBX := ADDR(right)
MOV RAX, [RBP+ladr] ; RAX := ADDR(left)
MOV RCX, [RBP+dadr] ; RCX := ADDR(dest)
MOVUPS XMM0, [RBX] ; XMM0 := [b0,b1,b2,b3]
MOVUPS XMM1, [RAX] ; XMM1 := [a00,a01,a02,a03]
MOVUPS XMM2, [RAX+16] ; XMM2 := [a10,a11,a12,a13]
MOVUPS XMM3, [RAX+32] ; XMM3 := [a20,a21,a22,a23]
MOVUPS XMM4, [RAX+48] ; XMM4 := [a30,a31,a32,a33]
MULPS XMM1, XMM0
MULPS XMM2, XMM0
HADDPS XMM1, XMM2 ; adjacent pairs are horizontally added
MULPS XMM3, XMM0
MULPS XMM4, XMM0
HADDPS XMM3, XMM4 ; adjacent pairs are horizontally added
HADDPS XMM1, XMM3 ; adjacent pairs are horizontally added
MOVUPS [RCX], XMM1
END MatVecMulR4x4;
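
(*
Reference for MatVecMulR4x4 above (documentation only), assuming a contiguous, row-major
4x4 REAL matrix and a 4-element vector b:
  c(i) := a(i,0)*b0 + a(i,1)*b1 + a(i,2)*b2 + a(i,3)*b3,  i = 0..3.
The four MULPS form all partial products; the three HADDPS (SSE3) instructions then reduce
the four product registers pairwise, leaving the four dot products c(0)..c(3) in one register.
*)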

PROCEDURE InstallMatMul*(context: Commands.Context);
VAR type: LONGINT; string: ARRAY 32 OF CHAR;
BEGIN
context.arg.String(string);
IF string = "dynamic" THEN
type := cMatMulDynamic;
ELSIF string = "scalarproduct" THEN
type := cMatMulScalarProduct
ELSIF string = "naive" THEN
type := cMatMulNaive
ELSIF string = "transposed" THEN
type := cMatMulTransposed
ELSIF string = "stride" THEN
type := cMatMulStride
ELSIF string = "blocked" THEN
type := cMatMulBlocked
ELSE
KernelLog.String("unknown method: "); KernelLog.String(string); KernelLog.Ln;
type := cMatMulDynamic;
END;
SetMatMulMethod( type );
END InstallMatMul;

PROCEDURE InstallAsm*;
BEGIN
KernelLog.String( "ASM " );
ArrayBase.loopSPAXAX := SPAXAXLoopA;
ArrayBase.loopSPARAR := SPARARLoopA;
ArrayBase.loopAddAXAX := AddAXAXLoopA;
ArrayBase.loopAddARAR := AddARARLoopA;
ArrayBase.loopSubAXAX := SubAXAXLoopA;
ArrayBase.loopSubARAR := SubARARLoopA;
ArrayBase.loopMatMulAXAX := MatMulAXAXLoopA;
ArrayBase.loopMatMulARAR := MatMulARARLoopA;
ArrayBase.loopMulAXSX := MulAXSXLoopA;
ArrayBase.loopIncMulAXSX := IncMulAXSXLoopA;
ArrayBase.loopMulARSR := MulARSRLoopA;
ArrayBase.loopIncMulARSR := IncMulARSRLoopA;
ArrayBase.loopMatMulIncAXAX := MatMulIncAXAXLoopA;
ArrayBase.loopMatMulIncARAR := MatMulIncARARLoopA;
ArrayBase.transpose4 := Transpose4;
ArrayBase.transpose8 := Transpose8;
END InstallAsm;

PROCEDURE InstallSSE*;
BEGIN
IF Machine.SSESupport THEN
KernelLog.String( "SSE " );
ArrayBase.loopSPARAR := SPARARLoopSSE;
ArrayBase.loopAddARAR := AddARARLoopSSE;
ArrayBase.loopSubARAR := SubARARLoopSSE;
ArrayBase.loopMulARSR := MulARSRLoopSSE;
ArrayBase.loopIncMulARSR := IncMulARSRLoopSSE;
ArrayBase.loopMatMulARAR := MatMulARARLoopSSE;
ArrayBase.matMulR := MatMulR;
ArrayBase.loopMatMulIncARAR := MatMulIncARARLoopSSE;
ArrayBase.matMulIncR := MatMulIncR;
(* optimizations for small matrices (Alexey Morozov) *)
ArrayBase.matMulR2x2 := MatMulR2x2;
ArrayBase.matMulR3x3 := MatMulR3x3;
ArrayBase.matMulR4x4 := MatMulR4x4;
ArrayBase.matVecMulR2x2 := MatVecMulR2x2;
END;
END InstallSSE;

PROCEDURE InstallSSE2*; (* extra for testing, will be merged with Install in later versions *)
BEGIN
IF Machine.SSE2Support THEN
KernelLog.String( "SSE2 " );
ArrayBase.loopSPAXAX := SPAXAXLoopSSE;
ArrayBase.loopAddAXAX := AddAXAXLoopSSE;
ArrayBase.loopSubAXAX := SubAXAXLoopSSE;
ArrayBase.loopMulAXSX := MulAXSXLoopSSE;
ArrayBase.loopIncMulAXSX := IncMulAXSXLoopSSE;
ArrayBase.loopMatMulAXAX := MatMulAXAXLoopSSE;
ArrayBase.matMulX := MatMulX;
ArrayBase.loopMatMulIncAXAX := MatMulIncAXAXLoopSSE;
ArrayBase.matMulIncX := MatMulIncX;
END;
END InstallSSE2;

(*! to do: currently this only works for Win, not for the native kernel, because SSE3Support is not yet implemented in BIOS.I386.Machine.Mod *)
PROCEDURE InstallSSE3*; (* extra for testing, will be merged with Install in later versions *)
BEGIN
IF Machine.SSE3Support THEN
KernelLog.String( "SSE3 " );
(* optimizations for small matrices *)
ArrayBase.matVecMulR4x4 := MatVecMulR4x4;
END;
END InstallSSE3;

PROCEDURE Install*;
BEGIN
KernelLog.String( "ArrayBaseOptimized: installing runtime library optimizations:" );
InstallAsm; InstallSSE; InstallSSE2; InstallSSE3;
KernelLog.String( " done." ); KernelLog.Ln;
END Install;

PROCEDURE SetParameters*( context: Commands.Context );
BEGIN
context.arg.SkipWhitespace; context.arg.Int(cBlockSize,TRUE);
context.arg.SkipWhitespace; context.arg.Int(nrProcesses,TRUE);
IF nrProcesses > maxProcesses THEN
nrProcesses := maxProcesses
ELSIF nrProcesses = 0 THEN nrProcesses := LONGINT (Machine.NumberOfProcessors());
END;
KernelLog.String( "BlockSize=" ); KernelLog.Int( cBlockSize, 0 );
KernelLog.String( ", NrProcesses = " ); KernelLog.Int( nrProcesses, 0 ); KernelLog.Ln;
END SetParameters;

BEGIN
cBlockSize := 0; (* automatic *)
nrProcesses := LONGINT (Machine.NumberOfProcessors()); (* automatic *)
allocT := 0; copyT := 0; compT := 0;
NEW( cachePool );
END FoxArrayBaseOptimized.

System.Free ArrayBaseOptimized ~
ArrayBaseOptimized.Install ~
ArrayBaseOptimized.InstallSSE2 ~
ArrayBaseOptimized.InstallSSE ~
ArrayBaseOptimized.InstallAsm ~
ArrayBaseOptimized.InstallMatMul dynamic ~
ArrayBaseOptimized.InstallMatMul scalarproduct ~
ArrayBaseOptimized.InstallMatMul transposed ~
ArrayBaseOptimized.InstallMatMul naive ~
ArrayBaseOptimized.InstallMatMul stride ~
ArrayBaseOptimized.InstallMatMul blocked ~
ArrayBaseOptimized.SetParameters 0 1 ~ (* BlockSize, NrProcesses *)