I386.FoxArrayBaseOptimized.Mod

  1. MODULE FoxArrayBaseOptimized; (** AUTHOR "fof"; PURPOSE ""; **)
  2. IMPORT SYSTEM, ArrayBase := FoxArrayBase, Machine, KernelLog, Commands ;
  3. CONST
  4. L2CacheSize = 512 * 1024; (* L1CacheSize = 16 * 1024; *)
  5. (* parameters for blocking matrix multiplication *)
  6. L1BlockN = 5; (* L1 block size -> nr of columns in a block that can be processed using the L1 cache *)
  7. L2BARatio = 1;
  8. L0BlockKR = 4; (* L0 block size -> nr of elements that can be processed at once for type REAL *)
  9. L1MaxBlockKR = 336; (* L1CacheSize/SIZEOF(REAL)/2/6*)
  10. L2BlockSize = 81920;
  11. L0BlockKX = 2; (* L0 block size -> nr of elements that can be processed at once for type LONGREAL *)
  12. L1MaxBlockKX = 256; (* > L1CacheSize/SIZEOF(LONGREAL)/2/6*)
  13. (*
  14. DefaultL2CacheSize = 81920;
  15. L2SizeR = L2CacheSize DIV 8; MaxBlockKR = 336; (* ca L1CacheSize/SIZEOF(REAL)/2/6*) (* nr of elements that can be processed using L2 cache *)
  16. L2SizeX = L2CacheSize DIV 8; MaxBlockKX = 256; (* a bit more than L1CacheSize/SIZEOF(LONGREAL)/2/6*) (* nr of elements that can be processed using L2 cache *)
  17. *)
  18. debug = FALSE; parallel = TRUE; SSE = TRUE;
  19. MaxCachePoolSize = 0; (* 0 = pooling disabled; set to e.g. 646*1024*1024 to enable *)
  20. maxProcesses = 32;
  21. cMatMulDynamic* = -1; cMatMulScalarProduct* = 0;
  22. cMatMulNaive* = 1; cMatMulTransposed* = 2;
  23. cMatMulStride* = 3; cMatMulBlocked* = 4;
  24. VAR
  25. cBlockSize*: LONGINT; nrProcesses*: LONGINT;
  26. lastUsedBlockSize*: SIZE;
  27. allocT-, copyT-, zeroT-, compT-: HUGEINT;
  28. TYPE
  29. Cache = POINTER TO RECORD
  30. p: ANY;
  31. adr: ADDRESS; size: SIZE;
  32. prev, next: Cache;
  33. END;
  34. CachePool = OBJECT
  35. (*! provide heuristics for overall size *)
  36. VAR first, last: Cache;
  37. PROCEDURE & Init*;
  38. BEGIN
  39. NEW( first ); first.size := 0; (* sentinel *)
  40. NEW( last ); last.size := MAX( SIZE ); (* sentinel *)
  41. first.next := last; first.prev := NIL; last.prev := first; last.next := NIL;
  42. END Init;
  43. PROCEDURE Acquire( size: SIZE ): Cache;
  44. VAR c: Cache; t: HUGEINT;
  45. BEGIN {EXCLUSIVE}
  46. IF size = 0 THEN RETURN first END;
  47. Tic( t );
  48. c := last;
  49. WHILE (c.prev.size >= size) DO
  50. c := c.prev;
  51. END;
  52. IF c = last THEN
  53. NEW( c ); SYSTEM.NEW( c.p, size + 16 );
  54. c.adr := Align( c.p , 16 );
  55. c.size := size;
  56. ELSE
  57. c.prev.next := c.next;
  58. c.next.prev := c.prev;
  59. c.prev := NIL; c.next := NIL;
  60. END;
  61. Toc( t, allocT ); RETURN c;
  62. END Acquire;
  63. PROCEDURE Release( c: Cache );
  64. VAR t: Cache;
  65. BEGIN {EXCLUSIVE}
  66. IF (c=first) OR (c=NIL) THEN RETURN END;
  67. ASSERT(c.size > 0);
  68. IF c.size > MaxCachePoolSize THEN RETURN END;
  69. t := first;
  70. WHILE (t.size <= c.size) DO t := t.next; END;
  71. c.prev := t.prev; c.next := t; t.prev := c; c.prev.next := c;
  72. END Release;
  73. END CachePool;
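(* Editor's note: Acquire( size ) returns a Cache record whose adr field points to a 16-byte aligned
   scratch area of at least size bytes, reusing the smallest cached buffer that is large enough or
   allocating a new one; Release reinserts the buffer into the size-ordered free list.  With
   MaxCachePoolSize = 0, Release drops every buffer, i.e. pooling is effectively disabled. *)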
  74. ComputationObj = OBJECT
  75. VAR done: BOOLEAN;
  76. PROCEDURE & Init*;
  77. BEGIN
  78. done := FALSE;
  79. END Init;
  80. PROCEDURE Compute; (*abstract*)
  81. END Compute;
  82. PROCEDURE Wait;
  83. BEGIN {EXCLUSIVE}
  84. AWAIT( done );
  85. END Wait;
  86. BEGIN {ACTIVE, EXCLUSIVE}
  87. Compute; done := TRUE;
  88. END ComputationObj;
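(* Editor's note: ComputationObj is an active object; its body ({ACTIVE, EXCLUSIVE}) runs Compute in
   its own thread of control and then sets done, while Wait blocks the caller (AWAIT( done )) until
   the computation has finished.  The objects below override Compute with the actual matrix kernels
   so that several blocks can be processed in parallel. *)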
  89. MatMulHObjR = OBJECT (ComputationObj)
  90. VAR MatrixA, MatrixB, MatrixC: ADDRESS; Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
  91. add: BOOLEAN;
  92. PROCEDURE & InitR*( MatrixA, MatrixB, MatrixC: ADDRESS; Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
  93. add: BOOLEAN );
  94. BEGIN
  95. Init; SELF.MatrixA := MatrixA; SELF.MatrixB := MatrixB;
  96. SELF.MatrixC := MatrixC; SELF.Stride := Stride;
  97. SELF.IncC := IncC; SELF.StrideC := StrideC;
  98. SELF.RowsA := RowsA; SELF.RowsB := RowsB;
  99. SELF.Cols := Cols; SELF.add := add;
  100. END InitR;
  101. PROCEDURE Compute;
  102. BEGIN
  103. MatMulHBlockR( MatrixA, MatrixB, MatrixC, Stride, IncC,
  104. StrideC, RowsA, RowsB, Cols, add );
  105. END Compute;
  106. END MatMulHObjR;
  107. MatMulHObjX = OBJECT (ComputationObj)
  108. VAR MatrixA, MatrixB, MatrixC: ADDRESS; Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
  109. add: BOOLEAN;
  110. PROCEDURE & InitX*( MatrixA, MatrixB, MatrixC: ADDRESS; Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
  111. add: BOOLEAN );
  112. BEGIN
  113. Init; SELF.MatrixA := MatrixA; SELF.MatrixB := MatrixB;
  114. SELF.MatrixC := MatrixC; SELF.Stride := Stride;
  115. SELF.IncC := IncC; SELF.StrideC := StrideC;
  116. SELF.RowsA := RowsA; SELF.RowsB := RowsB;
  117. SELF.Cols := Cols; SELF.add := add;
  118. END InitX;
  119. PROCEDURE Compute;
  120. BEGIN
  121. MatMulHBlockX( MatrixA, MatrixB, MatrixC, Stride, IncC,
  122. StrideC, RowsA, RowsB, Cols, add );
  123. END Compute;
  124. END MatMulHObjX;
  125. MultiplyObjectR = OBJECT (ComputationObj)
  126. VAR adrA, adrB: ADDRESS; C, M, N, K, IncC, StrideC, L2BlockM, L2BlockN, L2BlockK:SIZE;
  127. start, finished: BOOLEAN;
  128. PROCEDURE & InitR*( adrA, adrB: ADDRESS; C, M, N, K, IncC, StrideC, L2BlockM, L2BlockN, L2BlockK: SIZE );
  129. BEGIN
  130. Init; start := FALSE; finished := FALSE;
  131. SELF.adrA := adrA; SELF.adrB := adrB; SELF.C := C;
  132. SELF.M := M; SELF.N := N; SELF.K := K;
  133. SELF.IncC := IncC; SELF.StrideC := StrideC;
  134. SELF.L2BlockM := L2BlockM;
  135. SELF.L2BlockN := L2BlockN;
  136. SELF.L2BlockK := L2BlockK;
  137. END InitR;
  138. PROCEDURE Compute;
  139. BEGIN
  140. L3BlockR( adrA, adrB, C, M, N, K, IncC, StrideC, L2BlockM,
  141. L2BlockN, L2BlockK );
  142. END Compute;
  143. END MultiplyObjectR;
  144. MultiplyObjectX = OBJECT (ComputationObj)
  145. VAR adrA, adrB:ADDRESS; C, M, N, K, IncC, StrideC, L2BlockM, L2BlockN, L2BlockK: SIZE;
  146. start, finished: BOOLEAN;
  147. PROCEDURE & InitX*( adrA, adrB: ADDRESS; C, M, N, K, IncC, StrideC, L2BlockM, L2BlockN, L2BlockK: SIZE );
  148. BEGIN
  149. Init; start := FALSE; finished := FALSE;
  150. SELF.adrA := adrA; SELF.adrB := adrB; SELF.C := C;
  151. SELF.M := M; SELF.N := N; SELF.K := K;
  152. SELF.IncC := IncC; SELF.StrideC := StrideC;
  153. SELF.L2BlockM := L2BlockM;
  154. SELF.L2BlockN := L2BlockN;
  155. SELF.L2BlockK := L2BlockK;
  156. END InitX;
  157. PROCEDURE Compute;
  158. BEGIN
  159. L3BlockX( adrA, adrB, C, M, N, K, IncC, StrideC, L2BlockM,
  160. L2BlockN, L2BlockK );
  161. END Compute;
  162. END MultiplyObjectX;
  163. VAR
  164. (* ran: Random.Generator; (* testing *)*)
  165. cachePool: CachePool;
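(* Editor's sketch (hypothetical, not part of the original module): the constants above
   (L0BlockK*, L1BlockN, L2BlockSize, L1MaxBlockK*) describe a multi-level blocking of the matrix
   product, where an L2-sized tile stays resident in the second-level cache, panels of L1BlockN
   columns are processed from the L1 cache, and the assembler kernels below consume L0BlockK*
   elements per step.  The plain triple loop here only illustrates the tiling idea; the procedure
   name and parameters are invented, the block sizes are assumed to divide the matrix dimensions,
   and the matrices are row-major with A: M x K, B: K x N, C: M x N. *)
PROCEDURE BlockedMatMulSketch( VAR A, B, C: ARRAY OF REAL; M, N, K, blockM, blockN, blockK: SIZE );
VAR i0, j0, k0, i, j, k: SIZE; sum: REAL;
BEGIN
	i0 := 0;
	WHILE i0 < M DO
		j0 := 0;
		WHILE j0 < N DO
			k0 := 0;
			WHILE k0 < K DO
				(* multiply one blockM x blockK tile of A with one blockK x blockN tile of B *)
				FOR i := i0 TO i0 + blockM - 1 DO
					FOR j := j0 TO j0 + blockN - 1 DO
						sum := C[i * N + j];
						FOR k := k0 TO k0 + blockK - 1 DO
							sum := sum + A[i * K + k] * B[k * N + j]
						END;
						C[i * N + j] := sum
					END
				END;
				INC( k0, blockK )
			END;
			INC( j0, blockN )
		END;
		INC( i0, blockM )
	END
END BlockedMatMulSketch;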
  166. (*********** Part 0: assembler routines ***************)
  167. PROCEDURE -L1Block1XA( adrA, adrB, adrC: ADDRESS; K: SIZE );
  168. CODE {SYSTEM.i386, SYSTEM.FPU}
  169. MOV EAX, [ESP+K] ; EAX IS counter
  170. MOV EDX, [ESP+adrC]
  171. MOV ECX, [ESP+adrB] ; ECX IS POINTER TO data OF matrix B
  172. MOV EBX, [ESP+adrA] ; EBX IS POINTER TO data OF matrix A
  173. FLD QWORD [EDX] ; S.GET(dadr, x)
  174. loop8:
  175. CMP EAX, 8
  176. JL loop1
  177. FLD QWORD[EBX] ; S.GET(ladr, x)
  178. ADD EBX, 8 ; INC(ladr, incl)
  179. FLD QWORD[ECX] ; S.GET(ladr, y)
  180. ADD ECX, 8 ; INC(radr, incr)
  181. FMULP ; x := x*y
  182. FADDP ; z := z+x
  183. FLD QWORD[EBX] ; S.GET(ladr, x)
  184. ADD EBX, 8 ; INC(ladr, incl)
  185. FLD QWORD[ECX] ; S.GET(ladr, y)
  186. ADD ECX, 8 ; INC(radr, incr)
  187. FMULP ; x := x*y
  188. FADDP ; z := z+x
  189. FLD QWORD[EBX] ; S.GET(ladr, x)
  190. ADD EBX, 8 ; INC(ladr, incl)
  191. FLD QWORD[ECX] ; S.GET(ladr, y)
  192. ADD ECX, 8 ; INC(radr, incr)
  193. FMULP ; x := x*y
  194. FADDP ; z := z+x
  195. FLD QWORD[EBX] ; S.GET(ladr, x)
  196. ADD EBX, 8 ; INC(ladr, incl)
  197. FLD QWORD[ECX] ; S.GET(ladr, y)
  198. ADD ECX, 8 ; INC(radr, incr)
  199. FMULP ; x := x*y
  200. FADDP ; z := z+x
  201. FLD QWORD[EBX] ; S.GET(ladr, x)
  202. ADD EBX, 8 ; INC(ladr, incl)
  203. FLD QWORD[ECX] ; S.GET(ladr, y)
  204. ADD ECX, 8 ; INC(radr, incr)
  205. FMULP ; x := x*y
  206. FADDP ; z := z+x
  207. FLD QWORD[EBX] ; S.GET(ladr, x)
  208. ADD EBX, 8 ; INC(ladr, incl)
  209. FLD QWORD[ECX] ; S.GET(ladr, y)
  210. ADD ECX, 8 ; INC(radr, incr)
  211. FMULP ; x := x*y
  212. FADDP ; z := z+x
  213. FLD QWORD[EBX] ; S.GET(ladr, x)
  214. ADD EBX, 8 ; INC(ladr, incl)
  215. FLD QWORD[ECX] ; S.GET(ladr, y)
  216. ADD ECX, 8 ; INC(radr, incr)
  217. FMULP ; x := x*y
  218. FADDP ; z := z+x
  219. FLD QWORD[EBX] ; S.GET(ladr, x)
  220. ADD EBX, 8 ; INC(ladr, incl)
  221. FLD QWORD[ECX] ; S.GET(ladr, y)
  222. ADD ECX, 8 ; INC(radr, incr)
  223. FMULP ; x := x*y
  224. FADDP ; z := z+x
  225. SUB EAX, 8 ; DEC(len)
  226. JMP loop8 ;
  227. loop1:
  228. CMP EAX, 0 ; WHILE len > 0 DO
  229. JLE endL
  230. FLD QWORD[EBX] ; S.GET(ladr, x)
  231. ADD EBX, 8 ; INC(ladr, incl)
  232. FLD QWORD[ECX] ; S.GET(ladr, y)
  233. ADD ECX, 8 ; INC(radr, incr)
  234. FMULP ; x := x*y
  235. FADDP ; z := z+x
  236. DEC EAX ; DEC(len)
  237. JMP loop1 ;
  238. endL:
  239. FSTP QWORD[EDX] ; S.PUT(dadr, x)
  240. FWAIT ;
  241. ADD ESP, 16 ;
  242. END L1Block1XA;
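(* Editor's sketch (hypothetical): the scalar computation performed by L1Block1XA above, spelled out
   from its per-instruction comments.  adrA and adrB point to K contiguous LONGREALs; their dot
   product is accumulated onto the LONGREAL at adrC. *)
PROCEDURE L1Block1XSketch( adrA, adrB, adrC: ADDRESS; K: SIZE );
VAR x, y, z: LONGREAL; k: SIZE;
BEGIN
	SYSTEM.GET( adrC, z );
	FOR k := 0 TO K - 1 DO
		SYSTEM.GET( adrA, x ); INC( adrA, 8 );
		SYSTEM.GET( adrB, y ); INC( adrB, 8 );
		z := z + x * y
	END;
	SYSTEM.PUT( adrC, z )
END L1Block1XSketch;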
  243. PROCEDURE -L1Block1XSSE( adrA, adrB, adrC: ADDRESS; K: SIZE );
  244. (*
  245. matrixA, matrixB must be stored in special format
  246. K>0 guaranteed
  247. *)
  248. CODE {SYSTEM.i386, SYSTEM.SSE2}
  249. MOV EBX, [ESP+adrA] ; EBX IS POINTER TO data OF matrix A
  250. MOV ECX, [ESP+adrB] ; ECX IS POINTER TO data OF matrix B
  251. MOV EDX, [ESP+K] ; EDX IS counter
  252. XORPD XMM2, XMM2 ;
  253. kLoop8: ;
  254. CMP EDX, 8 ;
  255. JL kLoop2 ;
  256. MOVAPD XMM7, [EBX] ;
  257. MOVAPD XMM0, [ECX] ;
  258. ADD ECX, 16 ;
  259. ADD EBX, 16 ;
  260. MOVAPD XMM6, [EBX] ;
  261. MOVAPD XMM1, [ECX] ;
  262. ADD ECX, 16 ;
  263. ADD EBX, 16 ;
  264. MULPD XMM0, XMM7 ;
  265. ADDPD XMM2, XMM0 ;
  266. MOVAPD XMM5, [EBX] ;
  267. MOVAPD XMM3, [ECX] ;
  268. ADD ECX, 16 ;
  269. ADD EBX, 16 ;
  270. MULPD XMM1, XMM6 ;
  271. ADDPD XMM2, XMM1 ;
  272. MOVAPD XMM7, [EBX] ;
  273. MOVAPD XMM0, [ECX] ;
  274. ADD ECX, 16 ;
  275. ADD EBX, 16 ;
  276. MULPD XMM3, XMM5 ;
  277. ADDPD XMM2, XMM3 ;
  278. MULPD XMM0, XMM7 ;
  279. ADDPD XMM2, XMM0 ;
  280. SUB EDX, 8 ;
  281. JMP kLoop8 ;
  282. kLoop2: ;
  283. CMP EDX, 0 ;
  284. JLE horizontalAdd ;
  285. MOVAPD XMM7, [EBX] ;
  286. MOVAPD XMM0, [ECX] ;
  287. ADD ECX, 16 ;
  288. ADD EBX, 16 ;
  289. MULPD XMM0, XMM7 ;
  290. ADDPD XMM2, XMM0 ;
  291. SUB EDX, 2
  292. JMP kLoop2 ;
  293. horizontalAdd:
  294. MOV EDI, [ESP+adrC] ;
  295. MOVAPD XMM1, XMM2 ;
  296. SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
  297. ADDPD XMM2, XMM1 ;
  298. ADDSD XMM2, [EDI] ;
  299. MOVSD [EDI], XMM2 ;
  300. endL:
  301. ADD ESP, 16 ;
  302. END L1Block1XSSE;
  303. PROCEDURE -L1Block5XSSE( adrA, adrB, adrC: ADDRESS; IncC, K: SIZE );
  304. (*
  305. matrixA and matrix B are stored in special format !
  306. K > 0 is guaranteed
  307. *)
  308. CODE {SYSTEM.i386, SYSTEM.SSE2}
  309. MOV EBX, [ESP+adrA] ; EBX IS POINTER TO data OF matrix A
  310. MOV ECX, [ESP+adrB] ; ECX IS POINTER TO data OF matrix B
  311. MOV EDX, [ESP+K] ; EDX IS counter
  312. XORPD XMM2, XMM2 ;
  313. XORPD XMM3, XMM3 ;
  314. XORPD XMM4, XMM4 ;
  315. XORPD XMM5, XMM5 ;
  316. XORPD XMM6, XMM6 ;
  317. kLoop8: ;
  318. CMP EDX, 8 ;
  319. JL kLoop2
  320. ; (*-- 0 -- *) ;
  321. MOVAPD XMM7, [EBX] ; get 2 elements OF A
  322. ADD EBX, 16 ;
  323. MOVAPD XMM0, [ECX] ; get 2 elements OF B
  324. ADD ECX, 16 ;
  325. MOVAPD XMM1, [ECX] ; get 2 elements OF B
  326. ADD ECX, 16 ;
  327. MULPD XMM0, XMM7 ;
  328. ADDPD XMM2, XMM0 ;
  329. MOVAPD XMM0, [ECX] ;
  330. ADD ECX, 16 ;
  331. MULPD XMM1, XMM7 ;
  332. ADDPD XMM3, XMM1 ;
  333. MOVAPD XMM1, [ECX] ;
  334. ADD ECX, 16 ;
  335. MULPD XMM0, XMM7 ;
  336. ADDPD XMM4, XMM0 ;
  337. MOVAPD XMM0, [ECX] ;
  338. ADD ECX, 16 ;
  339. MULPD XMM1, XMM7 ;
  340. ADDPD XMM5, XMM1 ;
  341. MOVAPD XMM1, [ECX] ;
  342. ADD ECX, 16 ;
  343. MULPD XMM0, XMM7 ;
  344. ADDPD XMM6, XMM0
  345. ; (*-- 2 -- *) ;
  346. MOVAPD XMM7, [EBX] ;
  347. ADD EBX, 16 ;
  348. MOVAPD XMM0, [ECX] ;
  349. ADD ECX, 16 ;
  350. MULPD XMM1, XMM7 ;
  351. ADDPD XMM2, XMM1 ;
  352. MOVAPD XMM1, [ECX] ;
  353. ADD ECX, 16 ;
  354. MULPD XMM0, XMM7 ;
  355. ADDPD XMM3, XMM0 ;
  356. MOVAPD XMM0, [ECX] ;
  357. ADD ECX, 16 ;
  358. MULPD XMM1, XMM7 ;
  359. ADDPD XMM4, XMM1 ;
  360. MOVAPD XMM1, [ECX] ;
  361. ADD ECX, 16 ;
  362. MULPD XMM0, XMM7 ;
  363. ADDPD XMM5, XMM0 ;
  364. MOVAPD XMM0, [ECX] ;
  365. ADD ECX, 16 ;
  366. MULPD XMM1, XMM7 ;
  367. ADDPD XMM6, XMM1
  368. ; (*-- 4 -- *) ;
  369. MOVAPD XMM7, [EBX] ;
  370. ADD EBX, 16 ;
  371. MOVAPD XMM1, [ECX] ;
  372. ADD ECX, 16 ;
  373. MULPD XMM0, XMM7 ;
  374. ADDPD XMM2, XMM0 ;
  375. MOVAPD XMM0, [ECX] ;
  376. ADD ECX, 16 ;
  377. MULPD XMM1, XMM7 ;
  378. ADDPD XMM3, XMM1 ;
  379. MOVAPD XMM1, [ECX] ;
  380. ADD ECX, 16 ;
  381. MULPD XMM0, XMM7 ;
  382. ADDPD XMM4, XMM0 ;
  383. MOVAPD XMM0, [ECX] ;
  384. ADD ECX, 16 ;
  385. MULPD XMM1, XMM7 ;
  386. ADDPD XMM5, XMM1 ;
  387. MOVAPD XMM1, [ECX] ;
  388. ADD ECX, 16 ;
  389. MULPD XMM0, XMM7 ;
  390. ADDPD XMM6, XMM0
  391. ; (*-- 6 -- *) ;
  392. MOVAPD XMM7, [EBX] ;
  393. ADD EBX, 16 ;
  394. MOVAPD XMM0, [ECX] ;
  395. ADD ECX, 16 ;
  396. MULPD XMM1, XMM7 ;
  397. ADDPD XMM2, XMM1 ;
  398. MOVAPD XMM1, [ECX] ;
  399. ADD ECX, 16 ;
  400. MULPD XMM0, XMM7 ;
  401. ADDPD XMM3, XMM0 ;
  402. MOVAPD XMM0, [ECX] ;
  403. ADD ECX, 16 ;
  404. MULPD XMM1, XMM7 ;
  405. ADDPD XMM4, XMM1 ;
  406. MOVAPD XMM1, [ECX] ;
  407. ADD ECX, 16 ;
  408. MULPD XMM0, XMM7 ;
  409. ADDPD XMM5, XMM0 ;
  410. MULPD XMM1, XMM7 ;
  411. ADDPD XMM6, XMM1 ;
  412. SUB EDX, 8
  413. JMP kLoop8 ;
  414. kLoop2: ;
  415. CMP EDX, 0 ;
  416. JLE horizontalAdd ;
  417. MOVAPD XMM7, [EBX] ;
  418. ADD EBX, 16 ;
  419. MOVAPD XMM0, [ECX] ;
  420. ADD ECX, 16 ;
  421. MOVAPD XMM1, [ECX] ;
  422. ADD ECX, 16 ;
  423. MULPD XMM0, XMM7 ;
  424. ADDPD XMM2, XMM0 ;
  425. MOVAPD XMM0, [ECX] ;
  426. ADD ECX, 16 ;
  427. MULPD XMM1, XMM7 ;
  428. ADDPD XMM3, XMM1 ;
  429. MOVAPD XMM1, [ECX] ;
  430. ADD ECX, 16 ;
  431. MULPD XMM0, XMM7 ;
  432. ADDPD XMM4, XMM0 ;
  433. MOVAPD XMM0, [ECX] ;
  434. ADD ECX, 16 ;
  435. MULPD XMM1, XMM7 ;
  436. ADDPD XMM5, XMM1 ;
  437. MULPD XMM0, XMM7 ;
  438. ADDPD XMM6, XMM0 ;
  439. SUB EDX, 2
  440. JMP kLoop2 ;
  441. horizontalAdd: ; add and store
  442. MOV EDI, [ESP+adrC] ;
  443. MOV EAX, [ESP+IncC] ;
  444. MOVAPD XMM1, XMM2 ;
  445. SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
  446. ADDPD XMM2, XMM1 ;
  447. ADDSD XMM2, [EDI] ;
  448. MOVSD [EDI], XMM2 ;
  449. ADD EDI, EAX ;
  450. MOVAPD XMM1, XMM3 ;
  451. SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
  452. ADDPD XMM3, XMM1 ;
  453. ADDSD XMM3, [EDI] ;
  454. MOVSD [EDI], XMM3 ;
  455. ADD EDI, EAX ;
  456. MOVAPD XMM1, XMM4 ;
  457. SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
  458. ADDPD XMM4, XMM1 ;
  459. ADDSD XMM4, [EDI] ;
  460. MOVSD [EDI], XMM4 ;
  461. ADD EDI, EAX ;
  462. MOVAPD XMM1, XMM5 ;
  463. SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
  464. ADDPD XMM5, XMM1 ;
  465. ADDSD XMM5, [EDI] ;
  466. MOVSD [EDI], XMM5 ;
  467. ADD EDI, EAX ;
  468. MOVAPD XMM1, XMM6 ;
  469. SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
  470. ADDPD XMM6, XMM1 ;
  471. ADDSD XMM6, [EDI] ;
  472. MOVSD [EDI], XMM6 ;
  473. endL:
  474. ADD ESP, 20 ;
  475. END L1Block5XSSE;
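(* Editor's sketch (hypothetical): what L1Block5XSSE above computes, derived from its load pattern.
   adrA points to K LONGREALs of one panel of A; adrB points to the packed operand in which, for
   every pair of k indices, the corresponding element pairs of five columns of B are stored back to
   back (5 x 16 bytes per pair); the five dot products are added to the LONGREALs at adrC,
   adrC+IncC, ..., adrC+4*IncC.  K is assumed even here, matching the pairwise SSE2 loop. *)
PROCEDURE L1Block5XSketch( adrA, adrB, adrC: ADDRESS; IncC, K: SIZE );
VAR a0, a1, b0, b1, c: LONGREAL; j, k: SIZE;
BEGIN
	k := 0;
	WHILE k < K DO
		SYSTEM.GET( adrA, a0 ); SYSTEM.GET( adrA + 8, a1 ); INC( adrA, 16 );
		FOR j := 0 TO 4 DO
			SYSTEM.GET( adrB, b0 ); SYSTEM.GET( adrB + 8, b1 ); INC( adrB, 16 );
			SYSTEM.GET( adrC + j * IncC, c );
			SYSTEM.PUT( adrC + j * IncC, c + a0 * b0 + a1 * b1 )
		END;
		INC( k, 2 )
	END
END L1Block5XSketch;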
  476. PROCEDURE -L1Block1RA( adrA, adrB, adrC: ADDRESS; K: SIZE );
  477. CODE {SYSTEM.i386, SYSTEM.FPU}
  478. MOV EAX, [ESP+K] ; EAX IS counter
  479. MOV EDX, [ESP+adrC]
  480. MOV ECX, [ESP+adrB] ; ECX IS POINTER TO data OF matrix B
  481. MOV EBX, [ESP+adrA] ; EBX IS POINTER TO data OF matrix A
  482. FLD DWORD [EDX] ; S.GET(dadr, x)
  483. loop16:
  484. CMP EAX, 16
  485. JL loop1
  486. FLD DWORD[EBX] ; S.GET(ladr, x)
  487. ADD EBX, 4 ; INC(ladr, incl)
  488. FLD DWORD[ECX] ; S.GET(ladr, y)
  489. ADD ECX, 4 ; INC(radr, incr)
  490. FMULP ; x := x*y
  491. FADDP ; z := z+x
  492. FLD DWORD[EBX] ; S.GET(ladr, x)
  493. ADD EBX, 4 ; INC(ladr, incl)
  494. FLD DWORD[ECX] ; S.GET(ladr, y)
  495. ADD ECX, 4 ; INC(radr, incr)
  496. FMULP ; x := x*y
  497. FADDP ; z := z+x
  498. FLD DWORD[EBX] ; S.GET(ladr, x)
  499. ADD EBX, 4 ; INC(ladr, incl)
  500. FLD DWORD[ECX] ; S.GET(ladr, y)
  501. ADD ECX, 4 ; INC(radr, incr)
  502. FMULP ; x := x*y
  503. FADDP ; z := z+x
  504. FLD DWORD[EBX] ; S.GET(ladr, x)
  505. ADD EBX, 4 ; INC(ladr, incl)
  506. FLD DWORD[ECX] ; S.GET(ladr, y)
  507. ADD ECX, 4 ; INC(radr, incr)
  508. FMULP ; x := x*y
  509. FADDP ; z := z+x
  510. FLD DWORD[EBX] ; S.GET(ladr, x)
  511. ADD EBX, 4 ; INC(ladr, incl)
  512. FLD DWORD[ECX] ; S.GET(ladr, y)
  513. ADD ECX, 4 ; INC(radr, incr)
  514. FMULP ; x := x*y
  515. FADDP ; z := z+x
  516. FLD DWORD[EBX] ; S.GET(ladr, x)
  517. ADD EBX, 4 ; INC(ladr, incl)
  518. FLD DWORD[ECX] ; S.GET(ladr, y)
  519. ADD ECX, 4 ; INC(radr, incr)
  520. FMULP ; x := x*y
  521. FADDP ; z := z+x
  522. FLD DWORD[EBX] ; S.GET(ladr, x)
  523. ADD EBX, 4 ; INC(ladr, incl)
  524. FLD DWORD[ECX] ; S.GET(ladr, y)
  525. ADD ECX, 4 ; INC(radr, incr)
  526. FMULP ; x := x*y
  527. FADDP ; z := z+x
  528. FLD DWORD[EBX] ; S.GET(ladr, x)
  529. ADD EBX, 4 ; INC(ladr, incl)
  530. FLD DWORD[ECX] ; S.GET(ladr, y)
  531. ADD ECX, 4 ; INC(radr, incr)
  532. FMULP ; x := x*y
  533. FADDP ; z := z+x
  534. FLD DWORD[EBX] ; S.GET(ladr, x)
  535. ADD EBX, 4 ; INC(ladr, incl)
  536. FLD DWORD[ECX] ; S.GET(ladr, y)
  537. ADD ECX, 4 ; INC(radr, incr)
  538. FMULP ; x := x*y
  539. FADDP ; z := z+x
  540. FLD DWORD[EBX] ; S.GET(ladr, x)
  541. ADD EBX, 4 ; INC(ladr, incl)
  542. FLD DWORD[ECX] ; S.GET(ladr, y)
  543. ADD ECX, 4 ; INC(radr, incr)
  544. FMULP ; x := x*y
  545. FADDP ; z := z+x
  546. FLD DWORD[EBX] ; S.GET(ladr, x)
  547. ADD EBX, 4 ; INC(ladr, incl)
  548. FLD DWORD[ECX] ; S.GET(ladr, y)
  549. ADD ECX, 4 ; INC(radr, incr)
  550. FMULP ; x := x*y
  551. FADDP ; z := z+x
  552. FLD DWORD[EBX] ; S.GET(ladr, x)
  553. ADD EBX, 4 ; INC(ladr, incl)
  554. FLD DWORD[ECX] ; S.GET(ladr, y)
  555. ADD ECX, 4 ; INC(radr, incr)
  556. FMULP ; x := x*y
  557. FADDP ; z := z+x
  558. FLD DWORD[EBX] ; S.GET(ladr, x)
  559. ADD EBX, 4 ; INC(ladr, incl)
  560. FLD DWORD[ECX] ; S.GET(ladr, y)
  561. ADD ECX, 4 ; INC(radr, incr)
  562. FMULP ; x := x*y
  563. FADDP ; z := z+x
  564. FLD DWORD[EBX] ; S.GET(ladr, x)
  565. ADD EBX, 4 ; INC(ladr, incl)
  566. FLD DWORD[ECX] ; S.GET(ladr, y)
  567. ADD ECX, 4 ; INC(radr, incr)
  568. FMULP ; x := x*y
  569. FADDP ; z := z+x
  570. FLD DWORD[EBX] ; S.GET(ladr, x)
  571. ADD EBX, 4 ; INC(ladr, incl)
  572. FLD DWORD[ECX] ; S.GET(ladr, y)
  573. ADD ECX, 4 ; INC(radr, incr)
  574. FMULP ; x := x*y
  575. FADDP ; z := z+x
  576. FLD DWORD[EBX] ; S.GET(ladr, x)
  577. ADD EBX, 4 ; INC(ladr, incl)
  578. FLD DWORD[ECX] ; S.GET(ladr, y)
  579. ADD ECX, 4 ; INC(radr, incr)
  580. FMULP ; x := x*y
  581. FADDP ; z := z+x
  582. SUB EAX, 16 ; DEC(len)
  583. JMP loop16 ;
  584. loop1:
  585. CMP EAX, 0 ; WHILE len > 0 DO
  586. JLE endL
  587. FLD DWORD[EBX] ; S.GET(ladr, x)
  588. ADD EBX, 4 ; INC(ladr, incl)
  589. FLD DWORD[ECX] ; S.GET(ladr, y)
  590. ADD ECX, 4 ; INC(radr, incr)
  591. FMULP ; x := x*y
  592. FADDP ; z := z+x
  593. DEC EAX ; DEC(len)
  594. JMP loop1 ;
  595. endL:
  596. FSTP DWORD[EDX] ; S.PUT(dadr, x)
  597. FWAIT ;
  598. ADD ESP, 16 ;
  599. END L1Block1RA;
  600. PROCEDURE -L1Block1RSSE( adrA, adrB, adrC: ADDRESS; K: SIZE );
  601. (*
  602. matrixA, matrixB must be stored in special format
  603. K>0 guaranteed
  604. *)
  605. CODE {SYSTEM.i386, SYSTEM.SSE}
  606. MOV EBX, [ESP+adrA] ; EBX IS POINTER TO data OF matrix A
  607. MOV ECX, [ESP+adrB] ; ECX IS POINTER TO data OF matrix B
  608. MOV EDX, [ESP+K] ; EDX IS counter
  609. XORPS XMM2, XMM2 ;
  610. kLoop16: ;
  611. CMP EDX, 16 ;
  612. JL kLoop4 ;
  613. MOVAPS XMM7, [EBX] ;
  614. MOVAPS XMM0, [ECX] ;
  615. ADD ECX, 16 ;
  616. ADD EBX, 16 ;
  617. MOVAPS XMM6, [EBX] ;
  618. MOVAPS XMM1, [ECX] ;
  619. ADD ECX, 16 ;
  620. ADD EBX, 16 ;
  621. MULPS XMM0, XMM7 ;
  622. ADDPS XMM2, XMM0 ;
  623. MOVAPS XMM5, [EBX] ;
  624. MOVAPS XMM3, [ECX] ;
  625. ADD ECX, 16 ;
  626. ADD EBX, 16 ;
  627. MULPS XMM1, XMM6 ;
  628. ADDPS XMM2, XMM1 ;
  629. MOVAPS XMM7, [EBX] ;
  630. MOVAPS XMM0, [ECX] ;
  631. ADD ECX, 16 ;
  632. ADD EBX, 16 ;
  633. MULPS XMM3, XMM5 ;
  634. ADDPS XMM2, XMM3 ;
  635. MULPS XMM0, XMM7 ;
  636. ADDPS XMM2, XMM0 ;
  637. SUB EDX, 16 ;
  638. JMP kLoop16 ;
  639. kLoop4: ;
  640. CMP EDX, 0 ;
  641. JLE horizontalAdd ;
  642. MOVAPS XMM7, [EBX] ;
  643. MOVAPS XMM0, [ECX] ;
  644. ADD ECX, 16 ;
  645. ADD EBX, 16 ;
  646. MULPS XMM0, XMM7 ;
  647. ADDPS XMM2, XMM0 ;
  648. SUB EDX, 4
  649. JMP kLoop4 ;
  650. horizontalAdd:
  651. MOV EDI, [ESP+adrC] ;
  652. MOVLHPS XMM1, XMM2 ;
  653. ADDPS XMM1, XMM2 ;
  654. SHUFPS XMM2, XMM1, 48 ;
  655. ADDPS XMM2, XMM1 ;
  656. MOVHLPS XMM2, XMM2 ;
  657. ADDSS XMM2, [EDI] ;
  658. MOVSS [EDI], XMM2 ;
  659. endL:
  660. ADD ESP, 16 ;
  661. END L1Block1RSSE;
  662. PROCEDURE -L1Block5RSSE( adrA, adrB, adrC: ADDRESS; IncC, K: SIZE );
  663. (*
  664. matrixA and matrix B are stored in special format !
  665. K > 0 is guaranteed
  666. *)
  667. CODE {SYSTEM.i386, SYSTEM.SSE}
  668. MOV EBX, [ESP+adrA] ; EBX IS POINTER TO data OF matrix A
  669. MOV ECX, [ESP+adrB] ; ECX IS POINTER TO data OF matrix B
  670. MOV EDX, [ESP+K] ; EDX IS counter
  671. XORPS XMM2, XMM2 ;
  672. XORPS XMM3, XMM3 ;
  673. XORPS XMM4, XMM4 ;
  674. XORPS XMM5, XMM5 ;
  675. XORPS XMM6, XMM6 ;
  676. kLoop16: ;
  677. CMP EDX, 16 ;
  678. JL kLoop4 ; (*-- 0 -- *)
  679. MOVAPS XMM7, [EBX] ; get 4 elements OF A
  680. ADD EBX, 16 ;
  681. MOVAPS XMM0, [ECX] ; get 4 elements OF B
  682. ADD ECX, 16 ;
  683. MOVAPS XMM1, [ECX] ; get 4 elements OF B
  684. ADD ECX, 16 ;
  685. MULPS XMM0, XMM7 ;
  686. ADDPS XMM2, XMM0 ;
  687. MOVAPS XMM0, [ECX] ;
  688. ADD ECX, 16 ;
  689. MULPS XMM1, XMM7 ;
  690. ADDPS XMM3, XMM1 ;
  691. MOVAPS XMM1, [ECX] ;
  692. ADD ECX, 16 ;
  693. MULPS XMM0, XMM7 ;
  694. ADDPS XMM4, XMM0 ;
  695. MOVAPS XMM0, [ECX] ;
  696. ADD ECX, 16 ;
  697. MULPS XMM1, XMM7 ;
  698. ADDPS XMM5, XMM1 ;
  699. MOVAPS XMM1, [ECX] ;
  700. ADD ECX, 16 ;
  701. MULPS XMM0, XMM7 ;
  702. ADDPS XMM6, XMM0
  703. ; (*-- 4 -- *) ;
  704. MOVAPS XMM7, [EBX] ;
  705. ADD EBX, 16 ;
  706. MOVAPS XMM0, [ECX] ;
  707. ADD ECX, 16 ;
  708. MULPS XMM1, XMM7 ;
  709. ADDPS XMM2, XMM1 ;
  710. MOVAPS XMM1, [ECX] ;
  711. ADD ECX, 16 ;
  712. MULPS XMM0, XMM7 ;
  713. ADDPS XMM3, XMM0 ;
  714. MOVAPS XMM0, [ECX] ;
  715. ADD ECX, 16 ;
  716. MULPS XMM1, XMM7 ;
  717. ADDPS XMM4, XMM1 ;
  718. MOVAPS XMM1, [ECX] ;
  719. ADD ECX, 16 ;
  720. MULPS XMM0, XMM7 ;
  721. ADDPS XMM5, XMM0 ;
  722. MOVAPS XMM0, [ECX] ;
  723. ADD ECX, 16 ;
  724. MULPS XMM1, XMM7 ;
  725. ADDPS XMM6, XMM1
  726. ; (*-- 8 -- *) ;
  727. MOVAPS XMM7, [EBX] ;
  728. ADD EBX, 16 ;
  729. MOVAPS XMM1, [ECX] ;
  730. ADD ECX, 16 ;
  731. MULPS XMM0, XMM7 ;
  732. ADDPS XMM2, XMM0 ;
  733. MOVAPS XMM0, [ECX] ;
  734. ADD ECX, 16 ;
  735. MULPS XMM1, XMM7 ;
  736. ADDPS XMM3, XMM1 ;
  737. MOVAPS XMM1, [ECX] ;
  738. ADD ECX, 16 ;
  739. MULPS XMM0, XMM7 ;
  740. ADDPS XMM4, XMM0 ;
  741. MOVAPS XMM0, [ECX] ;
  742. ADD ECX, 16 ;
  743. MULPS XMM1, XMM7 ;
  744. ADDPS XMM5, XMM1 ;
  745. MOVAPS XMM1, [ECX] ;
  746. ADD ECX, 16 ;
  747. MULPS XMM0, XMM7 ;
  748. ADDPS XMM6, XMM0
  749. ; (*-- 12 -- *) ;
  750. MOVAPS XMM7, [EBX] ;
  751. ADD EBX, 16 ;
  752. MOVAPS XMM0, [ECX] ;
  753. ADD ECX, 16 ;
  754. MULPS XMM1, XMM7 ;
  755. ADDPS XMM2, XMM1 ;
  756. MOVAPS XMM1, [ECX] ;
  757. ADD ECX, 16 ;
  758. MULPS XMM0, XMM7 ;
  759. ADDPS XMM3, XMM0 ;
  760. MOVAPS XMM0, [ECX] ;
  761. ADD ECX, 16 ;
  762. MULPS XMM1, XMM7 ;
  763. ADDPS XMM4, XMM1 ;
  764. MOVAPS XMM1, [ECX] ;
  765. ADD ECX, 16 ;
  766. MULPS XMM0, XMM7 ;
  767. ADDPS XMM5, XMM0 ;
  768. MULPS XMM1, XMM7 ;
  769. ADDPS XMM6, XMM1 ;
  770. SUB EDX, 16
  771. JMP kLoop16 ;
  772. kLoop4: ;
  773. CMP EDX, 0 ;
  774. JLE horizontalAdd ;
  775. MOVAPS XMM7, [EBX] ;
  776. ADD EBX, 16 ;
  777. MOVAPS XMM0, [ECX] ;
  778. ADD ECX, 16 ;
  779. MOVAPS XMM1, [ECX] ;
  780. ADD ECX, 16 ;
  781. MULPS XMM0, XMM7 ;
  782. ADDPS XMM2, XMM0 ;
  783. MOVAPS XMM0, [ECX] ;
  784. ADD ECX, 16 ;
  785. MULPS XMM1, XMM7 ;
  786. ADDPS XMM3, XMM1 ;
  787. MOVAPS XMM1, [ECX] ;
  788. ADD ECX, 16 ;
  789. MULPS XMM0, XMM7 ;
  790. ADDPS XMM4, XMM0 ;
  791. MOVAPS XMM0, [ECX] ;
  792. ADD ECX, 16 ;
  793. MULPS XMM1, XMM7 ;
  794. ADDPS XMM5, XMM1 ;
  795. MULPS XMM0, XMM7 ;
  796. ADDPS XMM6, XMM0 ;
  797. SUB EDX, 4
  798. JMP kLoop4 ;
  799. horizontalAdd: ; add and store
  800. MOV EDI, [ESP+adrC] ;
  801. MOV EAX, [ESP+IncC] ;
  802. MOVLHPS XMM1, XMM2 ;
  803. ADDPS XMM1, XMM2 ;
  804. SHUFPS XMM2, XMM1, 48 ;
  805. ADDPS XMM2, XMM1 ;
  806. MOVHLPS XMM2, XMM2 ;
  807. ADDSS XMM2, [EDI] ;
  808. MOVSS [EDI], XMM2 ;
  809. ADD EDI, EAX ;
  810. MOVLHPS XMM1, XMM3 ;
  811. ADDPS XMM1, XMM3 ;
  812. SHUFPS XMM3, XMM1, 48 ;
  813. ADDPS XMM3, XMM1 ;
  814. MOVHLPS XMM3, XMM3 ;
  815. ADDSS XMM3, [EDI] ;
  816. MOVSS [EDI], XMM3 ;
  817. ADD EDI, EAX ;
  818. MOVLHPS XMM1, XMM4 ;
  819. ADDPS XMM1, XMM4 ;
  820. SHUFPS XMM4, XMM1, 48 ;
  821. ADDPS XMM4, XMM1 ;
  822. MOVHLPS XMM4, XMM4 ;
  823. ADDSS XMM4, [EDI] ;
  824. MOVSS [EDI], XMM4 ;
  825. ADD EDI, EAX ;
  826. MOVLHPS XMM1, XMM5 ;
  827. ADDPS XMM1, XMM5 ;
  828. SHUFPS XMM5, XMM1, 48 ;
  829. ADDPS XMM5, XMM1 ;
  830. MOVHLPS XMM5, XMM5 ;
  831. ADDSS XMM5, [EDI] ;
  832. MOVSS [EDI], XMM5 ;
  833. ADD EDI, EAX ;
  834. MOVLHPS XMM1, XMM6 ;
  835. ADDPS XMM1, XMM6 ;
  836. SHUFPS XMM6, XMM1, 48 ;
  837. ADDPS XMM6, XMM1 ;
  838. MOVHLPS XMM6, XMM6 ;
  839. ADDSS XMM6, [EDI] ;
  840. MOVSS [EDI], XMM6 ;
  841. endL:
  842. ADD ESP, 20 ;
  843. END L1Block5RSSE;
  844. PROCEDURE -Align4( adr: ADDRESS ): ADDRESS;
  845. CODE {SYSTEM.i386}
  846. MOV EAX, [ESP+adr] ;
  847. NEG EAX ;
  848. AND EAX, 3H ;
  849. ADD EAX, [ESP+adr] ;
  850. ADD ESP, 4
  851. END Align4;
  852. PROCEDURE -Align2( adr: ADDRESS ): ADDRESS;
  853. CODE {SYSTEM.i386}
  854. MOV EAX, [ESP+adr] ;
  855. NEG EAX ;
  856. AND EAX, 1H ;
  857. ADD EAX, [ESP+adr] ;
  858. ADD ESP, 4
  859. END Align2;
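(* Editor's note: Align4 and Align2 round an address up to the next multiple of 4 resp. 2,
   i.e. they return adr + ((-adr) MOD 4) resp. adr + ((-adr) MOD 2). *)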
  860. PROCEDURE -ZeroR( adr: ADDRESS; count: SIZE );
  861. (** For 32 bit types *)
  862. CODE {SYSTEM.i386}
  863. MOV EDI, [ESP+adr] ; address OF dest index
  864. MOV ECX, [ESP+count] ; counter
  865. MOV EAX, 0 ; value
  866. CLD ; incremental
  867. REP ;
  868. STOSD ;
  869. ADD ESP, 8 ;
  870. END ZeroR;
  871. PROCEDURE -ZeroX( adr: ADDRESS; count: SIZE );
  872. (** For 64 bit types *)
  873. CODE {SYSTEM.i386}
  874. MOV EDI, [ESP+adr] ; address OF dest index
  875. MOV ECX, [ESP+count] ; counter
  876. SHL ECX, 1 ;
  877. MOV EAX, 0 ; value
  878. CLD ; incremental
  879. REP ;
  880. STOSD ;
  881. ADD ESP, 8 ;
  882. END ZeroX;
  883. PROCEDURE -ZeroRI( adr: SIZE; inc, count: SIZE );
  884. (** For 32 bit types *)
  885. CODE {SYSTEM.i386}
  886. MOV EDI, [ESP+adr] ; address OF dest index
  887. MOV EBX, [ESP+inc] ;
  888. MOV ECX, [ESP+count] ; counter
  889. CMP EBX, 4 ;
  890. JE fastzero ;
  891. MOV EAX, 0 ;
  892. loopL:
  893. CMP ECX, 0 ;
  894. JLE endL ;
  895. MOV [EDI], EAX ;
  896. ADD EDI, EBX ;
  897. DEC ECX ;
  898. JMP loopL ;
  899. fastzero:
  900. MOV EAX, 0 ; value
  901. CLD ; incremental
  902. REP ;
  903. STOSD ;
  904. endL:
  905. ADD ESP, 12 ;
  906. END ZeroRI;
  907. PROCEDURE -ZeroXI( adr: ADDRESS; inc, count: SIZE );
  908. (** For 64 bit types *)
  909. CODE {SYSTEM.i386}
  910. MOV EDI, [ESP+adr] ; address OF dest index
  911. MOV EBX, [ESP+inc] ;
  912. MOV ECX, [ESP+count] ; counter
  913. MOV EAX, 0 ;
  914. CMP EBX, 8 ;
  915. JE fastzero ;
  916. loopL:
  917. CMP ECX, 0 ;
  918. JLE endL ;
  919. MOV [EDI], EAX ;
  920. MOV [EDI+4], EAX ;
  921. ADD EDI, EBX ;
  922. DEC ECX ;
  923. JMP loopL ;
  924. fastzero:
  925. SHL ECX, 1 ;
  926. CLD ; incremental
  927. REP ;
  928. STOSD ;
  929. endL:
  930. ADD ESP, 12 ;
  931. END ZeroXI;
  932. PROCEDURE -MovR( from, to0, frominc, count: SIZE );
  933. CODE {SYSTEM.i386}
  934. MOV EDI, [ESP+to0] ; TO
  935. MOV ESI, [ESP+from] ; from
  936. MOV ECX, [ESP+count] ; count
  937. MOV EBX, [ESP+frominc] ; inc
  938. CMP EBX, 4 ;
  939. JE fastmove ;
  940. loopL:
  941. CMP ECX, 0 ;
  942. JLE endL ;
  943. MOV EAX, [ESI] ;
  944. MOV [EDI], EAX ;
  945. ADD ESI, EBX ;
  946. ADD EDI, 4 ;
  947. DEC ECX ;
  948. JMP loopL ;
  949. fastmove:
  950. CLD ; incremental
  951. REP ;
  952. MOVSD ; move rest IN 4 byte steps
  953. endL:
  954. ADD ESP, 16 ;
  955. END MovR;
  956. PROCEDURE -MovX( from, to0: ADDRESS; frominc, count:SIZE );
  957. CODE {SYSTEM.i386}
  958. MOV EDI, [ESP+to0] ; TO
  959. MOV ESI, [ESP+from] ; from
  960. MOV ECX, [ESP+count] ; count
  961. MOV EBX, [ESP+frominc] ; inc
  962. CMP EBX, 8 ;
  963. JE fastmove ;
  964. loopL:
  965. CMP ECX, 0 ;
  966. JLE endL ;
  967. MOV EAX, [ESI] ;
  968. MOV [EDI], EAX ;
  969. MOV EAX, [ESI+4] ;
  970. MOV [EDI+4], EAX ;
  971. ADD ESI, EBX ;
  972. ADD EDI, 8 ;
  973. DEC ECX ;
  974. JMP loopL ;
  975. fastmove:
  976. SHL ECX, 1 ;
  977. CLD ; incremental
  978. REP ;
  979. MOVSD ; move rest IN 4 byte steps
  980. endL:
  981. ADD ESP, 16 ;
  982. END MovX;
  983. PROCEDURE -MovR5( src: ADDRESS; inc, stride: SIZE; dest: ADDRESS; count: SIZE);
  984. CODE {SYSTEM.i386}
  985. MOV ESI, [ESP+src] ; src
  986. MOV EBX, [ESP+inc] ; inc
  987. MOV ECX, [ESP+stride] ; stride
  988. MOV EDI, [ESP+dest] ; dest
  989. loopL:
  990. MOV EAX, [ESP+count] ; count
  991. CMP EAX, 0 ;
  992. JLE endL ;
  993. SUB EAX, 4 ;
  994. MOV [ESP+count], EAX ;
  995. MOV EDX, ESI ;
  996. MOV EAX, [EDX] ;
  997. MOV [EDI], EAX ;
  998. ADD EDX, EBX ;
  999. MOV EAX, [EDX] ;
  1000. MOV [EDI+16], EAX ;
  1001. ADD EDX, EBX ;
  1002. MOV EAX, [EDX] ;
  1003. MOV [EDI+32], EAX ;
  1004. ADD EDX, EBX ;
  1005. MOV EAX, [EDX] ;
  1006. MOV [EDI+48], EAX ;
  1007. ADD EDX, EBX ;
  1008. MOV EAX, [EDX] ;
  1009. MOV [EDI+64], EAX ;
  1010. ADD ESI, ECX ;
  1011. ADD EDI, 4 ;
  1012. MOV EDX, ESI ;
  1013. MOV EAX, [EDX] ;
  1014. MOV [EDI], EAX ;
  1015. ADD EDX, EBX ;
  1016. MOV EAX, [EDX] ;
  1017. MOV [EDI+16], EAX ;
  1018. ADD EDX, EBX ;
  1019. MOV EAX, [EDX] ;
  1020. MOV [EDI+32], EAX ;
  1021. ADD EDX, EBX ;
  1022. MOV EAX, [EDX] ;
  1023. MOV [EDI+48], EAX ;
  1024. ADD EDX, EBX ;
  1025. MOV EAX, [EDX] ;
  1026. MOV [EDI+64], EAX ;
  1027. ADD ESI, ECX ;
  1028. ADD EDI, 4 ;
  1029. MOV EDX, ESI ;
  1030. MOV EAX, [EDX] ;
  1031. MOV [EDI], EAX ;
  1032. ADD EDX, EBX ;
  1033. MOV EAX, [EDX] ;
  1034. MOV [EDI+16], EAX ;
  1035. ADD EDX, EBX ;
  1036. MOV EAX, [EDX] ;
  1037. MOV [EDI+32], EAX ;
  1038. ADD EDX, EBX ;
  1039. MOV EAX, [EDX] ;
  1040. MOV [EDI+48], EAX ;
  1041. ADD EDX, EBX ;
  1042. MOV EAX, [EDX] ;
  1043. MOV [EDI+64], EAX ;
  1044. ADD ESI, ECX ;
  1045. ADD EDI, 4 ;
  1046. MOV EDX, ESI ;
  1047. MOV EAX, [EDX] ;
  1048. MOV [EDI], EAX ;
  1049. ADD EDX, EBX ;
  1050. MOV EAX, [EDX] ;
  1051. MOV [EDI+16], EAX ;
  1052. ADD EDX, EBX ;
  1053. MOV EAX, [EDX] ;
  1054. MOV [EDI+32], EAX ;
  1055. ADD EDX, EBX ;
  1056. MOV EAX, [EDX] ;
  1057. MOV [EDI+48], EAX ;
  1058. ADD EDX, EBX ;
  1059. MOV EAX, [EDX] ;
  1060. MOV [EDI+64], EAX ;
  1061. ADD ESI, ECX ;
  1062. ADD EDI, 4 ;
  1063. ADD EDI, 64 ;
  1064. JMP loopL ;
  1065. endL:
  1066. ADD ESP, 20 ;
  1067. END MovR5;
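(* Editor's note: MovR5 packs REALs into the interleaved layout read by the five-column SSE kernels:
   for each group of 4 consecutive source positions (stride bytes apart) it gathers 5 elements
   (inc bytes apart) per position and stores them so that the 4 values belonging to the same inc
   offset are contiguous, with the group for the next inc offset following 16 bytes later
   (80 bytes per 4 x 5 tile).  count counts the source positions and appears to be assumed to be a
   multiple of 4. *)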
  1068. (* *)
  1069. PROCEDURE AddAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1070. CODE {SYSTEM.i386, SYSTEM.FPU}
  1071. MOV EAX, [EBP+len] ;
  1072. MOV EBX, [EBP+ladr] ;
  1073. MOV ECX, [EBP+radr] ;
  1074. MOV EDX, [EBP+dadr] ;
  1075. start:
  1076. CMP EAX, 0 ;
  1077. JLE endL ;
  1078. FLD QWORD [EBX] ;
  1079. ADD EBX, [EBP+linc] ;
  1080. FLD QWORD [ECX] ;
  1081. ADD ECX, [EBP+rinc] ;
  1082. FADDP ;
  1083. FSTP QWORD [EDX] ;
  1084. ADD EDX, [EBP+dinc] ;
  1085. DEC EAX ;
  1086. JMP start ;
  1087. endL:
  1088. FWAIT ;
  1089. END AddAXAXLoopA;
  1090. PROCEDURE AddARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1091. CODE {SYSTEM.i386, SYSTEM.FPU}
  1092. MOV EAX, [EBP+len] ;
  1093. MOV EBX, [EBP+ladr] ;
  1094. MOV ECX, [EBP+radr] ;
  1095. MOV EDX, [EBP+dadr] ;
  1096. start:
  1097. CMP EAX, 0 ;
  1098. JLE endL ;
  1099. FLD DWORD [EBX] ;
  1100. ADD EBX, [EBP+linc] ;
  1101. FLD DWORD [ECX] ;
  1102. ADD ECX, [EBP+rinc] ;
  1103. FADDP ;
  1104. FSTP DWORD [EDX] ;
  1105. ADD EDX, [EBP+dinc] ;
  1106. DEC EAX ;
  1107. JMP start ;
  1108. endL:
  1109. FWAIT ;
  1110. END AddARARLoopA;
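(* Both FPU add loops above implement the same scalar recurrence (LONGREAL elements in AddAXAXLoopA, REAL in AddARARLoopA):
	WHILE len > 0 DO
		S.GET( ladr, x );  INC( ladr, linc );
		S.GET( radr, y );  INC( radr, rinc );
		S.PUT( dadr, x+y );  INC( dadr, dinc );  DEC( len )
	END
*)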
  1111. PROCEDURE AddAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1112. CODE {SYSTEM.i386, SYSTEM.SSE2}
  1113. MOV EAX, [EBP+len] ;
  1114. CMP EAX, 0 ;
  1115. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  1116. MOV EBX, [EBP+ladr] ;
  1117. MOV ECX, [EBP+radr] ;
  1118. MOV EDX, [EBP+dadr] ;
  1119. ; check IF data are contiguous IN memory
1120. CMP [EBP+linc], 8 ; check left FOR continuity
1121. JNE single ; not continuous- > simplest method
1122. CMP [EBP+rinc], 8 ; check right FOR continuity
1123. JNE single ; not continuous- > simplest method
1124. CMP [EBP+dinc], 8 ; check destination FOR continuity
  1125. JNE single ; not continuous- > simplest method
  1126. ; check FOR alignment
  1127. MOV ESI, EBX ;
  1128. AND ESI, 7 ; ladr MOD 8
  1129. CMP ESI, 0 ; = 0- > 64 Bit alignment
  1130. JNE unaligned ; not 64 bit aligned
  1131. MOV ESI, ECX ;
  1132. AND ESI, 7 ; radr MOD 8
  1133. CMP ESI, 0 ; = 0- > 64 Bit alignment
  1134. JNE unaligned ; not 64 bit aligned
  1135. MOV ESI, EDX ;
  1136. AND ESI, 7 ; dadr MOD 8
  1137. CMP ESI, 0 ; = 0- > 64 Bit alignment
  1138. JNE unaligned ; not 64 bit aligned
  1139. MOV ESI, EBX ;
  1140. AND ESI, 8 ; 16 byte alignment
  1141. MOV EDI, ECX ;
  1142. AND EDI, 8 ; 16 byte alignment
  1143. CMP ESI, EDI ;
  1144. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and radr
  1145. MOV EDI, EDX ;
  1146. AND EDI, 8 ; 16 byte alignment
  1147. CMP ESI, EDI ;
  1148. JNE unaligned ; different 16 byte = 128 bit alignment OF dadr and radr
  1149. CMP ESI, 8 ;
1150. JNE aligned ; ladr, radr and dadr already 128 bit aligned
1151. ; one single element processing TO achieve 128 bit alignment
  1152. MOVSD XMM1, [EBX] ;
  1153. MOVSD XMM0, [ECX] ;
  1154. ADDSD XMM0, XMM1 ;
  1155. MOVSD [EDX], XMM0 ;
  1156. ADD EBX, 8 ; now EBX IS 16 byte aligned
1157. ADD ECX, 8 ; now ECX IS 16 byte aligned ;
  1158. ADD EDX, 8 ; now EDX IS 16 byte aligned ;
  1159. DEC EAX ; one element has been processed
  1160. aligned:
  1161. aligned8:
  1162. CMP EAX, 8 ;
1163. JL aligned2 ; len < 8- > try 2 at a time
  1164. MOVAPD XMM0, [EBX] ;
  1165. MOVAPD XMM1, [EBX+16] ;
  1166. MOVAPD XMM2, [EBX+32] ;
  1167. MOVAPD XMM3, [EBX+48] ;
  1168. ADD EBX, 64 ;
  1169. MOVAPD XMM4, [ECX] ;
  1170. MOVAPD XMM5, [ECX+16] ;
  1171. MOVAPD XMM6, [ECX+32] ;
  1172. MOVAPD XMM7, [ECX+48] ;
  1173. ADD ECX, 64 ;
  1174. ADDPD XMM0, XMM4 ;
  1175. ADDPD XMM1, XMM5 ;
  1176. ADDPD XMM2, XMM6 ;
  1177. ADDPD XMM3, XMM7 ;
  1178. MOVAPD [EDX], XMM0 ;
  1179. MOVAPD [EDX+16], XMM1 ;
  1180. MOVAPD [EDX+32], XMM2 ;
  1181. MOVAPD [EDX+48], XMM3 ;
  1182. ADD EDX, 64 ;
  1183. SUB EAX, 8 ;
  1184. JMP aligned8 ;
  1185. ; LOOP FOR 2 pieces aligned
  1186. aligned2: ;
  1187. CMP EAX, 2 ;
  1188. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1189. MOVAPD XMM0, [EBX] ;
  1190. ADD EBX, 16 ;
  1191. MOVAPD XMM1, [ECX] ;
  1192. ADD ECX, 16 ;
  1193. ADDPD XMM0, XMM1 ;
  1194. MOVAPD [EDX], XMM0 ;
  1195. ADD EDX, 16 ;
  1196. SUB EAX, 2 ;
  1197. JMP aligned2 ;
  1198. ; LOOP FOR 8 unaligned pieces(14 pieces not better!)
  1199. unaligned: ;
  1200. unaligned8: ;
  1201. CMP EAX, 8 ;
1202. JL unaligned2 ; len < 8- > try 2 at a time
  1203. MOVUPD XMM0, [EBX] ;
  1204. MOVUPD XMM1, [EBX+16] ;
  1205. MOVUPD XMM2, [EBX+32] ;
  1206. MOVUPD XMM3, [EBX+48] ;
  1207. ADD EBX, 64 ;
  1208. MOVUPD XMM4, [ECX] ;
  1209. MOVUPD XMM5, [ECX+16] ;
  1210. MOVUPD XMM6, [ECX+32] ;
  1211. MOVUPD XMM7, [ECX+48] ;
  1212. ADD ECX, 64 ;
  1213. ADDPD XMM0, XMM4 ;
  1214. ADDPD XMM1, XMM5 ;
  1215. ADDPD XMM2, XMM6 ;
  1216. ADDPD XMM3, XMM7 ;
  1217. MOVUPD [EDX], XMM0 ;
  1218. MOVUPD [EDX+16], XMM1 ;
  1219. MOVUPD [EDX+32], XMM2 ;
  1220. MOVUPD [EDX+48], XMM3 ;
  1221. ADD EDX, 64 ;
  1222. SUB EAX, 8 ;
  1223. JMP unaligned8 ;
1224. ; LOOP FOR 2 pieces unaligned
  1225. unaligned2: ;
  1226. CMP EAX, 2 ;
  1227. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1228. MOVUPD XMM0, [EBX] ;
  1229. ADD EBX, 16 ;
  1230. MOVUPD XMM1, [ECX] ;
  1231. ADD ECX, 16 ;
  1232. ADDPD XMM0, XMM1 ;
  1233. MOVUPD [EDX], XMM0 ;
  1234. ADD EDX, 16 ;
  1235. SUB EAX, 2 ;
  1236. JMP unaligned2 ;
  1237. ; one piece left OR non-contiguous data
  1238. single:
  1239. singlepieces: ;
  1240. CMP EAX, 0 ;
  1241. JLE endL ; len <= 0- > EXIT
  1242. MOVSD XMM0, [EBX]
  1243. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  1244. MOVSD XMM1, [ECX]
1245. ADD ECX, [EBP+rinc] ; INC(radr, rinc)
  1246. ADDSD XMM0, XMM1 ;
  1247. MOVSD [EDX], XMM0
1248. ADD EDX, [EBP+dinc] ; INC(dadr, dinc)
  1249. DEC EAX ; DEC(len)
  1250. JMP singlepieces ;
  1251. endL:
  1252. END AddAXAXLoopSSE;
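(* Sketch OF the dispatch logic used by the SSE2 add loop above (reference only; the labels are the jump targets in the code):
	IF (linc # 8) OR (rinc # 8) OR (dinc # 8) THEN (* strided data: scalar 'singlepieces' loop *)
	ELSIF (ladr MOD 8 # 0) OR (radr MOD 8 # 0) OR (dadr MOD 8 # 0) THEN (* 'unaligned': MOVUPD blocks *)
	ELSIF (ladr MOD 16 # radr MOD 16) OR (ladr MOD 16 # dadr MOD 16) THEN (* 'unaligned': MOVUPD blocks *)
	ELSE (* equal 16 byte phase: at most one scalar step, THEN 'aligned' MOVAPD blocks OF 8 and 2 *)
	END
*)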
  1253. PROCEDURE AddARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1254. CODE {SYSTEM.i386, SYSTEM.SSE2}
  1255. MOV EAX, [EBP+len] ;
  1256. CMP EAX, 0 ;
  1257. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  1258. MOV EBX, [EBP+ladr] ;
  1259. MOV ECX, [EBP+radr] ;
  1260. MOV EDX, [EBP+dadr] ;
  1261. ; check IF data are contiguous IN memory
1262. CMP [EBP+linc], 4 ; check left FOR continuity
1263. JNE single ; not continuous- > simplest method
1264. CMP [EBP+rinc], 4 ; check right FOR continuity
1265. JNE single ; not continuous- > simplest method
1266. CMP [EBP+dinc], 4 ; check destination FOR continuity
  1267. JNE single ; not continuous- > simplest method
  1268. ; check FOR alignment
  1269. MOV ESI, EBX ;
  1270. AND ESI, 3 ; ladr MOD 4
  1271. CMP ESI, 0 ; = 0- > 32 Bit alignment
  1272. JNE unaligned ; not 32 bit aligned
  1273. MOV ESI, ECX ;
  1274. AND ESI, 3 ; radr MOD 4
  1275. CMP ESI, 0 ; = 0- > 32 Bit alignment
  1276. JNE unaligned ; not 32 bit aligned
  1277. MOV ESI, EDX ;
  1278. AND ESI, 3 ; dadr MOD 4
  1279. CMP ESI, 0 ; = 0- > 32 Bit alignment
  1280. JNE unaligned ; not 32 bit aligned
  1281. MOV ESI, EBX ;
  1282. AND ESI, 8+4 ; 16 byte alignment?
  1283. MOV EDI, ECX ;
  1284. AND EDI, 8+4 ; 16 byte alignment?
  1285. CMP ESI, EDI ;
  1286. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and radr
  1287. MOV EDI, EDX ;
  1288. AND EDI, 8+4 ; 16 byte alignment
  1289. CMP ESI, EDI ;
  1290. JNE unaligned ; different 16 byte = 128 bit alignment OF dadr and radr
  1291. CMP ESI, 0 ;
  1292. JE aligned ; already aligned
  1293. align:
1294. ; one single element processing UNTIL 128 bit alignment achieved
  1295. MOVSS XMM1, [EBX] ;
  1296. MOVSS XMM0, [ECX] ;
  1297. ADDSS XMM0, XMM1 ;
  1298. MOVSS [EDX], XMM0 ;
  1299. ADD EBX, 4 ;
  1300. ADD ECX, 4 ;
  1301. ADD EDX, 4 ;
  1302. DEC EAX ; one element has been processed ;
  1303. CMP EAX, 0 ; all elements already processed?
  1304. JLE single ;
  1305. MOV ESI, EBX ;
  1306. AND ESI, 8+4 ;
  1307. CMP ESI, 0 ;
  1308. JNE align ;
  1309. aligned:
  1310. aligned16:
  1311. CMP EAX, 16 ;
1312. JL aligned4 ; len < 16- > try 4 at a time
  1313. MOVAPS XMM0, [EBX] ;
  1314. MOVAPS XMM1, [EBX+16] ;
  1315. MOVAPS XMM2, [EBX+32] ;
  1316. MOVAPS XMM3, [EBX+48] ;
  1317. ADD EBX, 64 ;
  1318. MOVAPS XMM4, [ECX] ;
  1319. MOVAPS XMM5, [ECX+16] ;
  1320. MOVAPS XMM6, [ECX+32] ;
  1321. MOVAPS XMM7, [ECX+48] ;
  1322. ADD ECX, 64 ;
  1323. ADDPS XMM0, XMM4 ;
  1324. ADDPS XMM1, XMM5 ;
  1325. ADDPS XMM2, XMM6 ;
  1326. ADDPS XMM3, XMM7 ;
  1327. MOVAPS [EDX], XMM0 ;
  1328. MOVAPS [EDX+16], XMM1 ;
  1329. MOVAPS [EDX+32], XMM2 ;
  1330. MOVAPS [EDX+48], XMM3 ;
  1331. ADD EDX, 64 ;
  1332. SUB EAX, 16 ;
  1333. JMP aligned16 ;
1334. ; LOOP FOR 4 pieces aligned
  1335. aligned4: ;
  1336. CMP EAX, 4 ;
1337. JL singlepieces ; len < 4- > EXIT TO singlepieces
  1338. MOVAPS XMM0, [EBX] ;
  1339. ADD EBX, 16 ;
  1340. MOVAPS XMM1, [ECX] ;
  1341. ADD ECX, 16 ;
  1342. ADDPS XMM0, XMM1 ;
  1343. MOVAPS [EDX], XMM0 ;
  1344. ADD EDX, 16 ;
  1345. SUB EAX, 4 ;
  1346. JMP aligned4 ;
1347. ; LOOP FOR 16 unaligned pieces
  1348. unaligned: ;
  1349. unaligned16: ;
  1350. CMP EAX, 16 ;
1351. JL unaligned4 ; len < 16- > try 4 at a time
  1352. MOVUPS XMM0, [EBX] ;
  1353. MOVUPS XMM1, [EBX+16] ;
  1354. MOVUPS XMM2, [EBX+32] ;
  1355. MOVUPS XMM3, [EBX+48] ;
  1356. ADD EBX, 64 ;
  1357. MOVUPS XMM4, [ECX] ;
  1358. MOVUPS XMM5, [ECX+16] ;
  1359. MOVUPS XMM6, [ECX+32] ;
  1360. MOVUPS XMM7, [ECX+48] ;
  1361. ADD ECX, 64 ;
  1362. ADDPS XMM0, XMM4 ;
  1363. ADDPS XMM1, XMM5 ;
  1364. ADDPS XMM2, XMM6 ;
  1365. ADDPS XMM3, XMM7 ;
  1366. MOVUPS [EDX], XMM0 ;
  1367. MOVUPS [EDX+16], XMM1 ;
  1368. MOVUPS [EDX+32], XMM2 ;
  1369. MOVUPS [EDX+48], XMM3 ;
  1370. ADD EDX, 64 ;
  1371. SUB EAX, 16 ;
  1372. JMP unaligned16 ;
1373. ; LOOP FOR 4 pieces unaligned
  1374. unaligned4: ;
  1375. CMP EAX, 4 ;
1376. JL singlepieces ; len < 4- > EXIT TO singlepieces
  1377. MOVUPS XMM0, [EBX] ;
  1378. ADD EBX, 16 ;
  1379. MOVUPS XMM1, [ECX] ;
  1380. ADD ECX, 16 ;
  1381. ADDPS XMM0, XMM1 ;
  1382. MOVUPS [EDX], XMM0 ;
  1383. ADD EDX, 16 ;
  1384. SUB EAX, 4 ;
  1385. JMP unaligned4 ;
  1386. ; one piece left OR non-contiguous data
  1387. single:
  1388. singlepieces: ;
  1389. CMP EAX, 0 ;
  1390. JLE endL ; len <= 0- > EXIT
  1391. MOVSS XMM0, [EBX]
  1392. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  1393. MOVSS XMM1, [ECX]
1394. ADD ECX, [EBP+rinc] ; INC(radr, rinc)
  1395. ADDSS XMM0, XMM1 ;
  1396. MOVSS [EDX], XMM0
1397. ADD EDX, [EBP+dinc] ; INC(dadr, dinc)
  1398. DEC EAX ; DEC(len)
  1399. JMP singlepieces ;
  1400. endL:
  1401. END AddARARLoopSSE;
  1402. PROCEDURE SPAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  1403. CODE {SYSTEM.i386, SYSTEM.FPU}
  1404. MOV EAX, [EBP+len] ; eax := len
  1405. MOV EBX, [EBP+ladr] ; ebx := ladr
  1406. MOV ECX, [EBP+radr] ; ecx := radr
  1407. MOV EDX, [EBP+dadr] ; edx := dadr
  1408. FLD QWORD [EDX] ; S.GET(dadr, x)
  1409. start:
  1410. CMP EAX, 0 ; WHILE len > 0 DO
  1411. JLE endL
  1412. FLD QWORD [EBX] ; S.GET(ladr, x)
  1413. ADD EBX, [EBP+linc] ; INC(ladr, incl)
1414. FLD QWORD [ECX] ; S.GET(radr, y)
  1415. FMULP ; x := x*y
  1416. ADD ECX, [EBP+rinc] ; INC(radr, incr)
  1417. FADDP ; z := z+x
  1418. DEC EAX ; DEC(len)
  1419. JMP start ;
  1420. endL:
  1421. FSTP QWORD [EDX] ; S.PUT(dadr, x)
  1422. FWAIT ;
  1423. END SPAXAXLoopA;
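(* Reference sketch OF the FPU scalar product loops (this one and SPARARLoopA below): the result is accumulated onto the value already stored at dadr:
	S.GET( dadr, z );
	WHILE len > 0 DO
		S.GET( ladr, x );  INC( ladr, linc );
		S.GET( radr, y );  INC( radr, rinc );
		z := z + x*y;  DEC( len )
	END;
	S.PUT( dadr, z )
*)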
  1424. PROCEDURE SPARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  1425. CODE {SYSTEM.i386, SYSTEM.FPU}
  1426. MOV EAX, [EBP+len] ; eax := len
  1427. MOV EBX, [EBP+ladr] ; ebx := ladr
  1428. MOV ECX, [EBP+radr] ; ecx := radr
  1429. MOV EDX, [EBP+dadr] ; edx := dadr
  1430. FLD DWORD [EDX] ; S.GET(dadr, x)
  1431. start:
  1432. CMP EAX, 0 ; WHILE len > 0 DO
  1433. JLE endL
  1434. FLD DWORD [EBX] ; S.GET(ladr, x)
  1435. ADD EBX, [EBP+linc] ; INC(ladr, incl)
1436. FLD DWORD [ECX] ; S.GET(radr, y)
  1437. FMULP ; x := x*y
  1438. ADD ECX, [EBP+rinc] ; INC(radr, incr)
  1439. FADDP ; z := z+x
  1440. DEC EAX ; DEC(len)
  1441. JMP start ;
  1442. endL:
  1443. FSTP DWORD [EDX] ; S.PUT(dadr, x)
  1444. FWAIT ;
  1445. END SPARARLoopA;
  1446. (* sse version of scalar product *)
  1447. PROCEDURE SPAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  1448. CODE {SYSTEM.i386, SYSTEM.SSE2}
  1449. ; register initialization
1450. MOV EAX, [EBP+len] ; EAX reserved FOR length
  1451. CMP EAX, 0 ;
  1452. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  1453. MOV EBX, [EBP+ladr] ; EBX reserved FOR ladr
  1454. MOV ECX, [EBP+radr] ; ECX reserved FOR radr
  1455. MOV EDX, [EBP+dadr] ; EDX reserved FOR dadr
  1456. XORPD XMM0, XMM0 ;
  1457. MOVSD XMM0, [EDX] ; destination- > low bytes OF xmm0
1458. CMP [EBP+linc], 8 ; check left FOR continuity
1459. JNE single ; not continuous- > simplest method
1460. CMP [EBP+rinc], 8 ; check right FOR continuity
  1461. JNE single ; not continuous- > simplest method
  1462. ; check FOR alignment
  1463. MOV ESI, EBX ;
  1464. AND ESI, 7 ; ladr MOD 8
  1465. CMP ESI, 0 ; ECX = 0- > 64 Bit alignment
  1466. JNE unaligned ; not 64 bit aligned
  1467. MOV ESI, ECX ;
  1468. AND ESI, 7 ; radr MOD 8
  1469. CMP ESI, 0 ; = 0- > 64 Bit alignment
  1470. JNE unaligned ; not 64 bit aligned
  1471. MOV ESI, EBX ;
  1472. AND ESI, 8 ; 16 byte alignment
  1473. MOV EDI, ECX ;
  1474. AND EDI, 8 ; 16 byte alignment
  1475. CMP ESI, EDI ;
1476. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and radr
1477. CMP ESI, 8 ;
1478. JNE aligned ; ladr and radr already 128 bit aligned
1479. ; one single element processing TO achieve 128 bit alignment
  1480. MOVSD XMM1, [EBX] ;
  1481. MOVSD XMM2, [ECX] ;
  1482. MULSD XMM1, XMM2 ;
  1483. ADDSD XMM0, XMM1 ;
  1484. ADD EBX, 8 ; now EBX IS 16 byte aligned
1485. ADD ECX, 8 ; now ECX IS 16 byte aligned ;
  1486. DEC EAX ; one element has been processed
1487. ; LOOP FOR 6 pieces aligned
  1488. aligned:
  1489. aligned6:
  1490. CMP EAX, 6 ;
1491. JL aligned2 ; len < 6- > try 2 at a time
  1492. MOVAPD XMM1, [EBX] ;
  1493. MOVAPD XMM2, [EBX+16] ;
  1494. MOVAPD XMM3, [EBX+32] ;
  1495. MOVAPD XMM4, [ECX] ;
  1496. MOVAPD XMM5, [ECX+16] ;
  1497. MOVAPD XMM6, [ECX+32] ;
  1498. MULPD XMM1, XMM4 ;
  1499. ADDPD XMM0, XMM1 ;
  1500. MULPD XMM2, XMM5 ;
  1501. ADDPD XMM0, XMM2 ;
  1502. MULPD XMM3, XMM6 ;
  1503. ADDPD XMM0, XMM3 ;
  1504. ADD EBX, 48 ;
  1505. ADD ECX, 48 ;
  1506. SUB EAX, 6 ;
  1507. JMP aligned6 ;
  1508. ; LOOP FOR 2 pieces aligned
  1509. aligned2:
  1510. CMP EAX, 2 ;
  1511. JL horizontaladd ; ; len < 4- > EXIT TO singlepieces
  1512. MOVAPD XMM1, [EBX] ;
  1513. MOVAPD XMM2, [ECX] ;
  1514. MULPD XMM1, XMM2 ;
  1515. ADDPD XMM0, XMM1 ;
  1516. ADD EBX, 16 ;
  1517. ADD ECX, 16 ;
  1518. SUB EAX, 2 ;
  1519. JMP aligned2 ;
  1520. unaligned:
  1521. unaligned6:
  1522. CMP EAX, 6 ;
1523. JL unaligned2 ; len < 6- > try 2 at a time
  1524. MOVUPD XMM1, [EBX] ;
  1525. MOVUPD XMM2, [EBX+16] ;
  1526. MOVUPD XMM3, [EBX+32] ;
  1527. MOVUPD XMM4, [ECX] ;
  1528. MOVUPD XMM5, [ECX+16] ;
  1529. MOVUPD XMM6, [ECX+32] ;
  1530. MULPD XMM1, XMM4 ;
  1531. ADDPD XMM0, XMM1 ;
  1532. MULPD XMM2, XMM5 ;
  1533. ADDPD XMM0, XMM2 ;
  1534. MULPD XMM3, XMM6 ;
  1535. ADDPD XMM0, XMM3 ;
  1536. ADD EBX, 48 ;
  1537. ADD ECX, 48 ;
  1538. SUB EAX, 6 ;
  1539. JMP unaligned6 ;
  1540. ; LOOP FOR 2 pieces aligned
  1541. unaligned2:
  1542. CMP EAX, 2 ;
  1543. JL horizontaladd ; ; len < 4- > EXIT TO singlepieces
  1544. MOVUPD XMM1, [EBX] ;
  1545. MOVUPD XMM2, [ECX] ;
  1546. MULPD XMM1, XMM2 ;
  1547. ADDPD XMM0, XMM1 ;
  1548. ADD EBX, 16 ;
  1549. ADD ECX, 16 ;
  1550. SUB EAX, 2 ;
  1551. JMP unaligned2 ;
  1552. horizontaladd: ;
  1553. MOVAPD XMM1, XMM0 ;
  1554. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  1555. ADDPD XMM0, XMM1 ;
  1556. JMP singlepieces ;
  1557. single:
  1558. singlepieces: ;
  1559. CMP EAX, 0 ;
  1560. JLE store ; len <= 0- > EXIT
  1561. MOVSD XMM1, [EBX]
  1562. MOVSD XMM2, [ECX]
  1563. MULSD XMM1, XMM2
  1564. ADDSD XMM0, XMM1
  1565. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  1566. ADD ECX, [EBP+rinc] ; INC(radr, incr)
  1567. DEC EAX ; DEC(len)
  1568. JMP singlepieces ;
  1569. store:
  1570. MOVSD [EDX], XMM0 ;
  1571. endL:
  1572. END SPAXAXLoopSSE;
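(* Worked trace OF the 'horizontaladd' step above: with the two partial sums XMM0 = (s0, s1),
	MOVAPD XMM1, XMM0;  SHUFPD XMM1, XMM1, 1	->  XMM1 = (s1, s0)
	ADDPD  XMM0, XMM1							->  XMM0 = (s0+s1, s0+s1)
   so the low half already holds the packed part OF the dot product before the remaining single elements are added in 'singlepieces'. *)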
  1573. (* sse version of scalar product *)
  1574. PROCEDURE SPARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  1575. CODE {SYSTEM.i386, SYSTEM.SSE}
  1576. ; register initialization
1577. MOV EAX, [EBP+len] ; EAX reserved FOR length
  1578. CMP EAX, 0 ;
  1579. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  1580. MOV EBX, [EBP+ladr] ; EBX reserved FOR ladr
  1581. MOV ECX, [EBP+radr] ; ECX reserved FOR radr
  1582. MOV EDX, [EBP+dadr] ; EDX reserved FOR dadr
  1583. XORPS XMM0, XMM0 ;
  1584. MOVSS XMM0, [EDX] ; destination- > low bytes OF xmm0
1585. CMP [EBP+linc], 4 ; check left FOR continuity
1586. JNE single ; not continuous- > simplest method
1587. CMP [EBP+rinc], 4 ; check right FOR continuity
  1588. JNE single ; not continuous- > simplest method
  1589. ; check FOR alignment
  1590. MOV ESI, EBX ;
  1591. AND ESI, 3 ; ladr MOD 4
  1592. CMP ESI, 0 ; ECX = 0- > 32 Bit alignment
  1593. JNE unaligned ; not 32 bit aligned
  1594. MOV ESI, ECX ;
  1595. AND ESI, 3 ; radr MOD 4
  1596. CMP ESI, 0 ; = 0- > 32 Bit alignment
  1597. JNE unaligned ; not 32 bit aligned
  1598. MOV ESI, EBX ;
  1599. AND ESI, 8+4 ; 16 byte alignment
  1600. MOV EDI, ECX ;
  1601. AND EDI, 8+4 ; 16 byte alignment
  1602. CMP ESI, EDI ;
1603. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and radr
  1604. CMP ESI, 0 ;
  1605. JE aligned ; already aligned
  1606. align:
1607. ; one single element processing UNTIL 128 bit alignment achieved
  1608. MOVSS XMM1, [EBX] ;
  1609. MOVSS XMM2, [ECX] ;
  1610. MULSS XMM1, XMM2 ;
  1611. ADDSS XMM0, XMM1 ;
  1612. ADD EBX, 4 ;
  1613. ADD ECX, 4 ;
  1614. DEC EAX ; one element has been processed ;
  1615. CMP EAX, 0 ; all elements already processed?
  1616. JLE single ;
  1617. MOV ESI, EBX ;
  1618. AND ESI, 8+4 ;
  1619. CMP ESI, 0 ;
  1620. JNE align ;
  1621. aligned:
  1622. aligned12:
  1623. CMP EAX, 12 ;
  1624. JL aligned4 ; len < 4- > EXIT TO singlepieces
  1625. MOVAPS XMM1, [EBX] ;
  1626. MOVAPS XMM2, [EBX+16] ;
  1627. MOVAPS XMM3, [EBX+32] ;
  1628. MOVAPS XMM4, [ECX] ;
  1629. MOVAPS XMM5, [ECX+16] ;
  1630. MOVAPS XMM6, [ECX+32] ;
  1631. MULPS XMM1, XMM4 ;
  1632. ADDPS XMM0, XMM1 ;
  1633. MULPS XMM2, XMM5 ;
  1634. ADDPS XMM0, XMM2 ;
  1635. MULPS XMM3, XMM6 ;
  1636. ADDPS XMM0, XMM3 ;
  1637. ADD EBX, 48 ;
  1638. ADD ECX, 48 ;
  1639. SUB EAX, 12 ;
  1640. JMP aligned12 ;
  1641. ; LOOP FOR 2 pieces aligned
  1642. aligned4:
  1643. CMP EAX, 4 ;
  1644. JL horizontaladd ; ; len < 4- > EXIT TO singlepieces
  1645. MOVAPS XMM1, [EBX] ;
  1646. MOVAPS XMM2, [ECX] ;
  1647. MULPS XMM1, XMM2 ;
  1648. ADDPS XMM0, XMM1 ;
  1649. ADD EBX, 16 ;
  1650. ADD ECX, 16 ;
  1651. SUB EAX, 4 ;
  1652. JMP aligned4 ;
  1653. unaligned:
  1654. unaligned12:
  1655. CMP EAX, 12 ;
  1656. JL unaligned4 ; len < 4- > EXIT TO singlepieces
  1657. MOVUPS XMM1, [EBX] ;
  1658. MOVUPS XMM2, [EBX+16] ;
  1659. MOVUPS XMM3, [EBX+32] ;
  1660. MOVUPS XMM4, [ECX] ;
  1661. MOVUPS XMM5, [ECX+16] ;
  1662. MOVUPS XMM6, [ECX+32] ;
  1663. MULPS XMM1, XMM4 ;
  1664. ADDPS XMM0, XMM1 ;
  1665. MULPS XMM2, XMM5 ;
  1666. ADDPS XMM0, XMM2 ;
  1667. MULPS XMM3, XMM6 ;
  1668. ADDPS XMM0, XMM3 ;
  1669. ADD EBX, 48 ;
  1670. ADD ECX, 48 ;
  1671. SUB EAX, 12 ;
  1672. JMP unaligned12 ;
  1673. ; LOOP FOR 2 pieces aligned
  1674. unaligned4:
  1675. CMP EAX, 4 ;
  1676. JL horizontaladd ; ; len < 4- > EXIT TO singlepieces
  1677. MOVUPS XMM1, [EBX] ;
  1678. MOVUPS XMM2, [ECX] ;
  1679. MULPS XMM1, XMM2 ;
  1680. ADDPS XMM0, XMM1 ;
  1681. ADD EBX, 16 ;
  1682. ADD ECX, 16 ;
  1683. SUB EAX, 4 ;
  1684. JMP unaligned4 ;
  1685. horizontaladd: ;
  1686. MOVAPS XMM1, XMM0 ;
  1687. ; 1*0 (* dest 0 -> dest 0 *) +4*1 (* dest 1 -> dest 1 *) +16*0 (* src 0 -> dest 2 *) +64*1 (* src 1 -> dest 3 *)
  1688. SHUFPS XMM1, XMM1, 1*0 +4*1 +16*0 +64*1
  1689. ADDPS XMM1, XMM0 ;
  1690. MOVAPS XMM0, XMM1
  1691. SHUFPS XMM0, XMM0, 16*3 ; src 3- > dest 2
  1692. ADDPS XMM0, XMM1 ;
  1693. SHUFPS XMM0, XMM0, 1*2 ; dest 2- > dest 0
  1694. JMP singlepieces ;
  1695. single:
  1696. singlepieces: ;
  1697. CMP EAX, 0 ;
  1698. JLE store ; len <= 0- > EXIT
  1699. MOVSS XMM1, [EBX]
  1700. MOVSS XMM2, [ECX]
  1701. MULSS XMM1, XMM2
  1702. ADDSS XMM0, XMM1
  1703. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  1704. ADD ECX, [EBP+rinc] ; INC(radr, incr)
  1705. DEC EAX ; DEC(len)
  1706. JMP singlepieces ;
  1707. store:
  1708. MOVSS [EDX], XMM0 ;
  1709. endL:
  1710. END SPARARLoopSSE;
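(* Worked trace OF the single precision 'horizontaladd' above (each SHUFPS immediate encodes two bits per destination lane): with XMM0 = (s0, s1, s2, s3),
	MOVAPS XMM1, XMM0;  SHUFPS XMM1, XMM1, 68	->  XMM1 = (s0, s1, s0, s1)
	ADDPS  XMM1, XMM0							->  XMM1 = (2*s0, 2*s1, s0+s2, s1+s3)
	MOVAPS XMM0, XMM1;  SHUFPS XMM0, XMM0, 48	->  XMM0 = (2*s0, 2*s0, s1+s3, 2*s0)
	ADDPS  XMM0, XMM1							->  lane 2 = s0+s1+s2+s3
	SHUFPS XMM0, XMM0, 2						->  lane 0 = s0+s1+s2+s3, ready FOR the MOVSS store *)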
  1711. PROCEDURE MulAXSXLoopA( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  1712. CODE {SYSTEM.i386, SYSTEM.FPU}
  1713. MOV EAX, [EBP+len] ; eax := len
  1714. MOV EBX, [EBP+ladr] ; ebx := ladr
  1715. MOV ECX, [EBP+radr] ; ecx := radr
  1716. MOV EDX, [EBP+dadr] ; edx := dadr
  1717. start:
  1718. CMP EAX, 0 ; WHILE len > 0 DO
  1719. JLE endL
  1720. FLD QWORD [EBX] ; S.GET(ladr, x)
  1721. ADD EBX, [EBP+linc] ; INC(ladr, incl)
1722. FLD QWORD [ECX] ; S.GET(radr, y)
  1723. FMULP ; x := x*y
  1724. FSTP QWORD [EDX]
1725. ADD EDX, [EBP+dinc] ; INC(dadr, dinc)
  1726. DEC EAX ; DEC(len)
  1727. JMP start ;
  1728. endL:
  1729. FWAIT ;
  1730. END MulAXSXLoopA;
  1731. PROCEDURE MulARSRLoopA( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  1732. CODE {SYSTEM.i386, SYSTEM.FPU}
  1733. MOV EAX, [EBP+len] ; eax := len
  1734. MOV EBX, [EBP+ladr] ; ebx := ladr
  1735. MOV ECX, [EBP+radr] ; ecx := radr
  1736. MOV EDX, [EBP+dadr] ; edx := dadr
  1737. start:
  1738. CMP EAX, 0 ; WHILE len > 0 DO
  1739. JLE endL
  1740. FLD DWORD [EBX] ; S.GET(ladr, x)
  1741. ADD EBX, [EBP+linc] ; INC(ladr, incl)
1742. FLD DWORD [ECX] ; S.GET(radr, y)
  1743. FMULP ; x := x*y
  1744. FSTP DWORD [EDX]
1745. ADD EDX, [EBP+dinc] ; INC(dadr, dinc)
  1746. DEC EAX ; DEC(len)
  1747. JMP start ;
  1748. endL:
  1749. FWAIT ;
  1750. END MulARSRLoopA;
  1751. PROCEDURE IncMulAXSXLoopA( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  1752. CODE {SYSTEM.i386, SYSTEM.FPU}
  1753. MOV EAX, [EBP+len] ; eax := len
  1754. MOV EBX, [EBP+ladr] ; ebx := ladr
  1755. MOV ECX, [EBP+radr] ; ecx := radr
  1756. MOV EDX, [EBP+dadr] ; edx := dadr
  1757. start:
  1758. CMP EAX, 0 ; WHILE len > 0 DO
  1759. JLE endL
  1760. FLD QWORD [EBX] ; S.GET(ladr, x)
  1761. ADD EBX, [EBP+linc] ; INC(ladr, incl)
1762. FLD QWORD [ECX] ; S.GET(radr, y)
1763. FMULP ; x := x*y
1764. FLD QWORD [EDX] ; S.GET(dadr, z)
1765. FADDP ; x := x+z
1766. FSTP QWORD [EDX] ; S.PUT(dadr, x)
1767. ADD EDX, [EBP+dinc] ; INC(dadr, dinc)
  1768. DEC EAX ; DEC(len)
  1769. JMP start ;
  1770. endL:
  1771. FWAIT ;
  1772. END IncMulAXSXLoopA;
  1773. PROCEDURE IncMulARSRLoopA( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  1774. CODE {SYSTEM.i386, SYSTEM.FPU}
  1775. MOV EAX, [EBP+len] ; eax := len
  1776. MOV EBX, [EBP+ladr] ; ebx := ladr
  1777. MOV ECX, [EBP+radr] ; ecx := radr
  1778. MOV EDX, [EBP+dadr] ; edx := dadr
  1779. start:
  1780. CMP EAX, 0 ; WHILE len > 0 DO
  1781. JLE endL
  1782. FLD DWORD [EBX] ; S.GET(ladr, x)
  1783. ADD EBX, [EBP+linc] ; INC(ladr, incl)
1784. FLD DWORD [ECX] ; S.GET(radr, y)
1785. FMULP ; x := x*y
1786. FLD DWORD [EDX] ; S.GET(dadr, z)
1787. FADDP ; x := x+z
1788. FSTP DWORD [EDX] ; S.PUT(dadr, x)
1789. ADD EDX, [EBP+dinc] ; INC(dadr, dinc)
  1790. DEC EAX ; DEC(len)
  1791. JMP start ;
  1792. endL:
  1793. FWAIT ;
  1794. END IncMulARSRLoopA;
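(* Intended scalar recurrence OF the two IncMul FPU loops above (LONGREAL resp. REAL elements), matching the SSE variants further below; radr points TO a fixed scalar factor:
	S.GET( radr, y );
	WHILE len > 0 DO
		S.GET( ladr, x );  INC( ladr, linc );
		S.GET( dadr, z );  S.PUT( dadr, z + x*y );  INC( dadr, dinc );  DEC( len )
	END
*)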
  1795. PROCEDURE MulAXSXLoopSSE( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  1796. (* simple version, does not yet check for alignment, no parallel executions yet: use full 8 registers! *)
  1797. (*
  1798. 1.) check for same alignment of ladr and dadr (ladr MOD 128 = dadr MOD 128)
  1799. 2.) process starting unaligned data ( using single instructions)
  1800. 3.) process aligned data
  1801. 4.) process remaining unaligned data (using single instructions)
  1802. *)
  1803. CODE {SYSTEM.i386, SYSTEM.SSE2}
  1804. ; register initialization
1805. MOV EAX, [EBP+len] ; EAX reserved FOR length
  1806. CMP EAX, 0 ;
  1807. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  1808. MOV EBX, [EBP+ladr] ; EBX reserved FOR ladr
  1809. MOV EDX, [EBP+dadr] ; EDX reserved FOR dadr
  1810. MOV ECX, [EBP+radr] ;
  1811. MOVSD XMM0, [ECX] ;
  1812. SHUFPD XMM0, XMM0, 0 ; high bits := low bits
  1813. ; check IF data are contiguous IN memory
1814. CMP [EBP+linc], 8 ; check left FOR continuity
  1815. JNE single ; not continuous- > simplest method
  1816. CMP [EBP+dinc], 8 ; check dest FOR continuity
  1817. JNE single ; not continuous- > simplest method
  1818. ; check FOR alignment
  1819. MOV ECX, EBX ;
  1820. AND ECX, 7 ; ladr MOD 8
  1821. CMP ECX, 0 ; ECX = 0- > 64 Bit alignment
  1822. JNE unaligned ; not 64 bit aligned
  1823. MOV ECX, EDX ;
  1824. AND ECX, 7 ; dadr MOD 8
  1825. CMP ECX, 0 ; ECX = 0- > 64 Bit alignment
  1826. JNE unaligned ; not 64 bit aligned
  1827. MOV ESI, EBX ;
  1828. AND ESI, 8 ; 16 byte alignment
  1829. MOV EDI, EDX ;
  1830. AND EDI, 8 ; 16 byte alignment
  1831. CMP ESI, EDI ;
  1832. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  1833. CMP ESI, 8 ;
  1834. JNE aligned ; ladr and dadr already 128 bit aligned
1835. ; one single element processing TO achieve 128 bit alignment
  1836. MOVSD XMM1, [EBX] ;
  1837. MULSD XMM1, XMM0 ;
  1838. MOVSD [EDX], XMM1 ;
  1839. ADD EBX, 8 ; now EBX IS 16 byte aligned
  1840. ADD EDX, 8 ; now EDX IS 16 byte aligned ;
  1841. DEC EAX ; one element has been processed
  1842. ; LOOP FOR 8 pieces aligned(no better performance with 14 pieces!)
  1843. aligned:
  1844. aligned8:
  1845. CMP EAX, 8 ;
  1846. JL aligned2 ; len < 4- > EXIT TO singlepieces
  1847. MOVAPD XMM1, [EBX] ;
  1848. MOVAPD XMM2, [EBX+16] ;
  1849. MOVAPD XMM3, [EBX+32] ;
  1850. MOVAPD XMM4, [EBX+48] ;
  1851. ADD EBX, 64 ;
  1852. MULPD XMM1, XMM0 ;
  1853. MULPD XMM2, XMM0 ;
  1854. MULPD XMM3, XMM0 ;
  1855. MULPD XMM4, XMM0 ;
  1856. MOVAPD [EDX], XMM1 ;
  1857. MOVAPD [EDX+16], XMM2 ;
  1858. MOVAPD [EDX+32], XMM3 ;
  1859. MOVAPD [EDX+48], XMM4 ;
  1860. ADD EDX, 64 ;
  1861. SUB EAX, 8 ;
  1862. JMP aligned8 ;
  1863. ; LOOP FOR 2 pieces aligned
  1864. aligned2: ;
  1865. CMP EAX, 2 ;
  1866. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1867. MOVAPD XMM1, [EBX] ;
  1868. ADD EBX, 16 ;
  1869. MULPD XMM1, XMM0 ;
  1870. MOVAPD [EDX], XMM1 ;
  1871. ADD EDX, 16 ;
  1872. SUB EAX, 2 ;
  1873. JMP aligned2 ;
  1874. ; LOOP FOR 8 unaligned pieces(14 pieces not better!)
  1875. unaligned: ;
  1876. unaligned8: ;
  1877. CMP EAX, 8 ;
1878. JL unaligned2 ; len < 8- > try 2 at a time
  1879. MOVUPD XMM1, [EBX] ;
  1880. MOVUPD XMM2, [EBX+16] ;
  1881. MOVUPD XMM3, [EBX+32] ;
  1882. MOVUPD XMM4, [EBX+48] ;
  1883. ADD EBX, 64
  1884. MULPD XMM1, XMM0 ;
  1885. MULPD XMM2, XMM0 ;
  1886. MULPD XMM3, XMM0 ;
  1887. MULPD XMM4, XMM0 ;
  1888. MOVUPD [EDX], XMM1 ;
  1889. MOVUPD [EDX+16], XMM2 ;
  1890. MOVUPD [EDX+32], XMM3 ;
  1891. MOVUPD [EDX+48], XMM4 ;
  1892. ADD EDX, 64 ;
  1893. SUB EAX, 8 ;
  1894. JMP unaligned8 ;
  1895. ; LOOP FOR 2 pieces unaligned
  1896. unaligned2: ;
  1897. CMP EAX, 2 ;
  1898. JL singlepieces ; len < 2- > EXIT
  1899. MOVUPD XMM1, [EBX] ;
  1900. ADD EBX, 16 ;
  1901. MULPD XMM1, XMM0 ;
  1902. MOVUPD [EDX], XMM1 ;
  1903. ADD EDX, 16 ;
  1904. SUB EAX, 2 ;
  1905. JMP unaligned2 ;
  1906. ; one piece left OR non-contiguous data
  1907. single:
  1908. singlepieces: ;
  1909. CMP EAX, 0 ;
  1910. JLE endL ; len <= 0- > EXIT
  1911. MOVSD XMM1, [EBX]
  1912. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  1913. MULSD XMM1, XMM0
  1914. MOVSD [EDX], XMM1
  1915. ADD EDX, [EBP+dinc] ; INC(radr, incr)
  1916. DEC EAX ; DEC(len)
  1917. JMP singlepieces ;
  1918. endL:
  1919. END MulAXSXLoopSSE;
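(* Reference sketch: MulAXSXLoopSSE scales an array by the scalar stored at radr (read once and broadcast TO both halves OF XMM0 by SHUFPD ..., 0); MulARSRLoopSSE below does the same FOR REAL with SHUFPS:
	S.GET( radr, y );
	WHILE len > 0 DO
		S.GET( ladr, x );  INC( ladr, linc );
		S.PUT( dadr, x*y );  INC( dadr, dinc );  DEC( len )
	END
*)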
  1920. PROCEDURE MulARSRLoopSSE( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  1921. (* simple version, does not yet check for alignment, no parallel executions yet: use full 8 registers! *)
  1922. (*
  1923. 1.) check for same alignment of ladr and dadr (ladr MOD 128 = dadr MOD 128)
  1924. 2.) process starting unaligned data ( using single instructions)
  1925. 3.) process aligned data
  1926. 4.) process remaining unaligned data (using single instructions)
  1927. *)
  1928. CODE {SYSTEM.i386, SYSTEM.SSE}
  1929. ; register initialization
1930. MOV EAX, [EBP+len] ; EAX reserved FOR length
  1931. CMP EAX, 0 ;
  1932. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  1933. MOV EBX, [EBP+ladr] ; EBX reserved FOR ladr
  1934. MOV EDX, [EBP+dadr] ; EDX reserved FOR dadr
  1935. MOV ECX, [EBP+radr] ;
  1936. MOVSS XMM0, [ECX] ;
1937. SHUFPS XMM0, XMM0, 0 ; all four positions now carry the same value
  1938. ; check IF data are contiguous IN memory
1939. CMP [EBP+linc], 4 ; check left FOR continuity
  1940. JNE single ; not continuous- > simplest method
  1941. CMP [EBP+dinc], 4 ; check dest FOR continuity
  1942. JNE single ; not continuous- > simplest method
  1943. ; check FOR alignment
  1944. MOV ECX, EBX ;
  1945. AND ECX, 3 ; ladr MOD 4
  1946. CMP ECX, 0 ; ECX = 0- > 32 Bit alignment
  1947. JNE unaligned ; not 32 bit aligned
  1948. MOV ECX, EDX ;
  1949. AND ECX, 3 ; dadr MOD 4
  1950. CMP ECX, 0 ; ECX = 0- > 32 Bit alignment
1951. JNE unaligned ; not 32 bit aligned
  1952. MOV ESI, EBX ;
  1953. AND ESI, 8+4 ; 16 byte alignment
  1954. MOV EDI, EDX ;
  1955. AND EDI, 8+4 ; 16 byte alignment
  1956. CMP ESI, EDI ;
  1957. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  1958. CMP ESI, 0 ;
  1959. JE aligned ; already aligned
  1960. align:
1961. ; one single element processing UNTIL 128 bit alignment achieved
  1962. MOVSS XMM1, [EBX] ;
  1963. MULSS XMM1, XMM0 ;
  1964. MOVSS [EDX], XMM1 ;
  1965. ADD EBX, 4 ;
  1966. ADD EDX, 4 ;
  1967. DEC EAX ; one element has been processed ;
  1968. CMP EAX, 0 ; all elements already processed?
  1969. JLE single
  1970. MOV ESI, EBX ;
  1971. AND ESI, 8+4 ;
  1972. CMP ESI, 0 ;
  1973. JNE align ;
  1974. aligned:
  1975. aligned16:
  1976. CMP EAX, 16 ;
  1977. JL aligned4 ; len < 4- > EXIT TO singlepieces
  1978. MOVAPS XMM1, [EBX] ;
  1979. MOVAPS XMM2, [EBX+16] ;
  1980. MOVAPS XMM3, [EBX+32] ;
  1981. MOVAPS XMM4, [EBX+48] ;
  1982. ADD EBX, 64 ;
  1983. MULPS XMM1, XMM0 ;
  1984. MULPS XMM2, XMM0 ;
  1985. MULPS XMM3, XMM0 ;
  1986. MULPS XMM4, XMM0 ;
  1987. MOVAPS [EDX], XMM1 ;
  1988. MOVAPS [EDX+16], XMM2 ;
  1989. MOVAPS [EDX+32], XMM3 ;
  1990. MOVAPS [EDX+48], XMM4 ;
  1991. ADD EDX, 64 ;
  1992. SUB EAX, 16 ;
  1993. JMP aligned16 ;
  1994. ; LOOP FOR 2 pieces aligned
  1995. aligned4: ;
  1996. CMP EAX, 4 ;
  1997. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1998. MOVAPS XMM1, [EBX] ;
  1999. ADD EBX, 16 ;
  2000. MULPS XMM1, XMM0 ;
  2001. MOVAPS [EDX], XMM1 ;
  2002. ADD EDX, 16 ;
  2003. SUB EAX, 4 ;
  2004. JMP aligned4 ;
  2005. ; LOOP FOR 16 unaligned pieces(20 pieces not better!)
  2006. unaligned: ;
  2007. unaligned16: ;
  2008. CMP EAX, 16 ;
2009. JL unaligned4 ; len < 16- > try 4 at a time
  2010. MOVUPS XMM1, [EBX] ;
  2011. MOVUPS XMM2, [EBX+16] ;
  2012. MOVUPS XMM3, [EBX+32] ;
  2013. MOVUPS XMM4, [EBX+48] ;
  2014. ADD EBX, 64
  2015. MULPS XMM1, XMM0 ;
  2016. MULPS XMM2, XMM0 ;
  2017. MULPS XMM3, XMM0 ;
  2018. MULPS XMM4, XMM0 ;
  2019. MOVUPS [EDX], XMM1 ;
  2020. MOVUPS [EDX+16], XMM2 ;
  2021. MOVUPS [EDX+32], XMM3 ;
  2022. MOVUPS [EDX+48], XMM4 ;
  2023. ADD EDX, 64 ;
  2024. SUB EAX, 16 ;
  2025. JMP unaligned16 ;
  2026. ; LOOP FOR 2 pieces unaligned
  2027. unaligned4: ;
  2028. CMP EAX, 4 ;
  2029. JL singlepieces ; len < 2- > EXIT
  2030. MOVUPS XMM1, [EBX] ;
  2031. ADD EBX, 16 ;
  2032. MULPS XMM1, XMM0 ;
  2033. MOVUPS [EDX], XMM1 ;
  2034. ADD EDX, 16 ;
  2035. SUB EAX, 4 ;
  2036. JMP unaligned4 ;
  2037. ; one piece left OR non-contiguous data
  2038. single:
  2039. singlepieces: ;
  2040. CMP EAX, 0 ;
  2041. JLE endL ; len <= 0- > EXIT
  2042. MOVSS XMM1, [EBX]
  2043. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  2044. MULSS XMM1, XMM0
  2045. MOVSS [EDX], XMM1
  2046. ADD EDX, [EBP+dinc] ; INC(radr, incr)
  2047. DEC EAX ; DEC(len)
  2048. JMP singlepieces ;
  2049. endL:
  2050. END MulARSRLoopSSE;
  2051. PROCEDURE IncMulAXSXLoopSSE( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2052. (* simple version, does not yet check for alignment, no parallel executions yet: use full 8 registers! *)
  2053. (*
  2054. 1.) check for same alignment of ladr and dadr (ladr MOD 128 = dadr MOD 128)
  2055. 2.) process starting unaligned data ( using single instructions)
  2056. 3.) process aligned data
  2057. 4.) process remaining unaligned data (using single instructions)
  2058. *)
  2059. CODE {SYSTEM.i386, SYSTEM.SSE2}
  2060. ; register initialization
2061. MOV EAX, [EBP+len] ; EAX reserved FOR length
  2062. CMP EAX, 0 ;
  2063. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  2064. MOV EBX, [EBP+ladr] ; EBX reserved FOR ladr
  2065. MOV EDX, [EBP+dadr] ; EDX reserved FOR dadr
  2066. MOV ECX, [EBP+radr] ;
  2067. MOVSD XMM0, [ECX] ;
  2068. SHUFPD XMM0, XMM0, 0 ; high bits := low bits
  2069. ; check IF data are contiguous IN memory
2070. CMP [EBP+linc], 8 ; check left FOR continuity
  2071. JNE single ; not continuous- > simplest method
  2072. CMP [EBP+dinc], 8 ; check dest FOR continuity
  2073. JNE single ; not continuous- > simplest method
  2074. ; check FOR alignment
  2075. MOV ECX, EBX ;
  2076. AND ECX, 7 ; ladr MOD 8
  2077. CMP ECX, 0 ; ECX = 0- > 64 Bit alignment
  2078. JNE unaligned ; not 64 bit aligned
  2079. MOV ECX, EDX ;
  2080. AND ECX, 7 ; dadr MOD 8
  2081. CMP ECX, 0 ; ECX = 0- > 64 Bit alignment
  2082. JNE unaligned ; not 64 bit aligned
  2083. MOV ESI, EBX ;
  2084. AND ESI, 8 ; 16 byte alignment
  2085. MOV EDI, EDX ;
  2086. AND EDI, 8 ; 16 byte alignment
  2087. CMP ESI, EDI ;
  2088. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  2089. CMP ESI, 8 ;
  2090. JNE aligned ; ladr and dadr already 128 bit aligned
2091. ; one single element processing TO achieve 128 bit alignment
  2092. MOVSD XMM1, [EBX] ;
  2093. MULSD XMM1, XMM0 ;
  2094. MOVSD XMM2, [EDX] ;
  2095. ADDSD XMM1, XMM2 ;
  2096. MOVSD [EDX], XMM1 ;
  2097. ADD EBX, 8 ; now EBX IS 16 byte aligned
  2098. ADD EDX, 8 ; now EDX IS 16 byte aligned ;
  2099. DEC EAX ; one element has been processed
  2100. ; LOOP FOR 8 pieces aligned(no better performance with 14 pieces!)
  2101. aligned:
  2102. aligned8:
  2103. CMP EAX, 8 ;
  2104. JL aligned2 ; len < 4- > EXIT TO singlepieces
  2105. MOVAPD XMM1, [EBX] ;
  2106. MOVAPD XMM2, [EBX+16] ;
  2107. MOVAPD XMM3, [EBX+32] ;
  2108. MOVAPD XMM4, [EBX+48] ;
  2109. ADD EBX, 64 ;
  2110. MULPD XMM1, XMM0 ;
  2111. MULPD XMM2, XMM0 ;
  2112. MULPD XMM3, XMM0 ;
  2113. MULPD XMM4, XMM0 ;
  2114. MOVAPD XMM5, [EDX] ;
  2115. ADDPD XMM1, XMM5
  2116. MOVAPD [EDX], XMM1 ;
  2117. MOVAPD XMM6, [EDX+16] ;
  2118. ADDPD XMM2, XMM6
  2119. MOVAPD [EDX+16], XMM2 ;
  2120. MOVAPD XMM7, [EDX+32] ;
  2121. ADDPD XMM3, XMM7
  2122. MOVAPD [EDX+32], XMM3 ;
  2123. MOVAPD XMM5, [EDX+48] ;
  2124. ADDPD XMM4, XMM5
  2125. MOVAPD [EDX+48], XMM4 ;
  2126. ADD EDX, 64 ;
  2127. SUB EAX, 8 ;
  2128. JMP aligned8 ;
  2129. ; LOOP FOR 2 pieces aligned
  2130. aligned2: ;
  2131. CMP EAX, 2 ;
  2132. JL singlepieces ; len < 2- > EXIT TO singlepieces
  2133. MOVAPD XMM1, [EBX] ;
  2134. ADD EBX, 16 ;
  2135. MULPD XMM1, XMM0 ;
  2136. MOVAPD XMM2, [EDX] ;
  2137. ADDPD XMM1, XMM2
  2138. MOVAPD [EDX], XMM1 ;
  2139. ADD EDX, 16 ;
  2140. SUB EAX, 2 ;
  2141. JMP aligned2 ;
  2142. ; LOOP FOR 8 unaligned pieces(14 pieces not better!)
  2143. unaligned: ;
  2144. unaligned8: ;
  2145. CMP EAX, 8 ;
  2146. JL unaligned2 ; len < 12- > EXIT
  2147. MOVUPD XMM1, [EBX] ;
  2148. MOVUPD XMM2, [EBX+16] ;
  2149. MOVUPD XMM3, [EBX+32] ;
  2150. MOVUPD XMM4, [EBX+48] ;
  2151. ADD EBX, 64
  2152. MULPD XMM1, XMM0 ;
  2153. MULPD XMM2, XMM0 ;
  2154. MULPD XMM3, XMM0 ;
  2155. MULPD XMM4, XMM0 ;
  2156. MOVUPD XMM5, [EDX] ;
  2157. ADDPD XMM1, XMM5
  2158. MOVUPD [EDX], XMM1 ;
  2159. MOVUPD XMM6, [EDX+16] ;
  2160. ADDPD XMM2, XMM6
  2161. MOVUPD [EDX+16], XMM2 ;
  2162. MOVUPD XMM7, [EDX+32] ;
  2163. ADDPD XMM3, XMM7
  2164. MOVUPD [EDX+32], XMM3 ;
  2165. MOVUPD XMM5, [EDX+48] ;
  2166. ADDPD XMM4, XMM5
  2167. MOVUPD [EDX+48], XMM4 ;
  2168. ADD EDX, 64 ;
  2169. SUB EAX, 8 ;
  2170. JMP unaligned8 ;
  2171. ; LOOP FOR 2 pieces unaligned
  2172. unaligned2: ;
  2173. CMP EAX, 2 ;
  2174. JL singlepieces ; len < 2- > EXIT
  2175. MOVUPD XMM1, [EBX] ;
  2176. ADD EBX, 16 ;
  2177. MULPD XMM1, XMM0 ;
  2178. MOVUPD XMM2, [EDX] ;
  2179. ADDPD XMM1, XMM2
  2180. MOVUPD [EDX], XMM1 ;
  2181. ADD EDX, 16 ;
  2182. SUB EAX, 2 ;
  2183. JMP unaligned2 ;
  2184. ; one piece left OR non-contiguous data
  2185. single:
  2186. singlepieces: ;
  2187. CMP EAX, 0 ;
  2188. JLE endL ; len <= 0- > EXIT
  2189. MOVSD XMM1, [EBX]
  2190. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  2191. MULSD XMM1, XMM0
  2192. MOVSD XMM2, [EDX] ;
  2193. ADDSD XMM1, XMM2
  2194. MOVSD [EDX], XMM1
  2195. ADD EDX, [EBP+dinc] ; INC(radr, incr)
  2196. DEC EAX ; DEC(len)
  2197. JMP singlepieces ;
  2198. endL:
  2199. END IncMulAXSXLoopSSE;
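(* Reference sketch: IncMulAXSXLoopSSE (and the REAL variant below) accumulates the scaled array onto the destination, using the same continuity / alignment dispatch as MulAXSXLoopSSE with one extra load and add per block:
	S.GET( radr, y );
	WHILE len > 0 DO
		S.GET( ladr, x );  INC( ladr, linc );
		S.GET( dadr, z );  S.PUT( dadr, z + x*y );  INC( dadr, dinc );  DEC( len )
	END
*)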
  2200. PROCEDURE IncMulARSRLoopSSE( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2201. (* simple version, does not yet check for alignment, no parallel executions yet: use full 8 registers! *)
  2202. (*
  2203. 1.) check for same alignment of ladr and dadr (ladr MOD 128 = dadr MOD 128)
  2204. 2.) process starting unaligned data ( using single instructions)
  2205. 3.) process aligned data
  2206. 4.) process remaining unaligned data (using single instructions)
  2207. *)
  2208. CODE {SYSTEM.i386, SYSTEM.SSE}
  2209. ; register initialization
2210. MOV EAX, [EBP+len] ; EAX reserved FOR length
  2211. CMP EAX, 0 ;
  2212. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  2213. MOV EBX, [EBP+ladr] ; EBX reserved FOR ladr
  2214. MOV EDX, [EBP+dadr] ; EDX reserved FOR dadr
  2215. MOV ECX, [EBP+radr] ;
  2216. MOVSS XMM0, [ECX] ;
2217. SHUFPS XMM0, XMM0, 0 ; all four positions now carry the same value
  2218. ; check IF data are contiguous IN memory
2219. CMP [EBP+linc], 4 ; check left FOR continuity
  2220. JNE single ; not continuous- > simplest method
  2221. CMP [EBP+dinc], 4 ; check dest FOR continuity
  2222. JNE single ; not continuous- > simplest method
  2223. ; check FOR alignment
  2224. MOV ECX, EBX ;
  2225. AND ECX, 3 ; ladr MOD 4
  2226. CMP ECX, 0 ; ECX = 0- > 32 Bit alignment
  2227. JNE unaligned ; not 32 bit aligned
  2228. MOV ECX, EDX ;
  2229. AND ECX, 3 ; dadr MOD 4
  2230. CMP ECX, 0 ; ECX = 0- > 32 Bit alignment
2231. JNE unaligned ; not 32 bit aligned
  2232. MOV ESI, EBX ;
  2233. AND ESI, 8+4 ; 16 byte alignment
  2234. MOV EDI, EDX ;
  2235. AND EDI, 8+4 ; 16 byte alignment
  2236. CMP ESI, EDI ;
  2237. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  2238. CMP ESI, 0 ;
  2239. JE aligned ; already aligned
  2240. align:
2241. ; one single element processing UNTIL 128 bit alignment achieved
  2242. MOVSS XMM1, [EBX] ;
  2243. MULSS XMM1, XMM0 ;
  2244. MOVSS XMM2, [EDX] ;
  2245. ADDSS XMM1, XMM2 ;
  2246. MOVSS [EDX], XMM1 ;
  2247. ADD EBX, 4 ;
  2248. ADD EDX, 4 ;
  2249. DEC EAX ; one element has been processed ;
  2250. CMP EAX, 0 ; all elements already processed?
  2251. JLE single
  2252. MOV ESI, EBX ;
  2253. AND ESI, 8+4 ;
  2254. CMP ESI, 0 ;
  2255. JNE align ;
  2256. aligned:
  2257. aligned16:
  2258. CMP EAX, 16 ;
  2259. JL aligned4 ; len < 4- > EXIT TO singlepieces
  2260. MOVAPS XMM1, [EBX] ;
  2261. MOVAPS XMM2, [EBX+16] ;
  2262. MOVAPS XMM3, [EBX+32] ;
  2263. MOVAPS XMM4, [EBX+48] ;
  2264. ADD EBX, 64 ;
  2265. MULPS XMM1, XMM0 ;
  2266. MULPS XMM2, XMM0 ;
  2267. MULPS XMM3, XMM0 ;
  2268. MULPS XMM4, XMM0 ;
  2269. MOVAPS XMM5, [EDX] ;
  2270. ADDPS XMM1, XMM5 ;
  2271. MOVAPS [EDX], XMM1 ;
  2272. MOVAPS XMM6, [EDX+16] ;
  2273. ADDPS XMM2, XMM6 ;
  2274. MOVAPS [EDX+16], XMM2 ;
  2275. MOVAPS XMM7, [EDX+32] ;
  2276. ADDPS XMM3, XMM7 ;
  2277. MOVAPS [EDX+32], XMM3 ;
  2278. MOVAPS XMM5, [EDX+48] ;
  2279. ADDPS XMM4, XMM5 ;
  2280. MOVAPS [EDX+48], XMM4 ;
  2281. ADD EDX, 64 ;
  2282. SUB EAX, 16 ;
  2283. JMP aligned16 ;
  2284. ; LOOP FOR 2 pieces aligned
  2285. aligned4: ;
  2286. CMP EAX, 4 ;
  2287. JL singlepieces ; len < 2- > EXIT TO singlepieces
  2288. MOVAPS XMM1, [EBX] ;
  2289. ADD EBX, 16 ;
  2290. MULPS XMM1, XMM0 ;
  2291. MOVAPS XMM2, [EDX] ;
  2292. ADDPS XMM1, XMM2 ;
  2293. MOVAPS [EDX], XMM1 ;
  2294. ADD EDX, 16 ;
  2295. SUB EAX, 4 ;
  2296. JMP aligned4 ;
  2297. ; LOOP FOR 16 unaligned pieces(20 pieces not better!)
  2298. unaligned: ;
  2299. unaligned16: ;
  2300. CMP EAX, 16 ;
  2301. JL unaligned4 ; len < 12- > EXIT
  2302. MOVUPS XMM1, [EBX] ;
  2303. MOVUPS XMM2, [EBX+16] ;
  2304. MOVUPS XMM3, [EBX+32] ;
  2305. MOVUPS XMM4, [EBX+48] ;
  2306. ADD EBX, 64
  2307. MULPS XMM1, XMM0 ;
  2308. MULPS XMM2, XMM0 ;
  2309. MULPS XMM3, XMM0 ;
  2310. MULPS XMM4, XMM0 ;
  2311. MOVUPS XMM5, [EDX] ;
  2312. ADDPS XMM1, XMM5 ;
  2313. MOVUPS [EDX], XMM1 ;
  2314. MOVUPS XMM6, [EDX+16] ;
  2315. ADDPS XMM2, XMM6 ;
  2316. MOVUPS [EDX+16], XMM2 ;
  2317. MOVUPS XMM7, [EDX+32] ;
  2318. ADDPS XMM3, XMM7 ;
  2319. MOVUPS [EDX+32], XMM3 ;
  2320. MOVUPS XMM5, [EDX+48] ;
  2321. ADDPS XMM4, XMM5 ;
  2322. MOVUPS [EDX+48], XMM4 ;
  2323. ADD EDX, 64 ;
  2324. SUB EAX, 16 ;
  2325. JMP unaligned16 ;
  2326. ; LOOP FOR 2 pieces unaligned
  2327. unaligned4: ;
  2328. CMP EAX, 4 ;
  2329. JL singlepieces ; len < 2- > EXIT
  2330. MOVUPS XMM1, [EBX] ;
  2331. ADD EBX, 16 ;
  2332. MULPS XMM1, XMM0 ;
  2333. MOVUPS XMM2, [EDX] ;
  2334. ADDPS XMM1, XMM2 ;
  2335. MOVUPS [EDX], XMM1 ;
  2336. ADD EDX, 16 ;
  2337. SUB EAX, 4 ;
  2338. JMP unaligned4 ;
  2339. ; one piece left OR non-contiguous data
  2340. single:
  2341. singlepieces: ;
  2342. CMP EAX, 0 ;
  2343. JLE endL ; len <= 0- > EXIT
  2344. MOVSS XMM1, [EBX]
  2345. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  2346. MULSS XMM1, XMM0
  2347. MOVSS XMM2, [EDX] ;
  2348. ADDSS XMM1, XMM2 ;
  2349. MOVSS [EDX], XMM1
  2350. ADD EDX, [EBP+dinc] ; INC(radr, incr)
  2351. DEC EAX ; DEC(len)
  2352. JMP singlepieces ;
  2353. endL:
  2354. END IncMulARSRLoopSSE;
  2355. (*
  2356. PROCEDURE AlignedSPXSSE5( ladr, radr, dadr, stride, incd, len: LONGINT );
  2357. CODE {SYSTEM.i386, SYSTEM.SSE2}
  2358. ; ; register initialization
  2359. MOV EDX, [EBP+dadr] ; EDX reserved for dadr
  2360. MOV EBX, [EBP+ladr] ; EBX reserved for ladr
  2361. MOV ESI, [EBP+radr] ; ESI reserved for radr
  2362. MOV EAX, [EBP+len] ; EAX reserverd for length
  2363. MOV ECX, [EBP+stride] ; ECX reserved for stride
  2364. XORPD XMM2, XMM2 ;
  2365. XORPD XMM3, XMM3 ;
  2366. XORPD XMM4, XMM4 ;
  2367. XORPD XMM5, XMM5 ;
  2368. XORPD XMM6, XMM6 ;
  2369. XOR EDI, EDI ;
  2370. aligned4:
  2371. CMP EAX, 4 ;
  2372. JL aligned2 ; ; len < 4- > exit to singlepieces
  2373. MOV ESI, [EBP+radr] ;
  2374. ADD ESI, EDI ;
  2375. MOVAPD XMM7, [EBX] ;
  2376. MOVAPD XMM0, [ESI] ;
  2377. ADD ESI, ECX ;
  2378. MOVAPD XMM1, [ESI] ;
  2379. MULPD XMM0, XMM7 ;
  2380. ADDPD XMM2, XMM0 ;
  2381. ADD ESI, ECX ;
  2382. MOVAPD XMM0, [ESI] ;
  2383. MULPD XMM1, XMM7 ;
  2384. ADDPD XMM3, XMM1 ;
  2385. ADD ESI, ECX ;
  2386. MOVAPD XMM1, [ESI] ;
  2387. MULPD XMM0, XMM7 ;
  2388. ADDPD XMM4, XMM0 ;
  2389. ADD ESI, ECX ;
  2390. MOVAPD XMM0, [ESI] ;
  2391. MULPD XMM1, XMM7 ;
  2392. ADDPD XMM5, XMM1 ;
  2393. MULPD XMM0, XMM7 ;
  2394. ADDPD XMM6, XMM0 ;
  2395. ADD EBX, 16 ;
  2396. ADD EDI, 16 ;
  2397. MOV ESI, [EBP+radr] ;
  2398. ADD ESI, EDI ;
  2399. MOVAPD XMM7, [EBX] ;
  2400. MOVAPD XMM0, [ESI] ;
  2401. ADD ESI, ECX ;
  2402. MOVAPD XMM1, [ESI] ;
  2403. MULPD XMM0, XMM7 ;
  2404. ADDPD XMM2, XMM0 ;
  2405. ADD ESI, ECX ;
  2406. MOVAPD XMM0, [ESI] ;
  2407. MULPD XMM1, XMM7 ;
  2408. ADDPD XMM3, XMM1 ;
  2409. ADD ESI, ECX ;
  2410. MOVAPD XMM1, [ESI] ;
  2411. MULPD XMM0, XMM7 ;
  2412. ADDPD XMM4, XMM0 ;
  2413. ADD ESI, ECX ;
  2414. MOVAPD XMM0, [ESI] ;
  2415. MULPD XMM1, XMM7 ;
  2416. ADDPD XMM5, XMM1 ;
  2417. MULPD XMM0, XMM7 ;
  2418. ADDPD XMM6, XMM0 ;
  2419. ADD EBX, 16 ;
  2420. ADD EDI, 16 ;
  2421. SUB EAX, 4 ;
  2422. JMP aligned4 ;
  2423. aligned2:
  2424. CMP EAX, 2 ;
  2425. JL horizontaladd ; ; len < 4- > exit to singlepieces
  2426. MOV ESI, [EBP+radr] ;
  2427. ADD ESI, EDI ;
  2428. MOVAPD XMM7, [EBX] ;
  2429. MOVAPD XMM0, [ESI] ;
  2430. ADD ESI, ECX ;
  2431. MOVAPD XMM1, [ESI] ;
  2432. MULPD XMM0, XMM7 ;
  2433. ADDPD XMM2, XMM0 ;
  2434. ADD ESI, ECX ;
  2435. MOVAPD XMM0, [ESI] ;
  2436. MULPD XMM1, XMM7 ;
  2437. ADDPD XMM3, XMM1 ;
  2438. ADD ESI, ECX ;
  2439. MOVAPD XMM1, [ESI] ;
  2440. MULPD XMM0, XMM7 ;
  2441. ADDPD XMM4, XMM0 ;
  2442. ADD ESI, ECX ;
  2443. MOVAPD XMM0, [ESI] ;
  2444. MULPD XMM1, XMM7 ;
  2445. ADDPD XMM5, XMM1 ;
  2446. MULPD XMM0, XMM7 ;
  2447. ADDPD XMM6, XMM0 ;
  2448. ADD EBX, 16 ;
  2449. ADD EDI, 16 ;
  2450. SUB EAX, 2 ;
  2451. JMP aligned2 ;
  2452. horizontaladd: ;
  2453. MOVAPD XMM1, XMM2 ;
  2454. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  2455. ADDPD XMM2, XMM1 ;
  2456. MOVAPD XMM1, XMM3 ;
  2457. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  2458. ADDPD XMM3, XMM1 ;
  2459. MOVAPD XMM1, XMM4 ;
  2460. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  2461. ADDPD XMM4, XMM1 ;
  2462. MOVAPD XMM1, XMM5 ;
  2463. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  2464. ADDPD XMM5, XMM1 ;
  2465. MOVAPD XMM1, XMM6 ;
  2466. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  2467. ADDPD XMM6, XMM1 ;
  2468. singlepieces: ;
  2469. CMP EAX, 0 ;
  2470. JLE store ; len <= 0- > exit
  2471. MOV ESI, [EBP+radr] ;
  2472. MOVSD XMM7, [EBX] ;
  2473. MOVSD XMM0, [ESI+EDI] ;
  2474. ADD ESI, ECX ;
  2475. MOVSD XMM1, [ESI+EDI] ;
  2476. MULSD XMM0, XMM7 ;
  2477. ADDSD XMM2, XMM0 ;
  2478. ADD ESI, ECX ;
  2479. MOVSD XMM0, [ESI+EDI] ;
  2480. MULSD XMM1, XMM7 ;
  2481. ADDSD XMM3, XMM1 ;
  2482. ADD ESI, ECX ;
  2483. MOVSD XMM1, [ESI+EDI] ;
  2484. MULSD XMM0, XMM7 ;
  2485. ADDSD XMM4, XMM0 ;
  2486. ADD ESI, ECX ;
  2487. MOVSD XMM1, [ESI+EDI] ;
  2488. MULSD XMM0, XMM7 ;
  2489. ADDSD XMM4, XMM0 ;
  2490. ADD ESI, ECX ;
  2491. MOVSD XMM0, [ESI+EDI] ;
  2492. MULSD XMM1, XMM7 ;
  2493. ADDSD XMM5, XMM1 ;
  2494. MULSD XMM0, XMM7 ;
  2495. ADDSD XMM6, XMM0 ;
  2496. ADD EBX, 4 (* INC(ladr,incl) *)
  2497. ADD EDI, 4 (* INC(radr,incr) *)
  2498. DEC EAX ; DEC(len)
  2499. JMP singlepieces ;
  2500. store:
  2501. MOVSD [EDX], XMM2 ;
  2502. ADD EDX, [EBP+incd] ;
  2503. MOVSD [EDX], XMM3 ;
  2504. ADD EDX, [EBP+incd] ;
  2505. MOVSD [EDX], XMM4 ;
  2506. ADD EDX, [EBP+incd] ;
  2507. MOVSD [EDX], XMM5 ;
  2508. ADD EDX, [EBP+incd] ;
  2509. MOVSD [EDX], XMM6 ;
  2510. end:
  2511. END AlignedSPXSSE5;
  2512. *)
  2513. (* sse version of scalar product *)
  2514. PROCEDURE AlignedSPXSSE( ladr, radr, dadr: ADDRESS; len: SIZE;
  2515. add: BOOLEAN );
  2516. CODE {SYSTEM.i386, SYSTEM.SSE2}
  2517. ; register initialization
2518. MOV EAX, [EBP+len] ; EAX reserved FOR length
  2519. MOV EBX, [EBP+ladr] ; EBX reserved FOR ladr
  2520. MOV ECX, [EBP+radr] ; ECX reserved FOR radr
  2521. MOV EDX, [EBP+dadr] ; EDX reserved FOR dadr
  2522. XORPD XMM0, XMM0 ;
  2523. CMP [EBP+add], 0 ; add?
  2524. JE aligned8 ; no add
  2525. MOVSD XMM0, [EDX] ;
  2526. aligned8:
  2527. CMP EAX, 8 ;
  2528. JL aligned2 ; len < 4- > EXIT TO singlepieces
  2529. MOVAPD XMM1, [EBX] ;
  2530. MOVAPD XMM2, [EBX+16] ;
  2531. MOVAPD XMM3, [EBX+32] ;
  2532. MOVAPD XMM4, [ECX] ;
  2533. MOVAPD XMM5, [ECX+16] ;
  2534. MOVAPD XMM6, [ECX+32] ;
  2535. MULPD XMM1, XMM4 ;
  2536. ADDPD XMM0, XMM1 ;
  2537. MULPD XMM2, XMM5 ;
  2538. ADDPD XMM0, XMM2 ;
  2539. MULPD XMM3, XMM6 ;
  2540. ADDPD XMM0, XMM3 ;
  2541. MOVAPD XMM7, [EBX+48] ;
  2542. MOVAPD XMM1, [ECX+48] ;
  2543. MULPD XMM1, XMM7 ;
  2544. ADDPD XMM0, XMM1 ;
  2545. ADD EBX, 64 ;
  2546. ADD ECX, 64 ;
  2547. SUB EAX, 8 ;
  2548. JMP aligned8 ;
  2549. ; LOOP FOR 2 pieces aligned
  2550. aligned4:
  2551. CMP EAX, 4 ;
  2552. JL aligned2 ; ; len < 4- > EXIT TO singlepieces
  2553. MOVAPD XMM1, [EBX] ;
  2554. MOVAPD XMM2, [ECX] ;
  2555. MOVAPD XMM3, [EBX+16] ;
  2556. MOVAPD XMM4, [ECX+16] ;
  2557. MULPD XMM1, XMM2 ;
  2558. ADDPD XMM0, XMM1 ;
  2559. MULPD XMM3, XMM4 ;
  2560. ADDPD XMM0, XMM3 ;
  2561. ADD EBX, 32 ;
  2562. ADD ECX, 32 ;
  2563. SUB EAX, 4 ;
  2564. JMP aligned4 ;
  2565. aligned2:
  2566. CMP EAX, 2 ;
  2567. JL horizontaladd ; ; len < 4- > EXIT TO singlepieces
  2568. MOVAPD XMM1, [EBX] ;
  2569. MOVAPD XMM2, [ECX] ;
  2570. MULPD XMM1, XMM2 ;
  2571. ADDPD XMM0, XMM1 ;
  2572. ADD EBX, 16 ;
  2573. ADD ECX, 16 ;
  2574. SUB EAX, 2 ;
  2575. JMP aligned2 ;
  2576. horizontaladd: ;
  2577. MOVAPD XMM1, XMM0 ;
  2578. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  2579. ADDPD XMM0, XMM1 ;
  2580. singlepieces: ;
  2581. CMP EAX, 0 ;
  2582. JLE store ; len <= 0- > EXIT
  2583. MOVSD XMM1, [EBX]
  2584. MOVSD XMM2, [ECX]
  2585. MULSD XMM1, XMM2
  2586. ADDSD XMM0, XMM1
  2587. ADD EBX, 8 ; INC(ladr, incl)
  2588. ADD ECX, 8 ; INC(radr, incr)
  2589. DEC EAX ; DEC(len)
  2590. JMP singlepieces ;
  2591. store:
  2592. MOVSD [EDX], XMM0 ;
  2593. endL:
  2594. END AlignedSPXSSE;
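(* Reference sketch OF AlignedSPXSSE (and AlignedSPRSSE further below): both operands are assumed contiguous and 16 byte aligned by the caller; 'add' selects whether the dot product is accumulated onto the value at dadr or overwrites it:
	IF add THEN S.GET( dadr, z ) ELSE z := 0 END;
	WHILE len > 0 DO
		S.GET( ladr, x );  INC( ladr, 8 );	(* 4 in the REAL case *)
		S.GET( radr, y );  INC( radr, 8 );
		z := z + x*y;  DEC( len )
	END;
	S.PUT( dadr, z )
*)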
  2595. (*
  2596. PROCEDURE AlignedSPRSSE5( ladr, radr, dadr, stride, incd, len: LONGINT );
  2597. CODE {SYSTEM.i386, SYSTEM.SSE}
  2598. ; register initialization
  2599. MOV EDX, [EBP+dadr] ; EDX reserved for dadr
  2600. MOV EBX, [EBP+ladr] ; EBX reserved for ladr
  2601. MOV ESI, [EBP+radr] ; ECX reserved for radr
  2602. MOV EAX, [EBP+len] ; EAX reserverd for length
  2603. MOV ECX, [EBP+stride] ;
  2604. XORPS XMM2, XMM2 ;
  2605. XORPS XMM3, XMM3 ;
  2606. XORPS XMM4, XMM4 ;
  2607. XORPS XMM5, XMM5 ;
  2608. XORPS XMM6, XMM6 ;
  2609. XOR EDI, EDI ;
  2610. aligned8:
  2611. CMP EAX, 8 ;
  2612. JL aligned4 ; ; len < 4- > exit to singlepieces
  2613. PREFETCH0 24[EBX] ;
  2614. ; PREFETCH0[ESI] ;
  2615. MOV ESI, [EBP+radr] ;
  2616. ADD ESI, EDI ;
  2617. MOVAPS XMM7, [EBX] ;
  2618. MOVAPS XMM0, [ESI] ;
  2619. ADD ESI, ECX ;
  2620. MOVAPS XMM1, [ESI] ;
  2621. MULPS XMM0, XMM7 ;
  2622. ADDPS XMM2, XMM0 ;
  2623. ADD ESI, ECX ;
  2624. MOVAPS XMM0, [ESI] ;
  2625. MULPS XMM1, XMM7 ;
  2626. ADDPS XMM3, XMM1 ;
  2627. ADD ESI, ECX ;
  2628. MOVAPS XMM1, [ESI] ;
  2629. MULPS XMM0, XMM7 ;
  2630. ADDPS XMM4, XMM0 ;
  2631. ADD ESI, ECX ;
  2632. MOVAPS XMM0, [ESI] ;
  2633. MULPS XMM1, XMM7 ;
  2634. ADDPS XMM5, XMM1 ;
  2635. MULPS XMM0, XMM7 ;
  2636. ADDPS XMM6, XMM0 ;
  2637. ADD EBX, 16 ;
  2638. ADD EDI, 16 ;
  2639. MOV ESI, [EBP+radr] ;
  2640. ADD ESI, EDI ;
  2641. MOVAPS XMM7, [EBX] ;
  2642. MOVAPS XMM0, [ESI] ;
  2643. ADD ESI, ECX ;
  2644. MOVAPS XMM1, [ESI] ;
  2645. MULPS XMM0, XMM7 ;
  2646. ADDPS XMM2, XMM0 ;
  2647. ADD ESI, ECX ;
  2648. MOVAPS XMM0, [ESI] ;
  2649. MULPS XMM1, XMM7 ;
  2650. ADDPS XMM3, XMM1 ;
  2651. ADD ESI, ECX ;
  2652. MOVAPS XMM1, [ESI] ;
  2653. MULPS XMM0, XMM7 ;
  2654. ADDPS XMM4, XMM0 ;
  2655. ADD ESI, ECX ;
  2656. MOVAPS XMM0, [ESI] ;
  2657. MULPS XMM1, XMM7 ;
  2658. ADDPS XMM5, XMM1 ;
  2659. MULPS XMM0, XMM7 ;
  2660. ADDPS XMM6, XMM0 ;
  2661. ADD EBX, 16 ;
  2662. ADD EDI, 16 ;
  2663. SUB EAX, 8 ;
  2664. JMP aligned8 ;
  2665. aligned4:
  2666. CMP EAX, 4 ;
  2667. JL horizontaladd ; ; len < 4- > exit to singlepieces
  2668. MOV ESI, [EBP+radr] ;
  2669. ADD ESI, EDI ;
  2670. MOVAPS XMM7, [EBX] ;
  2671. MOVAPS XMM0, [ESI] ;
  2672. ADD ESI, ECX ;
  2673. MOVAPS XMM1, [ESI] ;
  2674. MULPS XMM0, XMM7 ;
  2675. ADDPS XMM2, XMM0 ;
  2676. ADD ESI, ECX ;
  2677. MOVAPS XMM0, [ESI] ;
  2678. MULPS XMM1, XMM7 ;
  2679. ADDPS XMM3, XMM1 ;
  2680. ADD ESI, ECX ;
  2681. MOVAPS XMM1, [ESI] ;
  2682. MULPS XMM0, XMM7 ;
  2683. ADDPS XMM4, XMM0 ;
  2684. ADD ESI, ECX ;
  2685. MOVAPS XMM0, [ESI] ;
  2686. MULPS XMM1, XMM7 ;
  2687. ADDPS XMM5, XMM1 ;
  2688. MULPS XMM0, XMM7 ;
  2689. ADDPS XMM6, XMM0 ;
  2690. ADD EBX, 16 ;
  2691. ADD EDI, 16 ;
  2692. SUB EAX, 4 ;
  2693. JMP aligned4 ;
  2694. horizontaladd: ;
  2695. MOVLHPS XMM1, XMM2 ;
  2696. ADDPS XMM1, XMM2 ;
  2697. SHUFPS XMM2, XMM1, 48 ;
  2698. ADDPS XMM2, XMM1 ;
  2699. MOVHLPS XMM2, XMM2 ;
  2700. MOVLHPS XMM1, XMM3 ;
  2701. ADDPS XMM1, XMM3 ;
  2702. SHUFPS XMM3, XMM1, 48 ;
  2703. ADDPS XMM3, XMM1 ;
  2704. MOVHLPS XMM3, XMM3 ;
  2705. MOVLHPS XMM1, XMM4 ;
  2706. ADDPS XMM1, XMM4 ;
  2707. SHUFPS XMM4, XMM1, 48 ;
  2708. ADDPS XMM4, XMM1 ;
  2709. MOVHLPS XMM4, XMM4 ;
  2710. MOVLHPS XMM1, XMM5 ;
  2711. ADDPS XMM1, XMM5 ;
  2712. SHUFPS XMM5, XMM1, 48 ;
  2713. ADDPS XMM5, XMM1 ;
  2714. MOVHLPS XMM5, XMM5 ;
  2715. MOVLHPS XMM1, XMM6 ;
  2716. ADDPS XMM1, XMM6 ;
  2717. SHUFPS XMM6, XMM1, 48 ;
  2718. ADDPS XMM6, XMM1 ;
  2719. MOVHLPS XMM6, XMM6 ;
  2720. singlepieces: ;
  2721. CMP EAX, 0 ;
  2722. JLE store ; len <= 0- > exit
  2723. MOV ESI, [EBP+radr] ;
  2724. MOVSS XMM7, [EBX] ;
  2725. MOVSS XMM0, [ESI+EDI] ;
  2726. ADD ESI, ECX ;
  2727. MOVSS XMM1, [ESI+EDI] ;
  2728. MULSS XMM0, XMM7 ;
  2729. ADDSS XMM2, XMM0 ;
  2730. ADD ESI, ECX ;
  2731. MOVSS XMM0, [ESI+EDI] ;
  2732. MULSS XMM1, XMM7 ;
  2733. ADDSS XMM3, XMM1 ;
  2734. ADD ESI, ECX ;
  2735. MOVSS XMM1, [ESI+EDI] ;
  2736. MULSS XMM0, XMM7 ;
  2737. ADDSS XMM4, XMM0 ;
  2738. ADD ESI, ECX ;
  2739. MOVSS XMM0, [ESI+EDI] ;
  2740. MULSS XMM1, XMM7 ;
  2741. ADDSS XMM5, XMM1 ;
  2742. MULSS XMM0, XMM7 ;
  2743. ADDSS XMM6, XMM0 ;
  2744. ADD EBX, 4 (* INC(ladr,incl) *)
  2745. ADD EDI, 4 (* INC(radr,incr) *)
  2746. DEC EAX ; DEC(len)
  2747. JMP singlepieces ;
  2748. store:
  2749. MOVSS [EDX], XMM2 ;
  2750. ADD EDX, [EBP+incd] ;
  2751. MOVSS [EDX], XMM3 ;
  2752. ADD EDX, [EBP+incd] ;
  2753. MOVSS [EDX], XMM4 ;
  2754. ADD EDX, [EBP+incd] ;
  2755. MOVSS [EDX], XMM5 ;
  2756. ADD EDX, [EBP+incd] ;
  2757. MOVSS [EDX], XMM6 ;
  2758. end:
  2759. END AlignedSPRSSE5;
  2760. *)
  2761. PROCEDURE AlignedSPRSSE( ladr, radr, dadr: ADDRESS; len: SIZE;
  2762. add: BOOLEAN );
  2763. CODE {SYSTEM.i386, SYSTEM.SSE}
  2764. ; register initialization
  2765. MOV EDX, [EBP+dadr] ; EDX reserved FOR dadr
  2766. MOV EBX, [EBP+ladr] ; EBX reserved FOR ladr
  2767. MOV ECX, [EBP+radr] ; ECX reserved FOR radr
2768. MOV EAX, [EBP+len] ; EAX reserved FOR length
  2769. XORPS XMM0, XMM0 ;
  2770. CMP [EBP+add], 0 ; add?
  2771. JE aligned16 ; no add
  2772. MOVSS XMM0, [EDX] ;
  2773. aligned16:
  2774. CMP EAX, 16 ;
  2775. JL aligned8 ; len < 4- > EXIT TO singlepieces
  2776. MOVAPS XMM1, [EBX] ;
  2777. MOVAPS XMM4, [ECX] ;
  2778. MOVAPS XMM2, [EBX+16] ;
  2779. MOVAPS XMM5, [ECX+16] ;
  2780. MULPS XMM1, XMM4 ;
  2781. ADDPS XMM0, XMM1 ;
  2782. MOVAPS XMM3, [EBX+32] ;
  2783. MOVAPS XMM6, [ECX+32] ;
  2784. MULPS XMM2, XMM5 ;
  2785. ADDPS XMM0, XMM2 ;
  2786. MOVAPS XMM7, [EBX+48] ;
  2787. MOVAPS XMM1, [ECX+48] ;
  2788. MULPS XMM3, XMM6 ;
  2789. ADDPS XMM0, XMM3 ;
  2790. MULPS XMM1, XMM7 ;
  2791. ADDPS XMM0, XMM1 ;
  2792. ADD EBX, 64 ;
  2793. ADD ECX, 64 ;
  2794. SUB EAX, 16 ;
  2795. JMP aligned16 ;
  2796. ; LOOP FOR 8 pieces aligned
  2797. aligned8:
  2798. CMP EAX, 8 ;
  2799. JL aligned4 ; ; len < 4- > EXIT TO singlepieces
  2800. MOVAPS XMM1, [EBX] ;
  2801. MOVAPS XMM4, [ECX] ;
  2802. MOVAPS XMM2, [EBX+16] ;
  2803. MOVAPS XMM5, [ECX+16] ;
  2804. MULPS XMM1, XMM4 ;
  2805. ADDPS XMM0, XMM1 ;
  2806. MULPS XMM2, XMM5 ;
  2807. ADDPS XMM0, XMM2 ;
  2808. ADD EBX, 32 ;
  2809. ADD ECX, 32 ;
  2810. SUB EAX, 8 ;
  2811. JMP aligned8 ;
  2812. aligned4:
  2813. CMP EAX, 4 ;
  2814. JL horizontaladd ; ; len < 4- > EXIT TO singlepieces
  2815. MOVAPS XMM1, [EBX] ;
  2816. MOVAPS XMM2, [ECX] ;
  2817. MULPS XMM1, XMM2 ;
  2818. ADDPS XMM0, XMM1 ;
  2819. ADD EBX, 16 ;
  2820. ADD ECX, 16 ;
  2821. SUB EAX, 4 ;
  2822. JMP aligned4 ;
  2823. horizontaladd: ;
  2824. MOVAPS XMM1, XMM0 ;
  2825. ; 1*0 (* dest 0 -> dest 0 *) + 4*1 (* dest 1 -> dest 1 *) +16*0 (* src 0 -> dest 2 *) +64*1 (* src 1 -> dest 3 *) ;
  2826. SHUFPS XMM1, XMM1, 1*0 +4*1 +16*0 +64*1
  2827. ADDPS XMM1, XMM0 ;
  2828. MOVAPS XMM0, XMM1
  2829. SHUFPS XMM0, XMM0, 16*3 ; src 3- > dest 2
  2830. ADDPS XMM0, XMM1 ;
  2831. SHUFPS XMM0, XMM0, 1*2 ; dest 2- > dest 0
  2832. singlepieces: ;
  2833. CMP EAX, 0 ;
  2834. JLE store ; len <= 0- > EXIT
  2835. MOVSS XMM1, [EBX]
  2836. MOVSS XMM2, [ECX]
  2837. MULSS XMM1, XMM2
  2838. ADDSS XMM0, XMM1
  2839. ADD EBX, 4 ; INC(ladr, incl)
  2840. ADD ECX, 4 ; INC(radr, incr)
  2841. DEC EAX ; DEC(len)
  2842. JMP singlepieces ;
  2843. store:
  2844. MOVSS [EDX], XMM0 ;
  2845. endL:
  2846. END AlignedSPRSSE;
  2847. (*
  2848. (* sse version of scalar product *)
  2849. PROCEDURE AlignedSPRSSE( ladr, radr, dadr, rows, stride, dinc, len: LONGINT );
  2850. CODE {SYSTEM.i386, SYSTEM.SSE2}
  2851. ; register initialization
  2852. MOV EDI, [EBP+radr] ; radr start
  2853. MOV EDX, [EBP+dadr] ; EDX reserved for dadr
  2854. MOV ESI, [EBP+rows] ; outer loop counter
  2855. outerloop:
  2856. CMP ESI, 0 ;
  2857. JLE end ;
  2858. MOV EBX, [EBP+ladr] ; EBX reserved for ladr
  2859. MOV ECX, EDI ; ECX reserved for radr
  2860. MOV EAX, [EBP+len] ; EAX reserverd for length
  2861. XORPS XMM0, XMM0 ;
  2862. aligned16:
  2863. CMP EAX, 16 ;
  2864. JL aligned8 ; len < 4- > exit to singlepieces
  2865. MOVAPS XMM1, [EBX] ;
  2866. MOVAPS XMM2, [EBX+16] ;
  2867. MOVAPS XMM3, [EBX+32] ;
  2868. MOVAPS XMM4, [ECX] ;
  2869. MOVAPS XMM5, [ECX+16] ;
  2870. MOVAPS XMM6, [ECX+32] ;
  2871. MULPS XMM1, XMM4 ;
  2872. ADDPS XMM0, XMM1 ;
  2873. MULPS XMM2, XMM5 ;
  2874. ADDPS XMM0, XMM2 ;
  2875. MULPS XMM3, XMM6 ;
  2876. ADDPS XMM0, XMM3 ;
  2877. MOVAPS XMM7, [EBX+48] ;
  2878. MOVAPS XMM1, [ECX+48] ;
  2879. MULPS XMM1, XMM7 ;
  2880. ADDPS XMM0, XMM1 ;
  2881. ADD EBX, 64 ;
  2882. ADD ECX, 64 ;
  2883. SUB EAX, 16 ;
  2884. JMP aligned16 ;
  2885. ; loop for 8 pieces aligned
  2886. aligned8:
  2887. CMP EAX, 8 ;
  2888. JL aligned4 ; ; len < 4- > exit to singlepieces
  2889. MOVAPS XMM1, [EBX] ;
  2890. MOVAPS XMM2, [EBX+16] ;
  2891. MOVAPS XMM4, [ECX] ;
  2892. MOVAPS XMM5, [ECX+16] ;
  2893. MULPS XMM1, XMM4 ;
  2894. ADDPS XMM0, XMM1 ;
  2895. MULPS XMM2, XMM5 ;
  2896. ADDPS XMM0, XMM2 ;
  2897. ADD EBX, 32 ;
  2898. ADD ECX, 32 ;
  2899. SUB EAX, 8 ;
  2900. JMP aligned8 ;
  2901. aligned4:
  2902. CMP EAX, 4 ;
  2903. JL horizontaladd ; ; len < 4- > exit to singlepieces
  2904. MOVAPS XMM1, [EBX] ;
  2905. MOVAPS XMM2, [ECX] ;
  2906. MULPS XMM1, XMM2 ;
  2907. ADDPS XMM0, XMM1 ;
  2908. ADD EBX, 16 ;
  2909. ADD ECX, 16 ;
  2910. SUB EAX, 4 ;
  2911. JMP aligned4 ;
  2912. horizontaladd: ;
  2913. MOVAPS XMM1, XMM0 ;
  2914. SHUFPS XMM1, XMM1, 1*0 (* dest 0 -> dest 0 *) +4*1 (* dest 1 -> dest 1 *) +16*0 (* src 0 -> dest 2 *) +64*1 (* src 1 -> dest 3 *) ;
  2915. ADDPS XMM1, XMM0 ;
  2916. MOVAPS XMM0, XMM1
  2917. SHUFPS XMM0, XMM0, 16*3 ; (* src 3-> dest 2 *)
  2918. ADDPS XMM0, XMM1 ;
  2919. SHUFPS XMM0, XMM0, 1*2 ; (* dest 2 -> dest 0 *)
  2920. singlepieces: ;
  2921. CMP EAX, 0 ;
  2922. JLE store ; len <= 0- > exit
  2923. MOVSS XMM1, [EBX]
  2924. MOVSS XMM2, [ECX]
  2925. MULSS XMM1, XMM2
  2926. ADDSS XMM0, XMM1
  2927. ADD EBX, 4 (* INC(ladr,incl) *)
  2928. ADD ECX, 4 (* INC(radr,incr) *)
  2929. DEC EAX ; DEC(len)
  2930. JMP singlepieces ;
  2931. store:
  2932. MOVSS [EDX], XMM0 ;
  2933. ADD EDX, [EBP+dinc] ;
  2934. ADD EDI, [EBP+stride] ;
  2935. DEC ESI ;
  2936. JMP outerloop ;
  2937. end:
  2938. END AlignedSPRSSE;
  2939. *)
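(* Copy4 copies len 32-bit elements from ladr to dadr with increments linc and dinc.
A sketch of the equivalent loop, in the pseudocode style used in the comments below (not compiled):
WHILE len > 0 DO
x := SYSTEM.GET32( ladr ); SYSTEM.PUT32( dadr, x );
INC( ladr, linc ); INC( dadr, dinc ); DEC( len );
END;
When linc = dinc = 4 the data is contiguous and a single REP MOVSD copy is used instead. *)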
  2940. PROCEDURE Copy4( ladr, dadr: ADDRESS; linc, dinc, len: SIZE);
  2941. CODE {SYSTEM.i386}
2942. MOV ESI, [EBP+ladr] ; ESI := ladr
2943. MOV EDI, [EBP+dadr] ; EDI := dadr
2944. MOV ECX, [EBP+len] ; ECX := len
  2945. MOV EAX, [EBP+linc] ;
  2946. CMP EAX, 4 ;
  2947. JNE loopL ;
  2948. MOV EAX, [EBP+dinc] ;
  2949. CMP EAX, 4 ;
  2950. JNE loopL ;
  2951. fastmove:
  2952. CLD ; incremental
  2953. REP ;
2954. MOVSD ; copy len doublewords in one go
  2955. JMP endL ;
  2956. loopL:
  2957. CMP ECX, 0 ;
  2958. JLE endL ; WHILE ECX > 0 DO
  2959. MOV EAX, [ESI] ; EAX := SYSTEM.GET32(ESI)
2960. MOV [EDI], EAX ; SYSTEM.PUT32(EDI, EAX)
2961. ADD ESI, [EBP+linc] ; INC(ESI, linc)
2962. ADD EDI, [EBP+dinc] ; INC(EDI, dinc)
  2963. DEC ECX ; DEC(ECX)
  2964. JMP loopL
  2965. endL:
  2966. END Copy4;
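(* Copy8 is the 8-byte counterpart of Copy4: each element is moved as two doublewords; in the
contiguous case (linc = dinc = 8) the count is doubled (SHL ECX, 1) so that REP MOVSD transfers
2*len doublewords in one go *)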
  2967. PROCEDURE Copy8( ladr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2968. CODE {SYSTEM.i386}
2969. MOV ESI, [EBP+ladr] ; ESI := ladr
2970. MOV EDI, [EBP+dadr] ; EDI := dadr
2971. MOV ECX, [EBP+len] ; ECX := len
  2972. MOV EAX, [EBP+linc] ;
  2973. CMP EAX, 8 ;
  2974. JNE loopL ;
  2975. MOV EAX, [EBP+dinc] ;
  2976. CMP EAX, 8 ;
  2977. JNE loopL ;
  2978. fastmove:
  2979. SHL ECX, 1 ;
  2980. CLD ; incremental
  2981. REP ;
2982. MOVSD ; copy 2*len doublewords in one go
  2983. JMP endL ;
  2984. loopL:
  2985. CMP ECX, 0 ;
2986. JLE endL ; WHILE ECX > 0 DO
2987. MOV EAX, [ESI] ; EAX := SYSTEM.GET32(ESI)
2988. MOV [EDI], EAX ; SYSTEM.PUT32(EDI, EAX)
2989. MOV EAX, [ESI+4] ; EAX := SYSTEM.GET32(ESI+4)
2990. MOV [EDI+4], EAX ; SYSTEM.PUT32(EDI+4, EAX)
2991. ADD ESI, [EBP+linc] ; INC(ESI, linc)
2992. ADD EDI, [EBP+dinc] ; INC(EDI, dinc)
2993. DEC ECX ; DEC(ECX)
  2994. JMP loopL
  2995. endL:
  2996. END Copy8;
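(* Transpose4A transposes a rows x cols block of 4-byte elements: the inner loop steps through a
source row (linc) while stepping down a destination column (dstride); after each row the source
advances by lstride and the destination by dinc *)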
  2997. PROCEDURE Transpose4A( ladr, dadr: ADDRESS; lstride, linc, dstride, dinc, rows, cols: SIZE );
  2998. CODE {SYSTEM.i386}
  2999. startrows:
  3000. MOV EAX, [EBP+rows] ;
  3001. startouter:
  3002. CMP EAX, 0 ;
  3003. JLE endL ;
  3004. MOV ESI, [EBP+ladr] ;
  3005. MOV EDI, [EBP+dadr] ;
  3006. MOV EBX, [EBP+linc] ;
  3007. MOV ECX, [EBP+dstride] ;
  3008. MOV EAX, [EBP+cols] ;
  3009. startinner:
  3010. CMP EAX, 0 ;
  3011. JLE endinner ;
  3012. MOV EDX, [ESI] ;
  3013. MOV [EDI], EDX ;
  3014. ADD ESI, EBX ;
  3015. ADD EDI, ECX ;
  3016. DEC EAX ;
  3017. JMP startinner ;
  3018. endinner:
  3019. MOV ESI, [EBP+ladr] ;
  3020. ADD ESI, [EBP+lstride] ;
  3021. MOV [EBP+ladr], ESI
  3022. MOV EDI, [EBP+dadr] ;
  3023. ADD EDI, [EBP+dinc] ;
  3024. MOV [EBP+dadr], EDI ;
  3025. MOV EAX, [EBP+rows] ;
  3026. DEC EAX ;
  3027. MOV [EBP+rows], EAX ;
  3028. JMP startouter ;
  3029. endL:
  3030. END Transpose4A;
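(* Transpose4 drives Transpose4A tile by tile: the tile edge BlockSize is derived from L2BlockSize
and the strides (at least 8), so that a BlockSize x BlockSize tile stays resident in the L2 cache
while it is being transposed; Transpose8 below is the analogous driver for 8-byte elements *)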
  3031. PROCEDURE Transpose4( ladr, dadr: ADDRESS; lstride, linc, dstride, dinc, rows, cols: SIZE );
  3032. VAR l, d, c: SIZE; BlockSize: SIZE;
  3033. BEGIN
  3034. BlockSize :=
3035. MIN( L2BlockSize DIV lstride, L2BlockSize DIV dstride );
  3036. BlockSize := MIN( BlockSize, L2BlockSize DIV linc );
  3037. BlockSize := MIN( BlockSize, L2BlockSize DIV dinc );
  3038. BlockSize := MAX( 8, BlockSize );
  3039. (* KernelLog.String("Blocksize = "); KernelLog.Int(BlockSize,10); KernelLog.Ln; *)
  3040. WHILE (rows >= BlockSize) DO
  3041. c := cols; l := ladr; d := dadr;
  3042. WHILE (c >= BlockSize) DO
  3043. Transpose4A( l, d, lstride, linc, dstride, dinc, BlockSize,
  3044. BlockSize );
  3045. DEC( c, BlockSize ); INC( l, BlockSize * linc );
  3046. INC( d, BlockSize * dstride );
  3047. END;
  3048. IF c > 0 THEN
  3049. Transpose4A( l, d, lstride, linc, dstride, dinc, BlockSize, c );
  3050. END;
  3051. DEC( rows, BlockSize ); INC( ladr, BlockSize * lstride );
  3052. INC( dadr, BlockSize * dinc );
  3053. END;
  3054. IF (rows > 0) THEN
  3055. c := cols; l := ladr; d := dadr;
  3056. WHILE (c >= BlockSize) DO
  3057. Transpose4A( l, d, lstride, linc, dstride, dinc, rows,
  3058. BlockSize );
  3059. DEC( c, BlockSize ); INC( l, BlockSize * linc );
  3060. INC( d, BlockSize * dstride );
  3061. END;
  3062. IF c > 0 THEN
  3063. Transpose4A( l, d, lstride, linc, dstride, dinc, rows, c );
  3064. END;
  3065. END;
  3066. END Transpose4;
  3067. PROCEDURE Transpose8( ladr, dadr: ADDRESS; lstride, linc, dstride, dinc, rows, cols: SIZE );
  3068. VAR l, d, c: SIZE; BlockSize: SIZE;
  3069. BEGIN
  3070. BlockSize :=
3071. MIN( L2BlockSize DIV lstride, L2BlockSize DIV dstride );
  3072. BlockSize := MIN( BlockSize, L2BlockSize DIV linc );
  3073. BlockSize := MIN( BlockSize, L2BlockSize DIV dinc );
  3074. BlockSize := MAX( 8, BlockSize );
  3075. (* KernelLog.String("Blocksize = "); KernelLog.Int(BlockSize,10); KernelLog.Ln; *)
  3076. WHILE (rows >= BlockSize) DO
  3077. c := cols; l := ladr; d := dadr;
  3078. WHILE (c >= BlockSize) DO
  3079. Transpose8A( l, d, lstride, linc, dstride, dinc, BlockSize,
  3080. BlockSize );
  3081. DEC( c, BlockSize ); INC( l, BlockSize * linc );
  3082. INC( d, BlockSize * dstride );
  3083. END;
  3084. IF c > 0 THEN
  3085. Transpose8A( l, d, lstride, linc, dstride, dinc, BlockSize, c );
  3086. END;
  3087. DEC( rows, BlockSize ); INC( ladr, lstride * BlockSize );
  3088. INC( dadr, dinc * BlockSize );
  3089. END;
  3090. IF (rows > 0) THEN
  3091. c := cols; l := ladr; d := dadr;
  3092. WHILE (c >= BlockSize) DO
  3093. Transpose8A( l, d, lstride, linc, dstride, dinc, rows,
  3094. BlockSize );
  3095. DEC( c, BlockSize ); INC( l, BlockSize * linc );
  3096. INC( d, BlockSize * dstride );
  3097. END;
  3098. IF c > 0 THEN
  3099. Transpose8A( l, d, lstride, linc, dstride, dinc, rows, c );
  3100. END;
  3101. END;
  3102. END Transpose8;
  3103. PROCEDURE Transpose8A( ladr, dadr: ADDRESS; lstride, linc, dstride, dinc, rows, cols: SIZE );
  3104. CODE {SYSTEM.i386}
  3105. startrows:
  3106. MOV EAX, [EBP+rows] ;
  3107. startouter:
  3108. CMP EAX, 0 ;
  3109. JLE endL ;
  3110. MOV ESI, [EBP+ladr] ;
  3111. MOV EDI, [EBP+dadr] ;
  3112. MOV EBX, [EBP+linc] ;
  3113. MOV ECX, [EBP+dstride] ;
  3114. MOV EAX, [EBP+cols] ;
  3115. startinner:
  3116. CMP EAX, 0 ;
  3117. JLE endinner ;
  3118. MOV EDX, [ESI] ;
  3119. MOV [EDI], EDX ;
  3120. MOV EDX, [ESI+4] ;
  3121. MOV [EDI+4], EDX ;
  3122. ADD ESI, EBX ;
  3123. ADD EDI, ECX ;
  3124. DEC EAX ;
  3125. JMP startinner ;
  3126. endinner:
  3127. MOV ESI, [EBP+ladr] ;
  3128. ADD ESI, [EBP+lstride] ;
  3129. MOV [EBP+ladr], ESI
  3130. MOV EDI, [EBP+dadr] ;
  3131. ADD EDI, [EBP+dinc] ;
  3132. MOV [EBP+dadr], EDI ;
  3133. MOV EAX, [EBP+rows] ;
  3134. DEC EAX ;
  3135. MOV [EBP+rows], EAX ;
  3136. JMP startouter ;
  3137. endL:
  3138. END Transpose8A;
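(* SSEMul24BlockR accumulates, for each row of A, one batch of 24 consecutive columns of C = A*B
(on top of the previous C when add is set) in six XMM registers of four REALs each; the test
"MOV EDX, [ESI+EAX*4] / SHL EDX, 1 / JE SparseEntryEscape" shifts out the sign bit and skips the
broadcast and multiply whenever the current element of A is +0.0 or -0.0, exploiting sparsity *)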
  3139. PROCEDURE SSEMul24BlockR( VAR CbFirst: SIZE;
  3140. StrideA, StrideB, StrideC, Ca, Ra, Cb, Rb: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3141. add: BOOLEAN );
  3142. CODE {SYSTEM.i386, SYSTEM.SSE}
  3143. MatrixOfResultsSetup:
  3144. MOV ECX, 0 ; counter FOR rows IN A-Ra
  3145. RowOfResultsLoop:
  3146. MOV EBX, 0 ; counter FOR columns IN B-Cb
  3147. DotProductSetup:
  3148. MOV ESI, [EBP+matrixA] ; matrixA
  3149. MOV EDI, [EBP+matrixB] ; matrixB
  3150. LEA EDI, [EDI+EBX*4] ; current position IN matrixB
  3151. XORPS XMM2, XMM2
  3152. XORPS XMM3, XMM3
  3153. XORPS XMM4, XMM4
  3154. XORPS XMM5, XMM5
  3155. XORPS XMM6, XMM6
  3156. XORPS XMM7, XMM7
  3157. MOV EAX, 0 ;
  3158. MOV AL, [EBP+add] ;
  3159. CMP AL, 0 ; add?
  3160. JE DotProductLoop ;
  3161. MOV EAX, [EBP+matrixC] ; matrixC
  3162. LEA EAX, [EAX+EBX*4] ; adjust POINTER horizontally TO correct batch OF 24
  3163. MOVUPS XMM2, [EAX]
  3164. MOVUPS XMM3, [EAX+16]
  3165. MOVUPS XMM4, [EAX+32]
  3166. MOVUPS XMM5, [EAX+48]
  3167. MOVUPS XMM6, [EAX+64]
  3168. MOVUPS XMM7, [EAX+80]
  3169. MOV EAX, 0
  3170. DotProductLoop:
  3171. MOV EDX, [ESI+EAX*4]
  3172. SHL EDX, 1
  3173. CMP EDX, 0
  3174. JE SparseEntryEscape
  3175. MOVSS XMM0, [ESI+EAX*4]
  3176. SHUFPS XMM0, XMM0, 0H
  3177. MOVUPS XMM1, [EDI]
  3178. MULPS XMM1, XMM0
  3179. ADDPS XMM2, XMM1
  3180. MOVUPS XMM1, [EDI+16]
  3181. MULPS XMM1, XMM0
  3182. ADDPS XMM3, XMM1
  3183. MOVUPS XMM1, [EDI+32]
  3184. MULPS XMM1, XMM0
  3185. ADDPS XMM4, XMM1
  3186. MOVUPS XMM1, [EDI+48]
  3187. MULPS XMM1, XMM0
  3188. ADDPS XMM5, XMM1
  3189. MOVUPS XMM1, [EDI+64]
  3190. MULPS XMM1, XMM0
  3191. ADDPS XMM6, XMM1
  3192. MOVUPS XMM1, [EDI+80]
  3193. MULPS XMM1, XMM0
  3194. ADDPS XMM7, XMM1
  3195. SparseEntryEscape:
  3196. ADD EDI, [EBP+StrideB] ; StrideB
  3197. INC EAX
  3198. CMP EAX, [EBP+Ca] ; Ca, could also compare TO Rb since they must be equal
  3199. JL DotProductLoop
3200. ; end DotProductLoop
  3201. MOV EAX, [EBP+matrixC] ; matrixC
  3202. LEA EAX, [EAX+EBX*4] ; adjust POINTER horizontally TO correct batch OF 24
  3203. MOVUPS [EAX], XMM2
  3204. MOVUPS [EAX+16], XMM3
  3205. MOVUPS [EAX+32], XMM4
  3206. MOVUPS [EAX+48], XMM5
  3207. MOVUPS [EAX+64], XMM6
  3208. MOVUPS [EAX+80], XMM7
  3209. ADD EBX, 24 ; move over TO next batch OF 24
  3210. MOV EDX, EBX
  3211. ADD EDX, 24
  3212. CMP EDX, [EBP+Cb] ; Cb, check TO see IF row IS complete
  3213. JLE DotProductSetup
3214. ; end RowOfResultsLoop
  3215. MOV EAX, [EBP+matrixA] ; matrixA
  3216. ADD EAX, [EBP+StrideA] ; StrideA
  3217. MOV [EBP+matrixA], EAX ; matrixA
  3218. MOV EAX, [EBP+matrixC] ; matrixC
  3219. ADD EAX, [EBP+StrideC] ; StrideC
  3220. MOV [EBP+matrixC], EAX ; matrixC
  3221. INC ECX
  3222. CMP ECX, [EBP+Ra] ; Ra
  3223. JL RowOfResultsLoop
  3224. Done:
  3225. MOV EAX, [EBP+CbFirst] ; CbFirst
  3226. MOV [EAX], EBX ;
  3227. END SSEMul24BlockR;
3228. (*! might be better to make a 10Block operation and utilize 2 registers for temporary calculations, see article about Emmerald *)
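(* SSEMul12BlockX is the LONGREAL counterpart: six XMM accumulators of two LONGREALs each yield a
batch of 12 columns per pass; the sparse-entry test is commented out here, presumably because the
32-bit check used in the REAL kernels does not carry over directly to 64-bit values *)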
  3229. PROCEDURE SSEMul12BlockX( VAR CbFirst: SIZE;
  3230. StrideA, StrideB, StrideC, Ca, Ra, Cb, Rb: SIZE; matrixA, matrixB, matrixC :ADDRESS;
  3231. add: BOOLEAN );
  3232. CODE {SYSTEM.i386, SYSTEM.SSE2}
  3233. MatrixOfResultsSetup:
  3234. MOV ECX, 0 ; counter FOR rows IN A-Ra
  3235. RowOfResultsLoop:
  3236. MOV EBX, 0 ; counter FOR columns IN B-Cb
  3237. DotProductSetup:
  3238. MOV ESI, [EBP+matrixA] ; matrixA
  3239. MOV EDI, [EBP+matrixB] ; matrixB
  3240. LEA EDI, [EDI+EBX*8]
  3241. XORPD XMM2, XMM2
  3242. XORPD XMM3, XMM3
  3243. XORPD XMM4, XMM4
  3244. XORPD XMM5, XMM5
  3245. XORPD XMM6, XMM6
  3246. XORPD XMM7, XMM7
  3247. MOV EAX, 0 ;
  3248. MOV AL, [EBP+add] ;
  3249. CMP AL, 0 ; add?
  3250. JE DotProductLoop ;
  3251. MOV EAX, [EBP+matrixC] ; matrixC
  3252. LEA EAX, [EAX+EBX*8] ; adjust POINTER horizontally TO correct batch OF 12
  3253. MOVUPD XMM2, [EAX]
  3254. MOVUPD XMM3, [EAX+16]
  3255. MOVUPD XMM4, [EAX+32]
  3256. MOVUPD XMM5, [EAX+48]
  3257. MOVUPD XMM6, [EAX+64]
  3258. MOVUPD XMM7, [EAX+80]
  3259. MOV EAX, 0
  3260. DotProductLoop:
  3261. ; MOV EDX, [ESI+EAX*8]
  3262. ; SHL EDX, 1
  3263. ; CMP EDX, 0
  3264. ; JE SparseEntryEscape
  3265. MOVSD XMM0, [ESI+EAX*8]
  3266. SHUFPD XMM0, XMM0, 0H
  3267. MOVUPD XMM1, [EDI]
  3268. MULPD XMM1, XMM0
  3269. ADDPD XMM2, XMM1
  3270. MOVUPD XMM1, [EDI+16]
  3271. MULPD XMM1, XMM0
  3272. ADDPD XMM3, XMM1
  3273. MOVUPD XMM1, [EDI+32]
  3274. MULPD XMM1, XMM0
  3275. ADDPD XMM4, XMM1
  3276. MOVUPD XMM1, [EDI+48]
  3277. MULPD XMM1, XMM0
  3278. ADDPD XMM5, XMM1
  3279. MOVUPD XMM1, [EDI+64]
  3280. MULPD XMM1, XMM0
  3281. ADDPD XMM6, XMM1
  3282. MOVUPD XMM1, [EDI+80]
  3283. MULPD XMM1, XMM0
  3284. ADDPD XMM7, XMM1
  3285. SparseEntryEscape:
  3286. ADD EDI, [EBP+StrideB] ; StrideB
  3287. INC EAX
  3288. CMP EAX, [EBP+Ca] ; Ca, could also compare TO Rb since they must be equal
3289. JL DotProductLoop ; end DotProductLoop
  3290. MOV EAX , [EBP+matrixC] ; matrixC
3291. LEA EAX, [EAX+EBX*8] ; adjust POINTER horizontally TO correct batch OF 12
  3292. MOVUPD [EAX], XMM2
  3293. MOVUPD [EAX+16], XMM3
  3294. MOVUPD [EAX+32], XMM4
  3295. MOVUPD [EAX+48], XMM5
  3296. MOVUPD [EAX+64], XMM6
  3297. MOVUPD [EAX+80], XMM7
  3298. ADD EBX, 12 ; move over TO next batch OF 12
  3299. MOV EDX, EBX
  3300. ADD EDX, 12
  3301. CMP EDX, [EBP+Cb] ; Cb, check TO see IF row IS complete
  3302. JLE DotProductSetup ; end RowOfResultsLoop
  3303. MOV EAX , [EBP+matrixA] ; matrixA
  3304. ADD EAX, [EBP+StrideA] ; StrideA
  3305. MOV [EBP+matrixA], EAX ; matrixA
  3306. MOV EAX, [EBP+matrixC] ; matrixC
  3307. ADD EAX, [EBP+StrideC] ; StrideC
  3308. MOV [EBP+matrixC], EAX ; matrixC
  3309. INC ECX
  3310. CMP ECX, [EBP+Ra] ; Ra
  3311. JL RowOfResultsLoop
  3312. Done:
  3313. MOV EAX, [EBP+CbFirst] ; CbFirst
  3314. MOV [EAX], EBX ;
  3315. END SSEMul12BlockX;
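(* SSEMul16BlockR, SSEMul8BlockR/X, SSEMul4BlockR/X and SSEMul2BlockX follow the same pattern with
fewer accumulators (4, 2 or 1 XMM registers); they start at column CbFrom and are presumably used
to finish the columns that the wide 24/12-column kernels above leave over *)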
  3316. PROCEDURE SSEMul16BlockR( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3317. add: BOOLEAN );
  3318. CODE {SYSTEM.i386, SYSTEM.SSE}
  3319. MOV ECX, 0 ; counter FOR rows IN A-Ra
  3320. DotProductSetup:
  3321. MOV ESI, [EBP+matrixA] ; matrixA
  3322. MOV EDI, [EBP+matrixB] ; matrixB
  3323. MOV EDX, [EBP+CbFrom] ; CbFrom
  3324. LEA EDI, [EDI+EDX*4]
  3325. XORPS XMM2, XMM2
  3326. XORPS XMM3, XMM3
  3327. XORPS XMM4, XMM4
  3328. XORPS XMM5, XMM5
  3329. MOV EAX, 0 ;
  3330. MOV AL, [EBP+add] ;
  3331. CMP AL, 0 ; add?
  3332. JE DotProductLoop ;
  3333. MOV EAX, [EBP+matrixC] ; matrixC
  3334. LEA EAX, [EAX+EDX*4] ; adjust POINTER horizontally
  3335. MOVUPS XMM2, [EAX]
  3336. MOVUPS XMM3, [EAX+16]
  3337. MOVUPS XMM4, [EAX+32]
  3338. MOVUPS XMM5, [EAX+48]
  3339. MOV EAX, 0
  3340. DotProductLoop:
  3341. MOV EDX, [ESI+EAX*4]
  3342. SHL EDX, 1
  3343. CMP EDX, 0
  3344. JE SparseEntryEscape
  3345. MOVSS XMM0, [ESI+EAX*4]
  3346. SHUFPS XMM0, XMM0, 0H
  3347. MOVUPS XMM1, [EDI]
  3348. MULPS XMM1, XMM0
  3349. ADDPS XMM2, XMM1
  3350. MOVUPS XMM1, [EDI+16]
  3351. MULPS XMM1, XMM0
  3352. ADDPS XMM3, XMM1
  3353. MOVUPS XMM1, [EDI+32]
  3354. MULPS XMM1, XMM0
  3355. ADDPS XMM4, XMM1
  3356. MOVUPS XMM1, [EDI+48]
  3357. MULPS XMM1, XMM0
  3358. ADDPS XMM5, XMM1
  3359. SparseEntryEscape:
  3360. ADD EDI, [EBP+StrideB] ; StrideB
  3361. INC EAX
  3362. CMP EAX, [EBP+Ca] ; Ca
  3363. JL DotProductLoop ; end DotProductLoop
  3364. MOV EAX , [EBP+matrixC] ; matrixC
3365. MOV EDX, [EBP+CbFrom] ; CbFrom
3366. LEA EAX, [EAX+EDX*4] ; adjust POINTER horizontally TO correct batch OF 16
  3367. MOVUPS [EAX], XMM2
  3368. MOVUPS [EAX+16], XMM3
  3369. MOVUPS [EAX+32], XMM4
  3370. MOVUPS [EAX+48], XMM5
  3371. MOV EAX, [EBP+matrixA] ; matrixA
  3372. ADD EAX, [EBP+StrideA] ; StrideA
  3373. MOV [EBP+matrixA], EAX ; matrixA
  3374. MOV EAX, [EBP+matrixC] ; matrixC
  3375. ADD EAX, [EBP+StrideC] ; StrideC
  3376. MOV [EBP+matrixC], EAX ; matrixC
  3377. INC ECX
  3378. CMP ECX, [EBP+Ra] ; Ra
  3379. JL DotProductSetup ;
  3380. END SSEMul16BlockR;
  3381. PROCEDURE SSEMul8BlockX( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3382. add: BOOLEAN );
  3383. CODE {SYSTEM.i386, SYSTEM.SSE2}
  3384. MOV ECX, 0 ; counter FOR rows IN A-Ra
  3385. DotProductSetup:
  3386. MOV ESI, [EBP+matrixA] ; matrixA
  3387. MOV EDI, [EBP+matrixB] ; matrixB
  3388. MOV EDX, [EBP+CbFrom] ; CbFrom
  3389. LEA EDI, [EDI+EDX*8]
  3390. XORPD XMM2, XMM2
  3391. XORPD XMM3, XMM3
  3392. XORPD XMM4, XMM4
  3393. XORPD XMM5, XMM5
  3394. MOV EAX, 0 ;
  3395. MOV AL, [EBP+add] ;
  3396. CMP AL, 0 ; add?
  3397. JE DotProductLoop ;
  3398. MOV EAX, [EBP+matrixC] ; matrixC
3399. LEA EAX, [EAX+EDX*8] ; adjust POINTER horizontally TO correct batch OF 8
  3400. MOVUPD XMM2, [EAX]
  3401. MOVUPD XMM3, [EAX+16]
  3402. MOVUPD XMM4, [EAX+32]
  3403. MOVUPD XMM5, [EAX+48]
  3404. MOV EAX, 0
  3405. DotProductLoop:
  3406. ; MOV EDX, [ESI+EAX*8]
  3407. ; SHL EDX, 1
  3408. ; CMP EDX, 0
  3409. ; JE SparseEntryEscape
  3410. MOVSD XMM0, [ESI+EAX*8]
  3411. SHUFPD XMM0, XMM0, 0H
  3412. MOVUPD XMM1, [EDI]
  3413. MULPD XMM1, XMM0
  3414. ADDPD XMM2, XMM1
  3415. MOVUPD XMM1, [EDI+16]
  3416. MULPD XMM1, XMM0
  3417. ADDPD XMM3, XMM1
  3418. MOVUPD XMM1, [EDI+32]
  3419. MULPD XMM1, XMM0
  3420. ADDPD XMM4, XMM1
  3421. MOVUPD XMM1, [EDI+48]
  3422. MULPD XMM1, XMM0
  3423. ADDPD XMM5, XMM1
  3424. SparseEntryEscape:
  3425. ADD EDI, [EBP+StrideB] ; StrideB
  3426. INC EAX
  3427. CMP EAX, [EBP+Ca] ; Ca
  3428. JL DotProductLoop ; end DotProductLoop
  3429. MOV EAX , [EBP+matrixC] ; matrixC
3430. MOV EDX, [EBP+CbFrom] ; CbFrom
3431. LEA EAX, [EAX+EDX*8] ; adjust POINTER horizontally TO correct batch OF 8
  3432. MOVUPD [EAX], XMM2
  3433. MOVUPD [EAX+16], XMM3
  3434. MOVUPD [EAX+32], XMM4
  3435. MOVUPD [EAX+48], XMM5
  3436. MOV EAX, [EBP+matrixA] ; matrixA
  3437. ADD EAX, [EBP+StrideA] ; StrideA
  3438. MOV [EBP+matrixA], EAX ; matrixA
  3439. MOV EAX, [EBP+matrixC] ; matrixC
  3440. ADD EAX, [EBP+StrideC] ; StrideC
  3441. MOV [EBP+matrixC], EAX ; matrixC
  3442. INC ECX
  3443. CMP ECX, [EBP+Ra] ; Ra
  3444. JL DotProductSetup ;
  3445. END SSEMul8BlockX;
  3446. PROCEDURE SSEMul8BlockR( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3447. add: BOOLEAN );
  3448. CODE {SYSTEM.i386, SYSTEM.SSE}
  3449. MOV ECX, 0 ; counter FOR rows IN A-Ra
  3450. DotProductSetup:
  3451. MOV ESI, [EBP+matrixA] ; matrixA
  3452. MOV EDI, [EBP+matrixB] ; matrixB
  3453. MOV EDX, [EBP+CbFrom] ; CbFrom
  3454. LEA EDI, [EDI+EDX*4]
  3455. XORPS XMM2, XMM2
  3456. XORPS XMM3, XMM3
  3457. MOV EAX, 0 ;
  3458. MOV AL, [EBP+add] ;
  3459. CMP AL, 0 ; add?
  3460. JE DotProductLoop ;
  3461. MOV EAX, [EBP+matrixC] ; matrixC
  3462. LEA EAX, [EAX+EDX*4] ; adjust POINTER horizontally TO correct batch OF 8
  3463. MOVUPS XMM2, [EAX]
  3464. MOVUPS XMM3, [EAX+16]
  3465. MOV EAX, 0
  3466. DotProductLoop:
  3467. MOV EDX, [ESI+EAX*4]
  3468. SHL EDX, 1
  3469. CMP EDX, 0
  3470. JE SparseEntryEscape
  3471. MOVSS XMM0, [ESI+EAX*4]
  3472. SHUFPS XMM0, XMM0, 0H
  3473. MOVUPS XMM1, [EDI]
  3474. MULPS XMM1, XMM0
  3475. ADDPS XMM2, XMM1
  3476. MOVUPS XMM1, [EDI+16]
  3477. MULPS XMM1, XMM0
  3478. ADDPS XMM3, XMM1
  3479. SparseEntryEscape:
  3480. ADD EDI, [EBP+StrideB] ; StrideB
  3481. INC EAX
  3482. CMP EAX, [EBP+Ca] ; Ca
  3483. JL DotProductLoop ; end DotProductLoop
  3484. MOV EAX , [EBP+matrixC] ; matrixC
  3485. MOV EDX, [EBP+CbFrom] ; CbFrom
  3486. LEA EAX, [EAX+EDX*4] ; adjust POINTER horizontally TO correct batch OF 8
  3487. MOVUPS [EAX], XMM2
  3488. MOVUPS [EAX+16], XMM3
  3489. MOV EAX, [EBP+matrixA] ; matrixA
  3490. ADD EAX, [EBP+StrideA] ; StrideA
  3491. MOV [EBP+matrixA], EAX ; matrixA
  3492. MOV EAX, [EBP+matrixC] ; matrixC
  3493. ADD EAX, [EBP+StrideC] ; StrideC
  3494. MOV [EBP+matrixC], EAX ; matrixC
  3495. INC ECX
  3496. CMP ECX, [EBP+Ra] ; Ra
  3497. JL DotProductSetup ;
  3498. END SSEMul8BlockR;
  3499. PROCEDURE SSEMul4BlockX( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3500. add: BOOLEAN );
  3501. CODE {SYSTEM.i386, SYSTEM.SSE2}
  3502. MOV ECX, 0 ; counter FOR rows IN A-Ra
  3503. DotProductSetup:
  3504. MOV EAX, 0 ; cols IN A
  3505. MOV ESI, [EBP+matrixA] ; matrixA
  3506. MOV EDI, [EBP+matrixB] ; matrixB
  3507. MOV EDX, [EBP+CbFrom] ; CbFrom
  3508. LEA EDI, [EDI+EDX*8]
  3509. XORPS XMM2, XMM2
  3510. XORPS XMM3, XMM3
  3511. MOV EAX, 0 ;
  3512. MOV AL, [EBP+add] ;
  3513. CMP AL, 0 ; add?
  3514. JE DotProductLoop ;
  3515. MOV EAX, [EBP+matrixC] ; matrixC
3516. LEA EAX, [EAX+EDX*8] ; adjust POINTER horizontally TO correct batch OF 4
  3517. MOVUPD XMM2, [EAX]
  3518. MOVUPD XMM3, [EAX+16]
  3519. MOV EAX, 0
  3520. DotProductLoop:
  3521. ; MOV EDX, [ESI+EAX*8]
  3522. ; SHL EDX, 1
  3523. ; CMP EDX, 0
  3524. ; JE SparseEntryEscape
  3525. MOVSD XMM0, [ESI+EAX*8]
  3526. SHUFPD XMM0, XMM0, 0H
  3527. MOVUPD XMM1, [EDI]
  3528. MULPD XMM1, XMM0
  3529. ADDPD XMM2, XMM1
  3530. MOVUPD XMM1, [EDI+16]
  3531. MULPD XMM1, XMM0
  3532. ADDPD XMM3, XMM1
  3533. SparseEntryEscape:
  3534. ADD EDI, [EBP+StrideB] ; StrideB
  3535. INC EAX
  3536. CMP EAX, [EBP+Ca] ; Ca
  3537. JL DotProductLoop ; end DotProductLoop
  3538. MOV EAX , [EBP+matrixC] ; matrixC
  3539. MOV EDX, [EBP+CbFrom] ; CbFrom
3540. LEA EAX, [EAX+EDX*8] ; adjust POINTER horizontally TO correct batch OF 4
  3541. MOVUPD [EAX], XMM2
  3542. MOVUPD [EAX+16], XMM3
  3543. MOV EAX, [EBP+matrixA] ; matrixA
  3544. ADD EAX, [EBP+StrideA] ; StrideA
  3545. MOV [EBP+matrixA], EAX ; matrixA
  3546. MOV EAX, [EBP+matrixC] ; matrixC
  3547. ADD EAX, [EBP+StrideC] ; StrideC
  3548. MOV [EBP+matrixC], EAX ; matrixC
  3549. INC ECX
  3550. CMP ECX, [EBP+Ra] ; Ra
  3551. JL DotProductSetup ;
  3552. END SSEMul4BlockX;
  3553. PROCEDURE SSEMul4BlockR( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3554. add: BOOLEAN );
  3555. CODE {SYSTEM.i386, SYSTEM.SSE}
  3556. MOV ECX, 0 ; counter FOR rows IN A-Ra
  3557. DotProductSetup:
  3558. MOV EAX, 0 ; cols IN A
  3559. MOV ESI, [EBP+matrixA] ; matrixA
  3560. MOV EDI, [EBP+matrixB] ; matrixB
  3561. MOV EDX, [EBP+CbFrom] ; CbFrom
  3562. LEA EDI, [EDI+EDX*4]
  3563. XORPS XMM2, XMM2
  3564. MOV EAX, 0 ;
  3565. MOV AL, [EBP+add] ;
  3566. CMP AL, 0 ; add?
  3567. JE DotProductLoop ;
  3568. MOV EAX, [EBP+matrixC] ; matrixC
  3569. LEA EAX, [EAX+EDX*4] ; adjust POINTER horizontally TO correct batch OF 4
  3570. MOVUPS XMM2, [EAX]
  3571. MOV EAX, 0
  3572. DotProductLoop:
  3573. MOV EDX, [ESI+EAX*4]
  3574. SHL EDX, 1
  3575. CMP EDX, 0
  3576. JE SparseEntryEscape
  3577. MOVSS XMM0, [ESI+EAX*4]
  3578. SHUFPS XMM0, XMM0, 0H
  3579. MOVUPS XMM1, [EDI]
  3580. MULPS XMM1, XMM0
  3581. ADDPS XMM2, XMM1
  3582. SparseEntryEscape:
  3583. ADD EDI, [EBP+StrideB] ; StrideB
  3584. INC EAX
  3585. CMP EAX, [EBP+Ca] ; Ca
3586. JL DotProductLoop ; end DotProductLoop
  3587. MOV EAX, [EBP+matrixC] ; matrixC
  3588. MOV EDX, [EBP+CbFrom] ; CbFrom
  3589. LEA EAX, [EAX+EDX*4] ; adjust POINTER horizontally TO correct batch OF 4
  3590. MOVUPS [EAX], XMM2
  3591. MOV EAX, [EBP+matrixA] ; matrixA
  3592. ADD EAX, [EBP+StrideA] ; StrideA
  3593. MOV [EBP+matrixA], EAX ; matrixA
  3594. MOV EAX, [EBP+matrixC] ; matrixC
  3595. ADD EAX, [EBP+StrideC] ; StrideC
  3596. MOV [EBP+matrixC], EAX ; matrixC
  3597. INC ECX
  3598. CMP ECX, [EBP+Ra] ; Ra
  3599. JL DotProductSetup ;
  3600. END SSEMul4BlockR;
  3601. PROCEDURE SSEMul2BlockX( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3602. add: BOOLEAN );
  3603. CODE {SYSTEM.i386, SYSTEM.SSE2}
  3604. MOV ECX, 0 ; counter FOR rows IN A-Ra
  3605. DotProductSetup:
  3606. MOV EAX, 0 ; cols IN A
  3607. MOV ESI, [EBP+matrixA] ; matrixA
  3608. MOV EDI, [EBP+matrixB] ; matrixB
  3609. MOV EDX, [EBP+CbFrom] ; CbFrom
  3610. LEA EDI, [EDI+EDX*8]
  3611. XORPD XMM2, XMM2
  3612. MOV EAX, 0 ;
  3613. MOV AL, [EBP+add] ;
  3614. CMP AL, 0 ; add?
  3615. JE DotProductLoop ;
  3616. MOV EAX, [EBP+matrixC] ; matrixC
3617. LEA EAX, [EAX+EDX*8] ; adjust POINTER horizontally TO correct batch OF 2
  3618. MOVUPD XMM2, [EAX]
  3619. MOV EAX, 0
  3620. DotProductLoop:
  3621. ; MOV EDX, [ESI+EAX*4] ;
  3622. ; SHL EDX, 1 ;
  3623. ; CMP EDX, 0
  3624. ; JE SparseEntryEscape
  3625. MOVSD XMM0, [ESI+EAX*8]
  3626. SHUFPD XMM0, XMM0, 0H
  3627. MOVUPD XMM1, [EDI]
  3628. MULPD XMM1, XMM0
  3629. ADDPD XMM2, XMM1
  3630. SparseEntryEscape:
  3631. ADD EDI, [EBP+StrideB] ; StrideB
  3632. INC EAX
  3633. CMP EAX, [EBP+Ca] ; Ca
  3634. JL DotProductLoop ; end DotProductLoop
  3635. MOV EAX , [EBP+matrixC] ; matrixC
  3636. MOV EDX, [EBP+CbFrom] ; CbFrom
3637. LEA EAX, [EAX+EDX*8] ; adjust POINTER horizontally TO correct batch OF 2
  3638. MOVUPD [EAX], XMM2
  3639. MOV EAX, [EBP+matrixA] ; matrixA
  3640. ADD EAX, [EBP+StrideA] ; StrideA
  3641. MOV [EBP+matrixA], EAX ; matrixA
  3642. MOV EAX, [EBP+matrixC] ; matrixC
  3643. ADD EAX, [EBP+StrideC] ; StrideC
  3644. MOV [EBP+matrixC], EAX ; matrixC
  3645. INC ECX
  3646. CMP ECX, [EBP+Ra] ; Ra
  3647. JL DotProductSetup ;
  3648. END SSEMul2BlockX;
  3649. (****** blocking matrix multiplication with copy of data ******)
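(* MagicBlockR/MagicBlockX choose the L2 blocking parameters: K is split into roughly equal chunks
of at most L1MaxBlockKR/L1MaxBlockKX and rounded up to a multiple of 16; L2BlockN is sized so that
an L2BlockK x L2BlockN panel of B, weighted by L2BARatio against the A panel, fits into L2BlockSize
bytes and is finally rounded up to a multiple of L1BlockN; L2BlockM takes the space that remains *)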
  3650. PROCEDURE MagicBlockR( M, N, K: SIZE;
  3651. VAR L2BlockM, L2BlockN, L2BlockK: SIZE );
  3652. BEGIN
  3653. K := (K DIV L0BlockKR) * L0BlockKR;
  3654. N := (N DIV L1BlockN) * L1BlockN;
  3655. IF M = 0 THEN M := 1 END;
  3656. IF N = 0 THEN N := 1 END;
  3657. IF K = 0 THEN K := 1 END;
  3658. L2BlockK :=
  3659. K DIV ((K + L1MaxBlockKR - 1) DIV L1MaxBlockKR);
  3660. (* Round up to next multiple of 16 *)
  3661. L2BlockK := L2BlockK + (-L2BlockK) MOD 16;
  3662. L2BlockN :=
  3663. L2BlockSize DIV SIZEOF( REAL ) DIV
  3664. (L2BlockK * (L2BARatio + 1));
  3665. IF L2BlockN > N THEN L2BlockN := N
  3666. ELSIF L2BlockN < 1 THEN L2BlockN := 1;
  3667. END;
  3668. L2BlockM :=
  3669. (L2BlockSize DIV SIZEOF( REAL ) - L2BlockN * L2BlockK) DIV
  3670. L2BlockK;
3671. (* Clamp L2BlockM to the range 1..M *)
  3672. IF L2BlockM > M THEN L2BlockM := M
  3673. ELSIF L2BlockM < 1 THEN L2BlockM := 1
  3674. END;
  3675. L2BlockN := L2BlockN + (-L2BlockN) MOD L1BlockN;
  3676. END MagicBlockR;
  3677. PROCEDURE MagicBlockX( M, N, K: SIZE;
  3678. VAR L2BlockM, L2BlockN, L2BlockK:SIZE );
  3679. BEGIN
  3680. K := (K DIV L0BlockKX) * L0BlockKX;
  3681. N := (N DIV L1BlockN) * L1BlockN;
  3682. IF M = 0 THEN M := 1 END;
  3683. IF N = 0 THEN N := 1 END;
  3684. IF K = 0 THEN K := 1 END;
  3685. L2BlockK :=
  3686. K DIV ((K + L1MaxBlockKX - 1) DIV L1MaxBlockKX);
  3687. (* Round up to next multiple of 16 *)
  3688. L2BlockK := L2BlockK + (-L2BlockK) MOD 16;
  3689. L2BlockN :=
  3690. L2BlockSize DIV SIZEOF( LONGREAL ) DIV
  3691. (L2BlockK * (L2BARatio + 1));
  3692. IF L2BlockN > N THEN L2BlockN := N END;
  3693. L2BlockM :=
  3694. (L2BlockSize DIV SIZEOF( LONGREAL ) - L2BlockN * L2BlockK) DIV
  3695. L2BlockK;
3696. (* Clamp L2BlockM to the range 1..M *)
  3697. IF L2BlockM > M THEN L2BlockM := M
  3698. ELSIF L2BlockM < 1 THEN L2BlockM := 1
  3699. END;
  3700. L2BlockN := L2BlockN + (-L2BlockN) MOD L1BlockN;
  3701. END MagicBlockX;
  3702. (*
  3703. PROCEDURE L1Block1X( adrA, adrB, adrC, K: LONGINT ); (* condition: K MOD 4 = 0 *)
  3704. VAR reg: ARRAY 8 OF ARRAY 2 OF LONGREAL;
  3705. PROCEDURE null( i: LONGINT );
  3706. BEGIN
  3707. reg[i, 0] := 0; reg[i, 1] := 0;
  3708. END null;
  3709. PROCEDURE get1( adr, i: LONGINT );
  3710. BEGIN
  3711. SYSTEM.GET( adr, reg[i, 0] );
  3712. IF debug THEN KernelLog.String( "get 1: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 ); KernelLog.Ln; END;
  3713. END get1;
  3714. PROCEDURE get2( adr, i: LONGINT );
  3715. BEGIN
  3716. SYSTEM.GET( adr, reg[i, 0] ); SYSTEM.GET( adr + 8, reg[i, 1] );
  3717. IF debug THEN
  3718. KernelLog.String( "get 2: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 );
  3719. KernelLog.Int( ENTIER( reg[i, 1] + 0.5 ), 5 ); KernelLog.Ln;
  3720. END;
  3721. END get2;
  3722. PROCEDURE mul2( i, j: LONGINT );
  3723. BEGIN
  3724. reg[i, 0] := reg[i, 0] * reg[j, 0]; reg[i, 1] := reg[i, 1] * reg[j, 1];
  3725. END mul2;
  3726. PROCEDURE add2( i, j: LONGINT );
  3727. BEGIN
  3728. reg[i, 0] := reg[i, 0] + reg[j, 0]; reg[i, 1] := reg[i, 1] + reg[j, 1];
  3729. END add2;
  3730. PROCEDURE put1( adr, i: LONGINT );
  3731. BEGIN
  3732. SYSTEM.PUT( adr, reg[i, 0] );
  3733. END put1;
  3734. PROCEDURE horadd( i: LONGINT );
  3735. BEGIN
  3736. reg[i, 0] := reg[i, 0] + reg[i, 1];
  3737. END horadd;
  3738. BEGIN
  3739. IF debug THEN KernelLog.String( "L1Block1" ); KernelLog.Ln; END;
  3740. null( 2 ); get1( adrC, 2 );
  3741. WHILE (K > 0) DO (* padding guaranteed *)
  3742. get2( adrA, 7 ); get2( adrB, 0 ); mul2( 0, 7 ); add2( 2, 0 ); INC( adrB, 16 );
  3743. INC( adrA, 16 ); DEC( K, 2 );
  3744. END;
  3745. horadd( 2 ); put1( adrC, 2 );
  3746. END L1Block1X;
  3747. PROCEDURE L1Block5X( adrA, adrB, adrC, IncC, K: LONGINT ); (* condition: K MOD 4 = 0 *)
  3748. VAR reg: ARRAY 8 OF ARRAY 2 OF LONGREAL;
  3749. PROCEDURE null( i: LONGINT );
  3750. BEGIN
  3751. reg[i, 0] := 0; reg[i, 1] := 0;
  3752. END null;
  3753. PROCEDURE get1( adr, i: LONGINT );
  3754. BEGIN
  3755. SYSTEM.GET( adr, reg[i, 0] );
  3756. IF debug THEN KernelLog.String( "get 1: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 ); KernelLog.Ln; END;
  3757. END get1;
  3758. PROCEDURE get2( adr, i: LONGINT );
  3759. BEGIN
  3760. SYSTEM.GET( adr, reg[i, 0] ); SYSTEM.GET( adr + 8, reg[i, 1] );
  3761. IF debug THEN
  3762. KernelLog.String( "get 2: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 );
  3763. KernelLog.Int( ENTIER( reg[i, 1] + 0.5 ), 5 ); KernelLog.Ln;
  3764. END;
  3765. END get2;
  3766. PROCEDURE mul2( i, j: LONGINT );
  3767. BEGIN
  3768. reg[i, 0] := reg[i, 0] * reg[j, 0]; reg[i, 1] := reg[i, 1] * reg[j, 1];
  3769. END mul2;
  3770. PROCEDURE add2( i, j: LONGINT );
  3771. BEGIN
  3772. reg[i, 0] := reg[i, 0] + reg[j, 0]; reg[i, 1] := reg[i, 1] + reg[j, 1];
  3773. END add2;
  3774. PROCEDURE put1( adr, i: LONGINT );
  3775. BEGIN
  3776. SYSTEM.PUT( adr, reg[i, 0] );
  3777. END put1;
  3778. PROCEDURE horadd( i: LONGINT );
  3779. BEGIN
  3780. reg[i, 0] := reg[i, 0] + reg[i, 1];
  3781. END horadd;
  3782. BEGIN
  3783. IF debug THEN KernelLog.String( "L1Block5" ); KernelLog.Ln; END;
  3784. null( 2 ); null( 3 ); null( 4 ); null( 5 ); null( 6 ); get1( adrC, 2 );
  3785. get1( adrC + IncC, 3 ); get1( adrC + 2 * IncC, 4 ); get1( adrC + 3 * IncC, 5 );
  3786. get1( adrC + 4 * IncC, 6 );
  3787. WHILE (K > 0) DO (* padding guaranteed *)
  3788. get2( adrA, 7 ); get2( adrB, 0 ); mul2( 0, 7 ); add2( 2, 0 );
  3789. get2( adrB + 16, 0 ); mul2( 0, 7 ); add2( 3, 0 ); get2( adrB + 32, 0 );
  3790. mul2( 0, 7 ); add2( 4, 0 ); get2( adrB + 48, 0 ); mul2( 0, 7 );
  3791. add2( 5, 0 ); get2( adrB + 64, 0 ); mul2( 0, 7 ); add2( 6, 0 ); INC( adrB, 80 );
  3792. INC( adrA, 16 ); DEC( K, 2 );
  3793. END;
  3794. horadd( 2 ); horadd( 3 ); horadd( 4 ); horadd( 5 ); horadd( 6 );
  3795. put1( adrC, 2 ); put1( adrC + IncC, 3 ); put1( adrC + 2 * IncC, 4 );
  3796. put1( adrC + 3 * IncC, 5 ); put1( adrC + 4 * IncC, 6 );
  3797. END L1Block5X;
  3798. PROCEDURE L1Block1R( adrA, adrB, adrC, K: LONGINT ); (* condition: K MOD 4 = 0 *)
  3799. VAR reg: ARRAY 8 OF ARRAY 4 OF REAL;
  3800. PROCEDURE null( i: LONGINT );
  3801. BEGIN
  3802. reg[i, 0] := 0; reg[i, 1] := 0; reg[i, 2] := 0; reg[i, 3] := 0;
  3803. END null;
  3804. PROCEDURE get1( adr, i: LONGINT );
  3805. BEGIN
  3806. SYSTEM.GET( adr, reg[i, 0] );
  3807. IF debug THEN KernelLog.String( "get 1: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 ); KernelLog.Ln; END;
  3808. END get1;
  3809. PROCEDURE get4( adr, i: LONGINT );
  3810. BEGIN
  3811. SYSTEM.GET( adr, reg[i, 0] ); SYSTEM.GET( adr + 4, reg[i, 1] );
  3812. SYSTEM.GET( adr + 8, reg[i, 2] ); SYSTEM.GET( adr + 12, reg[i, 3] );
  3813. IF debug THEN
  3814. KernelLog.String( "get 4: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 );
  3815. KernelLog.Int( ENTIER( reg[i, 1] + 0.5 ), 5 ); KernelLog.Int( ENTIER( reg[i, 2] + 0.5 ), 5 );
  3816. KernelLog.Int( ENTIER( reg[i, 3] + 0.5 ), 5 ); KernelLog.Ln;
  3817. END;
  3818. END get4;
  3819. PROCEDURE mul4( i, j: LONGINT );
  3820. BEGIN
  3821. reg[i, 0] := reg[i, 0] * reg[j, 0]; reg[i, 1] := reg[i, 1] * reg[j, 1];
  3822. reg[i, 2] := reg[i, 2] * reg[j, 2]; reg[i, 3] := reg[i, 3] * reg[j, 3];
  3823. END mul4;
  3824. PROCEDURE add4( i, j: LONGINT );
  3825. BEGIN
  3826. reg[i, 0] := reg[i, 0] + reg[j, 0]; reg[i, 1] := reg[i, 1] + reg[j, 1];
  3827. reg[i, 2] := reg[i, 2] + reg[j, 2]; reg[i, 3] := reg[i, 3] + reg[j, 3];
  3828. END add4;
  3829. PROCEDURE put1( adr, i: LONGINT );
  3830. BEGIN
  3831. SYSTEM.PUT( adr, reg[i, 0] );
  3832. END put1;
  3833. PROCEDURE horadd( i: LONGINT );
  3834. BEGIN
  3835. reg[i, 0] := reg[i, 0] + reg[i, 1] + reg[i, 2] + reg[i, 3];
  3836. END horadd;
  3837. BEGIN
  3838. IF debug THEN KernelLog.String( "L1Block1" ); KernelLog.Ln; END;
  3839. null( 2 ); get1( adrC, 2 );
  3840. WHILE (K > 0) DO (* padding guaranteed *)
  3841. get4( adrA, 7 ); get4( adrB, 0 ); mul4( 0, 7 ); add4( 2, 0 ); INC( adrB, 16 );
  3842. INC( adrA, 16 ); DEC( K, 4 );
  3843. END;
  3844. horadd( 2 ); put1( adrC, 2 );
  3845. END L1Block1R;
  3846. PROCEDURE L1Block5R( adrA, adrB, adrC, IncC, K: LONGINT ); (* condition: K MOD 4 = 0 *)
  3847. VAR reg: ARRAY 8 OF ARRAY 4 OF REAL;
  3848. PROCEDURE null( i: LONGINT );
  3849. BEGIN
  3850. reg[i, 0] := 0; reg[i, 1] := 0; reg[i, 2] := 0; reg[i, 3] := 0;
  3851. END null;
  3852. PROCEDURE get1( adr, i: LONGINT );
  3853. BEGIN
  3854. SYSTEM.GET( adr, reg[i, 0] );
  3855. IF debug THEN KernelLog.String( "get 1: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 ); KernelLog.Ln; END;
  3856. END get1;
  3857. PROCEDURE get4( adr, i: LONGINT );
  3858. BEGIN
  3859. SYSTEM.GET( adr, reg[i, 0] ); SYSTEM.GET( adr + 4, reg[i, 1] );
  3860. SYSTEM.GET( adr + 8, reg[i, 2] ); SYSTEM.GET( adr + 12, reg[i, 3] );
  3861. IF debug THEN
  3862. KernelLog.String( "get 4: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 );
  3863. KernelLog.Int( ENTIER( reg[i, 1] + 0.5 ), 5 ); KernelLog.Int( ENTIER( reg[i, 2] + 0.5 ), 5 );
  3864. KernelLog.Int( ENTIER( reg[i, 3] + 0.5 ), 5 ); KernelLog.Ln;
  3865. END;
  3866. END get4;
  3867. PROCEDURE mul4( i, j: LONGINT );
  3868. BEGIN
  3869. reg[i, 0] := reg[i, 0] * reg[j, 0]; reg[i, 1] := reg[i, 1] * reg[j, 1];
  3870. reg[i, 2] := reg[i, 2] * reg[j, 2]; reg[i, 3] := reg[i, 3] * reg[j, 3];
  3871. END mul4;
  3872. PROCEDURE add4( i, j: LONGINT );
  3873. BEGIN
  3874. reg[i, 0] := reg[i, 0] + reg[j, 0]; reg[i, 1] := reg[i, 1] + reg[j, 1];
  3875. reg[i, 2] := reg[i, 2] + reg[j, 2]; reg[i, 3] := reg[i, 3] + reg[j, 3];
  3876. END add4;
  3877. PROCEDURE put1( adr, i: LONGINT );
  3878. BEGIN
  3879. SYSTEM.PUT( adr, reg[i, 0] );
  3880. END put1;
  3881. PROCEDURE horadd( i: LONGINT );
  3882. BEGIN
  3883. reg[i, 0] := reg[i, 0] + reg[i, 1] + reg[i, 2] + reg[i, 3];
  3884. END horadd;
  3885. BEGIN
  3886. IF debug THEN KernelLog.String( "L1Block5" ); KernelLog.Ln; END;
  3887. null( 2 ); null( 3 ); null( 4 ); null( 5 ); null( 6 ); get1( adrC, 2 );
  3888. get1( adrC + IncC, 3 ); get1( adrC + 2 * IncC, 4 ); get1( adrC + 3 * IncC, 5 );
  3889. get1( adrC + 4 * IncC, 6 );
  3890. WHILE (K > 0) DO (* padding guaranteed *)
  3891. get4( adrA, 7 ); get4( adrB, 0 ); mul4( 0, 7 ); add4( 2, 0 );
  3892. get4( adrB + 16, 0 ); mul4( 0, 7 ); add4( 3, 0 ); get4( adrB + 32, 0 );
  3893. mul4( 0, 7 ); add4( 4, 0 ); get4( adrB + 48, 0 ); mul4( 0, 7 );
  3894. add4( 5, 0 ); get4( adrB + 64, 0 ); mul4( 0, 7 ); add4( 6, 0 ); INC( adrB, 80 );
  3895. INC( adrA, 16 ); DEC( K, 4 );
  3896. END;
  3897. horadd( 2 ); horadd( 3 ); horadd( 4 ); horadd( 5 ); horadd( 6 );
  3898. put1( adrC, 2 ); put1( adrC + IncC, 3 ); put1( adrC + 2 * IncC, 4 );
  3899. put1( adrC + 3 * IncC, 5 ); put1( adrC + 4 * IncC, 6 );
  3900. END L1Block5R;
  3901. *)
  3902. PROCEDURE DispCR( adrM: ADDRESS;
  3903. inc, stride, M, N: SIZE );
  3904. VAR i, j: SIZE; adr: ADDRESS; val: REAL;
  3905. BEGIN
  3906. FOR i := 0 TO M - 1 DO
  3907. adr := adrM + i * stride;
  3908. FOR j := 0 TO N - 1 DO
  3909. SYSTEM.GET( adr, val );
  3910. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); INC( adr, inc );
  3911. END;
  3912. KernelLog.Ln;
  3913. END;
  3914. END DispCR;
  3915. PROCEDURE DispCX( adrM: ADDRESS;
  3916. inc, stride, M, N: SIZE );
  3917. VAR i, j: SIZE; adr: ADDRESS; val: LONGREAL;
  3918. BEGIN
  3919. FOR i := 0 TO M - 1 DO
  3920. adr := adrM + i * stride;
  3921. FOR j := 0 TO N - 1 DO
  3922. SYSTEM.GET( adr, val );
  3923. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); INC( adr, inc );
  3924. END;
  3925. KernelLog.Ln;
  3926. END;
  3927. END DispCX;
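(* L3BlockX (and L3BlockR below for REAL) multiply packed operands: A and B are expected in the
contiguous, zero-padded layout produced by CopyAX/CopyBX (rows of A of length Align2(K), B in
L1BlockN-wide panels); the outer loops walk L2BlockM x L2BlockN x L2BlockK tiles and the local
L2Block procedure dispatches to the L1Block* kernels (SSE or generic) for the innermost products *)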
  3928. PROCEDURE L3BlockX( matrixA, matrixB, matrixC: ADDRESS;
  3929. M, N, K, incC, strideC, L2BlockM, L2BlockN, L2BlockK: SIZE );
  3930. (*
  3931. K N
  3932. *** N *****
  3933. M *** ****** -> ***** M
  3934. *** K ****** *****
  3935. *** ****** *****
  3936. A * B -> C
  3937. *)
  3938. VAR m, n, k, a1, b1: SIZE; adrA, adrB, adrC: ADDRESS;
  3939. KAligned: SIZE;
  3940. CONST Size = SIZEOF( LONGREAL );
  3941. PROCEDURE L2Block( matrixA, matrixB, matrixC: ADDRESS; M, N, K: SIZE );
  3942. (* M,N and K arbitrary ! *)
  3943. VAR adrA, adrB, adrC: ADDRESS; aadrA, aadrB: ADDRESS;
  3944. m, k, KAligned: SIZE;
  3945. BEGIN
  3946. KAligned := Align2( K ) * 8;
  3947. IF debug THEN ASSERT( M > 0 ); ASSERT( K > 0 );
  3948. END;
  3949. adrB := matrixB;
  3950. WHILE (N >= L1BlockN) DO
  3951. IF debug THEN KernelLog.String( "LoopL2N" ); KernelLog.Ln END;
  3952. adrC := matrixC; adrA := matrixA; m := M;
  3953. WHILE (m > 0) DO
  3954. IF debug THEN KernelLog.String( "LoopL2M" ); KernelLog.Ln END;
  3955. IF SSE THEN
  3956. L1Block5XSSE( adrA, adrB, adrC, incC, K ); (* L1 Block *)
  3957. ELSE
  3958. aadrA := adrA; aadrB := adrB; k := K;
  3959. WHILE (k > 0) DO
  3960. L1Block1XA( aadrA, aadrB, adrC, 2 );
  3961. L1Block1XA( aadrA, aadrB + 16, adrC + incC, 2 );
  3962. L1Block1XA( aadrA, aadrB + 32, adrC + 2 * incC,
  3963. 2 );
  3964. L1Block1XA( aadrA, aadrB + 48, adrC + 3 * incC,
  3965. 2 );
  3966. L1Block1XA( aadrA, aadrB + 64, adrC + 4 * incC,
  3967. 2 );
  3968. DEC( k, 2 ); INC( aadrA, 16 );
  3969. INC( aadrB, 16 * L1BlockN );
  3970. END;
  3971. END;
  3972. IF debug THEN
  3973. DispCX( matrixC, incC, strideC, M, N );
  3974. END;
  3975. INC( adrA, KAligned ); INC( adrC, strideC );
  3976. DEC( m );
  3977. END;
  3978. INC( matrixC, L1BlockN * incC );
  3979. INC( adrB, L1BlockN * KAligned ); DEC( N, L1BlockN );
  3980. END;
  3981. WHILE (N > 0) DO
  3982. IF debug THEN KernelLog.String( "LoopL2N rest" ); KernelLog.Ln END;
  3983. adrC := matrixC; adrA := matrixA; m := M;
  3984. WHILE (m > 0) DO
  3985. IF debug THEN KernelLog.String( "LoopL2M" ); KernelLog.Ln END;
  3986. IF SSE THEN
  3987. L1Block1XSSE( adrA, adrB, adrC, K ); (* L1 Block *)
  3988. ELSE L1Block1XA( adrA, adrB, adrC, K );
  3989. END;
  3990. IF debug THEN
  3991. DispCX( matrixC, incC, strideC, M, N );
  3992. END;
  3993. INC( adrA, KAligned ); INC( adrC, strideC );
  3994. DEC( m );
  3995. END;
  3996. INC( matrixC, incC ); INC( adrB, KAligned ); DEC( N );
  3997. END;
  3998. END L2Block;
  3999. BEGIN
  4000. KAligned := Align2( K ) * 8;
  4001. ASSERT( L2BlockK MOD 2 = 0 );
  4002. IF SSE THEN ASSERT( L2BlockN MOD L1BlockN = 0 ); END;
  4003. m := M; n := N; k := K; a1 := matrixA; adrA := matrixA;
  4004. b1 := matrixB; adrB := matrixB; adrC := matrixC;
  4005. WHILE (n >= L2BlockN) DO
  4006. IF debug THEN KernelLog.String( "LoopL3N" ); KernelLog.Ln END;
  4007. a1 := matrixA; adrC := matrixC; m := M;
  4008. WHILE (m >= L2BlockM) DO
  4009. IF debug THEN KernelLog.String( "LoopL3M" ); KernelLog.Ln END;
  4010. adrA := a1; adrB := b1; k := K;
  4011. (* core: do matching level 2 cache Blocks *)
  4012. WHILE (k >= L2BlockK) DO
  4013. IF debug THEN KernelLog.String( "LoopL3K" ); KernelLog.Ln END;
  4014. L2Block( adrA, adrB, adrC, L2BlockM, L2BlockN,
  4015. L2BlockK );
  4016. INC( adrA, L2BlockK * L2BlockM * Size );
  4017. INC( adrB, L2BlockK * L2BlockN * Size ); (* no padding required *)
  4018. DEC( k, L2BlockK );
  4019. END;
  4020. (* core: do rest of k *)
  4021. IF k > 0 THEN
  4022. L2Block( adrA, adrB, adrC, L2BlockM, L2BlockN, k );
  4023. END;
  4024. INC( a1, KAligned * L2BlockM );
  4025. INC( adrC, L2BlockM * strideC ); DEC( m, L2BlockM );
  4026. END;
  4027. IF m > 0 THEN
  4028. (* clean up M *)
  4029. adrA := a1; adrB := b1; k := K;
  4030. WHILE (k >= L2BlockK) DO
  4031. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4032. L2Block( adrA, adrB, adrC, m, L2BlockN, L2BlockK );
  4033. INC( adrA, L2BlockK * Size * m );
  4034. INC( adrB, L2BlockK * L2BlockN * Size );
  4035. DEC( k, L2BlockK );
  4036. END;
  4037. IF debug THEN KernelLog.String( "LoopL3K rest k" ); KernelLog.Ln END;
  4038. IF k > 0 THEN
  4039. L2Block( adrA, adrB, adrC, m, L2BlockN, k );
  4040. END;
  4041. END;
  4042. INC( b1, L2BlockN * KAligned );
  4043. INC( matrixC, L2BlockN * incC ); DEC( n, L2BlockN );
  4044. END;
  4045. IF (n = 0) THEN RETURN
  4046. END;
  4047. a1 := matrixA; adrC := matrixC; m := M;
  4048. WHILE (m >= L2BlockM) DO
  4049. IF debug THEN KernelLog.String( "LoopL3M rest" ); KernelLog.Ln END;
  4050. adrA := a1; adrB := b1; k := K;
  4051. WHILE (k >= L2BlockK) DO
  4052. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4053. L2Block( adrA, adrB, adrC, L2BlockM, n, L2BlockK );
  4054. INC( adrA, L2BlockM * L2BlockK * Size );
  4055. INC( adrB, L2BlockK * n * Size ); DEC( k, L2BlockK );
  4056. END;
  4057. IF k > 0 THEN
  4058. L2Block( adrA, adrB, adrC, L2BlockM, n, k );
  4059. END;
  4060. INC( a1, L2BlockM * KAligned );
  4061. INC( adrC, L2BlockM * strideC ); DEC( m, L2BlockM );
  4062. END;
  4063. IF (m = 0) THEN RETURN
  4064. END;
  4065. adrA := a1; adrB := b1; k := K;
  4066. WHILE (k >= L2BlockK) DO
  4067. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4068. L2Block( adrA, adrB, adrC, m, n, L2BlockK );
  4069. INC( adrA, L2BlockK * m * Size );
  4070. INC( adrB, L2BlockK * n * Size ); DEC( k, L2BlockK );
  4071. END;
  4072. IF k > 0 THEN L2Block( adrA, adrB, adrC, m, n, k );
  4073. END;
  4074. END L3BlockX;
  4075. PROCEDURE L3BlockR( matrixA, matrixB, matrixC: ADDRESS;
  4076. M, N, K, incC, strideC, L2BlockM, L2BlockN, L2BlockK: SIZE );
  4077. (*
  4078. K N
  4079. *** N *****
  4080. M *** ****** -> ***** M
  4081. *** K ****** *****
  4082. *** ****** *****
  4083. A * B -> C
  4084. *)
  4085. VAR m, n, k, a1, b1: SIZE; adrA, adrB, adrC: ADDRESS;
  4086. KAligned: SIZE;
  4087. CONST Size = SIZEOF( REAL );
  4088. PROCEDURE L2Block( matrixA, matrixB, matrixC: ADDRESS; M, N, K: SIZE );
  4089. (* M,N and K arbitrary ! *)
  4090. VAR adrA, adrB, adrC: ADDRESS; aadrA, aadrB: ADDRESS;
  4091. m, KAligned, k: SIZE;
  4092. BEGIN
  4093. KAligned := Align4( K ) * 4;
  4094. IF debug THEN ASSERT( M > 0 ); ASSERT( K > 0 );
  4095. END;
  4096. adrB := matrixB;
  4097. WHILE (N >= L1BlockN) DO
  4098. IF debug THEN KernelLog.String( "LoopL2N" ); KernelLog.Ln END;
  4099. adrC := matrixC; adrA := matrixA; m := M;
  4100. WHILE (m > 0) DO
  4101. IF debug THEN KernelLog.String( "LoopL2M" ); KernelLog.Ln END;
  4102. IF SSE THEN
  4103. L1Block5RSSE( adrA, adrB, adrC, incC, K ); (* L1 Block *)
  4104. ELSE
  4105. aadrA := adrA; aadrB := adrB; k := K;
  4106. WHILE (k > 0) DO
  4107. L1Block1RA( aadrA, aadrB, adrC, 4 );
  4108. L1Block1RA( aadrA, aadrB + 16, adrC + incC, 4 );
  4109. L1Block1RA( aadrA, aadrB + 32, adrC + 2 * incC,
  4110. 4 );
  4111. L1Block1RA( aadrA, aadrB + 48, adrC + 3 * incC,
  4112. 4 );
  4113. L1Block1RA( aadrA, aadrB + 64, adrC + 4 * incC,
  4114. 4 );
  4115. DEC( k, 4 ); INC( aadrA, 16 );
  4116. INC( aadrB, 16 * L1BlockN );
  4117. END;
  4118. END;
  4119. IF debug THEN
  4120. DispCR( matrixC, incC, strideC, M, N );
  4121. END;
  4122. INC( adrA, KAligned ); INC( adrC, strideC );
  4123. DEC( m );
  4124. END;
  4125. INC( matrixC, L1BlockN * incC );
  4126. INC( adrB, L1BlockN * KAligned ); DEC( N, L1BlockN );
  4127. END;
  4128. WHILE (N > 0) DO
  4129. IF debug THEN KernelLog.String( "LoopL2N rest" ); KernelLog.Ln END;
  4130. adrC := matrixC; adrA := matrixA; m := M;
  4131. WHILE (m > 0) DO
  4132. IF debug THEN KernelLog.String( "LoopL2M" ); KernelLog.Ln END;
  4133. IF SSE THEN
  4134. L1Block1RSSE( adrA, adrB, adrC, K ); (* L1 Block *)
  4135. ELSE L1Block1RA( adrA, adrB, adrC, K );
  4136. END;
  4137. IF debug THEN
  4138. DispCR( matrixC, incC, strideC, M, N );
  4139. END;
  4140. INC( adrA, KAligned ); INC( adrC, strideC );
  4141. DEC( m );
  4142. END;
  4143. INC( matrixC, incC ); INC( adrB, KAligned ); DEC( N );
  4144. END;
  4145. END L2Block;
  4146. BEGIN
  4147. KAligned := Align4( K ) * 4;
  4148. ASSERT( L2BlockK MOD 4 = 0 );
  4149. IF SSE THEN ASSERT( L2BlockN MOD L1BlockN = 0 ); END;
  4150. m := M; n := N; k := K; a1 := matrixA; adrA := matrixA;
  4151. b1 := matrixB; adrB := matrixB; adrC := matrixC;
  4152. WHILE (n >= L2BlockN) DO
  4153. IF debug THEN KernelLog.String( "LoopL3N" ); KernelLog.Ln END;
  4154. a1 := matrixA; adrC := matrixC; m := M;
  4155. WHILE (m >= L2BlockM) DO
  4156. IF debug THEN KernelLog.String( "LoopL3M" ); KernelLog.Ln END;
  4157. adrA := a1; adrB := b1; k := K;
  4158. (* core: do matching level 2 cache Blocks *)
  4159. WHILE (k >= L2BlockK) DO
  4160. IF debug THEN KernelLog.String( "LoopL3K" ); KernelLog.Ln END;
  4161. L2Block( adrA, adrB, adrC, L2BlockM, L2BlockN,
  4162. L2BlockK );
  4163. INC( adrA, L2BlockK * L2BlockM * Size );
  4164. INC( adrB, L2BlockK * L2BlockN * Size ); (* no padding required *)
  4165. DEC( k, L2BlockK );
  4166. END;
  4167. (* core: do rest of k *)
  4168. IF k > 0 THEN
  4169. L2Block( adrA, adrB, adrC, L2BlockM, L2BlockN, k );
  4170. END;
  4171. INC( a1, KAligned * L2BlockM );
  4172. INC( adrC, L2BlockM * strideC ); DEC( m, L2BlockM );
  4173. END;
  4174. IF m > 0 THEN
  4175. (* clean up M *)
  4176. adrA := a1; adrB := b1; k := K;
  4177. WHILE (k >= L2BlockK) DO
  4178. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4179. L2Block( adrA, adrB, adrC, m, L2BlockN, L2BlockK );
  4180. INC( adrA, L2BlockK * Size * m );
  4181. INC( adrB, L2BlockK * L2BlockN * Size );
  4182. DEC( k, L2BlockK );
  4183. END;
  4184. IF debug THEN KernelLog.String( "LoopL3K rest k" ); KernelLog.Ln END;
  4185. IF k > 0 THEN
  4186. L2Block( adrA, adrB, adrC, m, L2BlockN, k );
  4187. END;
  4188. END;
  4189. INC( b1, L2BlockN * KAligned );
  4190. INC( matrixC, L2BlockN * incC ); DEC( n, L2BlockN );
  4191. END;
  4192. IF (n = 0) THEN RETURN
  4193. END;
  4194. a1 := matrixA; adrC := matrixC; m := M;
  4195. WHILE (m >= L2BlockM) DO
  4196. IF debug THEN KernelLog.String( "LoopL3M rest" ); KernelLog.Ln END;
  4197. adrA := a1; adrB := b1; k := K;
  4198. WHILE (k >= L2BlockK) DO
  4199. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4200. L2Block( adrA, adrB, adrC, L2BlockM, n, L2BlockK );
  4201. INC( adrA, L2BlockM * L2BlockK * Size );
  4202. INC( adrB, L2BlockK * n * Size ); DEC( k, L2BlockK );
  4203. END;
  4204. IF k > 0 THEN
  4205. L2Block( adrA, adrB, adrC, L2BlockM, n, k );
  4206. END;
  4207. INC( a1, L2BlockM * KAligned );
  4208. INC( adrC, L2BlockM * strideC ); DEC( m, L2BlockM );
  4209. END;
  4210. IF (m = 0) THEN RETURN
  4211. END;
  4212. adrA := a1; adrB := b1; k := K;
  4213. WHILE (k >= L2BlockK) DO
  4214. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4215. L2Block( adrA, adrB, adrC, m, n, L2BlockK );
  4216. INC( adrA, L2BlockK * m * Size );
  4217. INC( adrB, L2BlockK * n * Size ); DEC( k, L2BlockK );
  4218. END;
  4219. IF k > 0 THEN L2Block( adrA, adrB, adrC, m, n, k );
  4220. END;
  4221. END L3BlockR;
  4222. PROCEDURE Align( adr: ADDRESS; align: SIZE ): ADDRESS;
  4223. BEGIN
4224. RETURN adr + (-adr) MOD align; (* round adr up to the next multiple of align, e.g. 16 bytes for 128-bit SSE alignment *)
  4225. END Align;
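(* CopyAX packs matrix A (LONGREAL) into a contiguous work buffer: each row of K elements is copied
with MovX and zero-padded to an even length, visiting L2BlockM x L2BlockK tiles in the same order as
the blocked multiplication; CopyAR below is the REAL variant, padding rows to a multiple of 4 *)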
  4226. PROCEDURE CopyAX( matrixA, dest: ADDRESS;
  4227. IncA, StrideA: SIZE;
  4228. K, M, L2BlockK, L2BlockM: SIZE );
  4229. VAR m, k: SIZE; adrA: ADDRESS; t: HUGEINT;
  4230. PROCEDURE CopyMK( matrixA: ADDRESS; M, K: SIZE );
  4231. VAR rest: SIZE;
  4232. BEGIN
  4233. IF debug THEN
  4234. KernelLog.String( "CopyMK:" ); KernelLog.Int( M, 10 ); KernelLog.Int( K, 10 );
  4235. KernelLog.Ln;
  4236. END;
  4237. rest := (-K) MOD 2;
  4238. WHILE (M > 0) DO
  4239. MovX( matrixA, dest, IncA, K ); INC( dest, K * 8 );
  4240. IF rest # 0 THEN
  4241. ZeroX( dest, rest ); INC( dest, 8 * rest );
  4242. END;
  4243. INC( matrixA, StrideA ); DEC( M );
  4244. END;
  4245. END CopyMK;
  4246. BEGIN
  4247. Tic( t ); m := M;
  4248. WHILE (m >= L2BlockM) DO
  4249. k := K; adrA := matrixA;
  4250. WHILE (k >= L2BlockK) DO
  4251. CopyMK( adrA, L2BlockM, L2BlockK );
  4252. INC( adrA, L2BlockK * IncA ); DEC( k, L2BlockK );
  4253. END;
  4254. IF k > 0 THEN CopyMK( adrA, L2BlockM, k ); END;
  4255. INC( matrixA, L2BlockM * StrideA ); DEC( m, L2BlockM );
  4256. END;
  4257. adrA := matrixA; k := K;
  4258. WHILE (k >= L2BlockK) DO
  4259. CopyMK( adrA, m, L2BlockK );
  4260. INC( adrA, L2BlockK * IncA ); DEC( k, L2BlockK );
  4261. END;
  4262. IF k > 0 THEN CopyMK( adrA, m, k ); END;
  4263. Toc( t, copyT );
  4264. END CopyAX;
  4265. PROCEDURE CopyAR( matrixA, dest: ADDRESS;
  4266. IncA, StrideA: SIZE;
  4267. K, M, L2BlockK, L2BlockM: SIZE );
  4268. VAR m, k: SIZE; adrA: ADDRESS; t: HUGEINT;
  4269. PROCEDURE CopyMK( matrixA: ADDRESS; M, K: SIZE );
  4270. VAR rest: SIZE;
  4271. BEGIN
  4272. rest := (-K) MOD 4;
  4273. WHILE (M > 0) DO
  4274. MovR( matrixA, dest, IncA, K ); INC( dest, K * 4 );
  4275. IF rest # 0 THEN
  4276. ZeroR( dest, rest ); INC( dest, 4 * rest );
  4277. END;
  4278. INC( matrixA, StrideA ); DEC( M );
  4279. END;
  4280. END CopyMK;
  4281. BEGIN
  4282. Tic( t ); m := M;
  4283. WHILE (m >= L2BlockM) DO
  4284. k := K; adrA := matrixA;
  4285. WHILE (k >= L2BlockK) DO
  4286. CopyMK( adrA, L2BlockM, L2BlockK );
  4287. INC( adrA, L2BlockK * IncA ); DEC( k, L2BlockK );
  4288. END;
  4289. IF k > 0 THEN CopyMK( adrA, L2BlockM, k ); END;
  4290. INC( matrixA, L2BlockM * StrideA ); DEC( m, L2BlockM );
  4291. END;
  4292. adrA := matrixA; k := K;
  4293. WHILE (k >= L2BlockK) DO
  4294. CopyMK( adrA, m, L2BlockK );
  4295. INC( adrA, L2BlockK * IncA ); DEC( k, L2BlockK );
  4296. END;
  4297. IF k > 0 THEN CopyMK( adrA, m, k ); END;
  4298. Toc( t, copyT );
  4299. END CopyAR;
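(* CopyBX packs matrix B (LONGREAL): groups of L1BlockN columns are interleaved two K-elements at a
time (Copy5x2k) so that an L1 kernel can stream through one panel linearly; remaining columns are
stored as separate zero-padded lines; CopyBR below does the same for REAL in steps of four *)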
  4300. PROCEDURE CopyBX( matrixB, dest: ADDRESS;
  4301. IncB, StrideB: SIZE;
  4302. N, K, L2BlockN, L2BlockK: SIZE );
  4303. VAR n, k: SIZE; adrB: ADDRESS; t: HUGEINT;
  4304. PROCEDURE Copy5x2k( matrixB: ADDRESS; k: SIZE );
  4305. VAR i: SIZE; adrB: ADDRESS; rest: SIZE;
  4306. BEGIN
  4307. rest := (-k) MOD 2;
  4308. WHILE (k >= 2) DO (* store 5x4 Block in line *)
  4309. adrB := matrixB;
  4310. FOR i := 1 TO L1BlockN DO
  4311. MovX( adrB, dest, StrideB, 2 ); INC( dest, 16 );
  4312. INC( adrB, IncB );
  4313. END;
  4314. INC( matrixB, 2 * StrideB ); DEC( k, 2 );
  4315. END;
  4316. IF k > 0 THEN
  4317. adrB := matrixB;
  4318. FOR i := 1 TO L1BlockN DO
  4319. MovX( adrB, dest, StrideB, k ); INC( dest, 8 * k );
  4320. IF rest # 0 THEN
  4321. ZeroX( dest, rest ); INC( dest, rest * 8 );
  4322. END;
  4323. INC( adrB, IncB );
  4324. END;
  4325. END;
  4326. END Copy5x2k;
  4327. PROCEDURE Copy1( matrixB: ADDRESS; K, N:SIZE );
  4328. VAR n, rest: SIZE;
  4329. BEGIN
  4330. rest := (-K) MOD 2;
  4331. IF debug THEN
  4332. KernelLog.String( ">>Copy1" ); KernelLog.Int( K, 10 ); KernelLog.Int( N, 10 ); KernelLog.Ln;
  4333. END;
  4334. n := N;
  4335. WHILE (n >= L1BlockN) DO (* store Kx5 Blocks as one line *)
  4336. Copy5x2k( matrixB, K );
  4337. IF debug THEN ASSERT( dest MOD 16 = 0 ); END;
  4338. INC( matrixB, L1BlockN * IncB ); DEC( n, L1BlockN );
  4339. END;
  4340. IF debug THEN KernelLog.String( "Copy1, n=" ); KernelLog.Int( n, 10 ); KernelLog.Ln;
  4341. END;
  4342. WHILE (n > 0) DO (* store remaining rectangle in separate lines *)
  4343. MovX( matrixB, dest, StrideB, K ); INC( dest, K * 8 );
4344. ZeroX( dest, rest ); INC( dest, rest * 8 );
  4345. INC( matrixB, IncB ); DEC( n );
  4346. END;
  4347. END Copy1;
  4348. BEGIN
  4349. Tic( t ); ASSERT( L2BlockN MOD L1BlockN = 0 );
  4350. ASSERT( L2BlockK MOD 2 = 0 ); n := N;
  4351. WHILE (n >= L2BlockN) DO
  4352. k := K; adrB := matrixB;
  4353. WHILE (k >= L2BlockK) DO
  4354. Copy1( adrB, L2BlockK, L2BlockN );
  4355. INC( adrB, L2BlockK * StrideB ); DEC( k, L2BlockK );
  4356. END;
  4357. IF k > 0 THEN Copy1( adrB, k, L2BlockN ) END;
  4358. INC( matrixB, L2BlockN * IncB ); DEC( n, L2BlockN );
  4359. END;
  4360. IF (n = 0) THEN RETURN
  4361. END;
  4362. k := K; adrB := matrixB;
  4363. WHILE (k >= L2BlockK) DO
  4364. Copy1( adrB, L2BlockK, n );
  4365. INC( adrB, L2BlockK * StrideB ); DEC( k, L2BlockK );
  4366. END;
  4367. Copy1( adrB, k, n ); Toc( t, copyT );
  4368. END CopyBX;
  4369. PROCEDURE CopyBR( matrixB, dest: ADDRESS;
  4370. IncB, StrideB: SIZE;
  4371. N, K, L2BlockN, L2BlockK: SIZE );
  4372. VAR n, k: SIZE; adrB: ADDRESS; t: HUGEINT;
  4373. PROCEDURE Copy5x4k( matrixB: ADDRESS; k: SIZE );
  4374. VAR i: SIZE; adrB: ADDRESS; rest, k4: SIZE;
  4375. BEGIN
  4376. k4 := k - k MOD 4; rest := (-k) MOD 4;
  4377. IF k4 > 0 THEN
  4378. MovR5( matrixB, IncB, StrideB, dest, k4 );
  4379. INC( matrixB, k4 * StrideB ); INC( dest, k4 * 80 DIV 4 );
  4380. DEC( k, k4 );
  4381. END;
  4382. (*
  4383. WHILE (k >= 4) DO (* store 5x4 Block in line *)
  4384. adrB := matrixB;
  4385. FOR i := 1 TO L1BlockN DO
  4386. MovR( adrB, dest, StrideB, 4 ); INC( dest, 16 ); INC( adrB, IncB );
  4387. END;
  4388. INC( matrixB, 4 * StrideB ); DEC( k, 4 );
  4389. END;
  4390. *)
  4391. IF k > 0 THEN
  4392. adrB := matrixB;
  4393. FOR i := 1 TO L1BlockN DO
  4394. MovR( adrB, dest, StrideB, k ); INC( dest, 4 * k );
  4395. IF rest # 0 THEN
  4396. ZeroR( dest, rest ); INC( dest, rest * 4 );
  4397. END;
  4398. INC( adrB, IncB );
  4399. END;
  4400. END;
  4401. END Copy5x4k;
  4402. PROCEDURE Copy1( matrixB: ADDRESS; K, N:SIZE );
  4403. VAR n, rest: SIZE;
  4404. BEGIN
  4405. rest := (-K) MOD 4;
  4406. IF debug THEN
  4407. KernelLog.String( ">>Copy1" ); KernelLog.Int( K, 10 ); KernelLog.Int( N, 10 ); KernelLog.Ln;
  4408. END;
  4409. n := N;
  4410. WHILE (n >= L1BlockN) DO (* store Kx5 Blocks as one line *)
  4411. Copy5x4k( matrixB, K );
  4412. IF debug THEN ASSERT( dest MOD 16 = 0 ); END;
  4413. INC( matrixB, L1BlockN * IncB ); DEC( n, L1BlockN );
  4414. END;
  4415. WHILE (n > 0) DO (* store remaining rectangle in separate lines *)
  4416. MovR( matrixB, dest, StrideB, K ); INC( dest, K * 4 );
  4417. ZeroR( dest, rest ); INC( dest, rest * 4 );
  4418. INC( matrixB, IncB ); DEC( n );
  4419. END;
  4420. END Copy1;
  4421. BEGIN
  4422. Tic( t ); ASSERT( L2BlockN MOD L1BlockN = 0 );
  4423. ASSERT( L2BlockK MOD 4 = 0 ); n := N;
  4424. WHILE (n >= L2BlockN) DO
  4425. k := K; adrB := matrixB;
  4426. WHILE (k >= L2BlockK) DO
  4427. Copy1( adrB, L2BlockK, L2BlockN );
  4428. INC( adrB, L2BlockK * StrideB ); DEC( k, L2BlockK );
  4429. END;
  4430. IF k > 0 THEN Copy1( adrB, k, L2BlockN ) END;
  4431. INC( matrixB, L2BlockN * IncB ); DEC( n, L2BlockN );
  4432. END;
  4433. IF (n = 0) THEN RETURN
  4434. END;
  4435. k := K; adrB := matrixB;
  4436. WHILE (k >= L2BlockK) DO
  4437. Copy1( adrB, L2BlockK, n );
  4438. INC( adrB, L2BlockK * StrideB ); DEC( k, L2BlockK );
  4439. END;
  4440. Copy1( adrB, k, n ); Toc( t, copyT );
  4441. END CopyBR;
  4442. (*
  4443. PROCEDURE FillMR( VAR A: ARRAY [ .. , .. ] OF REAL );
  4444. VAR i, j: LONGINT;
  4445. BEGIN
  4446. FOR i := 0 TO LEN( A, 0 ) - 1 DO
  4447. FOR j := 0 TO LEN( A, 1 ) - 1 DO
  4448. A[i, j] := ran.Dice( 10 );
  4449. IF debug THEN A[i, j] := 10 * i + j; END;
  4450. END;
  4451. END;
  4452. END FillMR;
  4453. PROCEDURE DispMR( VAR A: ARRAY [ .. , .. ] OF REAL );
  4454. VAR i, j: LONGINT;
  4455. BEGIN
  4456. FOR i := 0 TO LEN( A, 0 ) - 1 DO
  4457. FOR j := 0 TO LEN( A, 1 ) - 1 DO KernelLog.Int( ENTIER( A[i, j] + 0.5 ), 5 ); END;
  4458. KernelLog.Ln;
  4459. END;
  4460. END DispMR;
  4461. PROCEDURE FillMX( VAR A: ARRAY [ .. , .. ] OF LONGREAL );
  4462. VAR i, j: LONGINT;
  4463. BEGIN
  4464. FOR i := 0 TO LEN( A, 0 ) - 1 DO
  4465. FOR j := 0 TO LEN( A, 1 ) - 1 DO
  4466. A[i, j] := ran.Dice( 10 );
  4467. IF debug THEN A[i, j] := 10 * i + j; END;
  4468. END;
  4469. END;
  4470. END FillMX;
  4471. PROCEDURE DispMX( VAR A: ARRAY [ .. , .. ] OF LONGREAL );
  4472. VAR i, j: LONGINT;
  4473. BEGIN
  4474. FOR i := 0 TO LEN( A, 0 ) - 1 DO
  4475. FOR j := 0 TO LEN( A, 1 ) - 1 DO KernelLog.Int( ENTIER( A[i, j] + 0.5 ), 5 ); END;
  4476. KernelLog.Ln;
  4477. END;
  4478. END DispMX;
  4479. *)
  4480. PROCEDURE Tic( VAR t: HUGEINT );
  4481. BEGIN
  4482. t := Machine.GetTimer();
  4483. END Tic;
  4484. PROCEDURE Toc( VAR t, addto: HUGEINT );
  4485. BEGIN
  4486. INC( addto, Machine.GetTimer() - t ); t := Machine.GetTimer();
  4487. END Toc;
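(* MultiplyX: blocked LONGREAL matrix multiplication C (+)= A*B.
   A and B are copied into a 16-byte aligned scratch buffer from the cache pool (CopyAX /
   CopyBX, with K padded to an even length by Align2), C is zeroed row by row unless add is
   set, and the product is then computed either by a direct call to L3BlockX or, if parallel
   and M > L2BlockM, by splitting the rows of A over nrProcesses MultiplyObjectX workers. *)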
  4488. PROCEDURE MultiplyX( A, B, C: ADDRESS;
  4489. M, N, K, L2BlockM, L2BlockN, L2BlockK:SIZE;
  4490. IncA, StrideA, IncB, StrideB, IncC, StrideC: SIZE;
  4491. add: BOOLEAN );
  4492. VAR lenA, lenB: SIZE; adrA, adrB, adrC: ADDRESS; m: SIZE;
  4493. M1, M2, i: SIZE; val: LONGREAL; t: HUGEINT;
  4494. inc: SIZE;
  4495. obj: POINTER TO ARRAY OF MultiplyObjectX;
  4496. cache: Cache;
  4497. BEGIN
  4498. NEW(obj,nrProcesses+1);
  4499. lenA := M * Align2( K ) * 8; lenB := N * Align2( K ) * 8;
  4500. cache := cachePool.Acquire( lenA + lenB );
  4501. adrA := cache.adr; adrB := adrA + lenA;
  4502. CopyAX( A, adrA, IncA, StrideA, K, M, L2BlockK, L2BlockM );
  4503. CopyBX( B, adrB, IncB, StrideB, N, K, L2BlockN, L2BlockK );
  4504. Tic( t ); m := M; adrC := C;
  4505. IF ~add THEN
  4506. WHILE (m > 0) DO
  4507. ZeroXI( adrC, IncC, N ); INC( adrC, StrideC ); DEC( m );
  4508. END;
  4509. END;
  4510. Toc( t, zeroT );
  4511. IF debug THEN
  4512. KernelLog.String( "copy of A: " ); KernelLog.Ln;
  4513. FOR i := 0 TO M * Align2( K ) - 1 DO
  4514. SYSTEM.GET( adrA + i * 8, val );
  4515. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); KernelLog.Ln;
  4516. END;
  4517. END;
  4518. IF debug THEN
  4519. KernelLog.String( "copy of B: " ); KernelLog.Ln;
  4520. FOR i := 0 TO N * Align2( K ) - 1 DO
  4521. SYSTEM.GET( adrB + i * 8, val );
  4522. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); KernelLog.Ln;
  4523. END;
  4524. END;
  4525. IF parallel & (M > L2BlockM) THEN
  4526. inc := Align( M DIV nrProcesses, L2BlockM ); M1 := 0;
  4527. i := 0;
  4528. WHILE (M1 < M) DO
  4529. M2 := M1 + inc;
  4530. IF M2 > M THEN M2 := M END;
  4531. NEW( obj[i], adrA + M1 * Align2( K ) * 8, adrB,
  4532. C + StrideC * M1, M2 - M1, N, K, IncC, StrideC,
  4533. L2BlockM, L2BlockN, L2BlockK );
  4534. M1 := M2; INC( i );
  4535. END;
  4536. WHILE (i > 0) DO DEC( i ); obj[i].Wait; END;
  4537. ELSE
  4538. L3BlockX( adrA, adrB, C, M, N, K, IncC, StrideC, L2BlockM,
  4539. L2BlockN, L2BlockK );
  4540. END;
  4541. Toc( t, compT ); cachePool.Release( cache );
  4542. END MultiplyX;
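(* MultiplyR: single precision (REAL) counterpart of MultiplyX; K is padded to a multiple of
   four (Align4) and the work is delegated to L3BlockR or to MultiplyObjectR workers. *)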
  4543. PROCEDURE MultiplyR( A, B, C: ADDRESS;
  4544. M, N, K, L2BlockM, L2BlockN, L2BlockK: SIZE;
  4545. IncA, StrideA, IncB, StrideB, IncC, StrideC: SIZE;
  4546. add: BOOLEAN );
  4547. VAR lenA, lenB: SIZE; adrA, adrB, adrC: ADDRESS; m: SIZE;
  4548. M1, M2, i: SIZE; val: REAL; inc: SIZE;
  4549. obj: POINTER TO ARRAY OF MultiplyObjectR;
  4550. t: HUGEINT; cache: Cache;
  4551. BEGIN
  4552. NEW(obj,nrProcesses+1);
  4553. lenA := M * Align4( K ) * 4; lenB := N * Align4( K ) * 4;
  4554. cache := cachePool.Acquire( lenA + lenB );
  4555. adrA := cache.adr; adrB := adrA + lenA;
  4556. CopyAR( A, adrA, IncA, StrideA, K, M, L2BlockK, L2BlockM );
  4557. CopyBR( B, adrB, IncB, StrideB, N, K, L2BlockN, L2BlockK );
  4558. Tic( t ); m := M; adrC := C;
  4559. IF ~add THEN
  4560. WHILE (m > 0) DO
  4561. ZeroRI( adrC, IncC, N ); INC( adrC, StrideC );
  4562. DEC( m );
  4563. END;
  4564. END;
  4565. Toc( t, zeroT );
  4566. IF debug THEN
  4567. KernelLog.String( "copy of A: " ); KernelLog.Ln;
  4568. FOR i := 0 TO M * Align4( K ) - 1 DO
  4569. SYSTEM.GET( adrA + i * 4, val );
  4570. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); KernelLog.Ln;
  4571. END;
  4572. END;
  4573. IF debug THEN
  4574. KernelLog.String( "copy of B: " ); KernelLog.Ln;
  4575. FOR i := 0 TO N * Align4( K ) - 1 DO
  4576. SYSTEM.GET( adrB + i * 4, val );
  4577. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); KernelLog.Ln;
  4578. END;
  4579. END;
  4580. IF parallel & (M > L2BlockM) THEN
  4581. inc := Align( M DIV nrProcesses, L2BlockM ); M1 := 0;
  4582. i := 0;
  4583. WHILE (M1 < M) DO
  4584. M2 := M1 + inc;
  4585. IF M2 > M THEN M2 := M END;
  4586. NEW( obj[i], adrA + M1 * Align4( K ) * 4, adrB,
  4587. C + StrideC * M1, M2 - M1, N, K, IncC, StrideC,
  4588. L2BlockM, L2BlockN, L2BlockK );
  4589. M1 := M2; INC( i );
  4590. END;
  4591. WHILE (i > 0) DO DEC( i ); obj[i].Wait; END;
  4592. ELSE
  4593. L3BlockR( adrA, adrB, C, M, N, K, IncC, StrideC, L2BlockM,
  4594. L2BlockN, L2BlockK );
  4595. END;
  4596. Toc( t, compT ); cachePool.Release( cache );
  4597. END MultiplyR;
  4598. (*
  4599. PROCEDURE DoTestX( M, N, K, L2BlockM, L2BlockN, L2BlockK: LONGINT; check: BOOLEAN; iter: LONGINT );
  4600. VAR (* A, B, C, D: ARRAY [ .. , .. ] OF REAL; *)
  4601. A, B, C, D: ARRAY [ .. , .. ] OF LONGREAL;
  4602. p, q: ANY; l1, l2: LONGINT; adrA, adrB, len, lenA, lenB: LONGINT;
  4603. matrixA, matrixB, matrixC: LONGINT; i: LONGINT; val: LONGREAL; atime, time: LONGINT;
  4604. BEGIN
  4605. KernelLog.String( "LONGREAL" ); KernelLog.Ln; KernelLog.String( "M=" ); KernelLog.Int( M, 0 ); KernelLog.Ln;
  4606. KernelLog.String( "N=" ); KernelLog.Int( N, 0 ); KernelLog.Ln; KernelLog.String( "K=" ); KernelLog.Int( K, 0 );
  4607. KernelLog.Ln; KernelLog.String( "L2BlockM=" ); KernelLog.Int( L2BlockM, 0 ); KernelLog.Ln; KernelLog.String( "L2BlockN=" );
  4608. KernelLog.Int( L2BlockN, 0 ); KernelLog.Ln; KernelLog.String( "L2BlockK=" ); KernelLog.Int( L2BlockK, 0 );
  4609. KernelLog.Ln; NEW( A, M, K ); NEW( B, K, N ); NEW( C, M, N ); FillMX( A ); FillMX( B );
  4610. IF debug THEN DispMX( A ); KernelLog.String( "*" ); KernelLog.Ln; DispMX( B );
  4611. END;
  4612. atime := Input.Time(); (* C := 0; *)
  4613. WHILE (iter > 0) DO
  4614. MultiplyX( ADDRESSOF( A[0, 0] ), ADDRESSOF( B[0, 0] ),
  4615. ADDRESSOF( C[0, 0] ), M, N, K, L2BlockM, L2BlockN, L2BlockK,
  4616. (*
  4617. 8,
  4618. LEN( A, 1 ) * 8, 8, LEN( B, 1 ) * 8, 8, LEN( C, 1 ) * 8
  4619. *)
  4620. SYSTEM.INCR( A, 1 ), SYSTEM.INCR( A, 0 ), SYSTEM.INCR( B, 1 ),
  4621. SYSTEM.INCR( B, 0 ), SYSTEM.INCR( C, 1 ), SYSTEM.INCR( C, 0 )
  4622. );
  4623. DEC( iter );
  4624. END;
  4625. atime := Input.Time() - atime; KernelLog.String( "overall time: " ); KernelLog.Int( atime, 10 ); KernelLog.Ln;
  4626. IF debug THEN
  4627. DispMX( A ); KernelLog.String( " * " ); KernelLog.Ln; DispMX( B ); KernelLog.String( " = " );
  4628. KernelLog.Ln; DispMX( C ); KernelLog.String( " ------------" ); KernelLog.Ln;
  4629. END;
  4630. IF check THEN
  4631. (*
  4632. NEW(D,M,N);
  4633. MatMulAXAXNaiive(ADDRESSOF( A[0, 0] ), ADDRESSOF( B[0, 0] ),
  4634. ADDRESSOF( D[0, 0] ), M, N, K, SYSTEM.INCR( A, 1 ), SYSTEM.INCR( A, 0 ), SYSTEM.INCR( B, 1 ),
  4635. SYSTEM.INCR( B, 0 ), SYSTEM.INCR( D, 1 ), SYSTEM.INCR( D, 0 ));
  4636. *)
  4637. D := A * B;
  4638. ASSERT ( ENTIER( D + 0.5 ) = ENTIER( C + 0.5 ) );
  4639. END;
  4640. END DoTestX;
  4641. PROCEDURE DoTestR( M, N, K, L2BlockM, L2BlockN, L2BlockK: LONGINT; check: BOOLEAN; iter: LONGINT );
  4642. VAR (* A, B, C, D: ARRAY [ .. , .. ] OF REAL; *)
  4643. A, B, C, D: ARRAY [ .. , .. ] OF REAL;
  4644. p, q: ANY; l1, l2: LONGINT; adrA, adrB, len, lenA, lenB: LONGINT;
  4645. matrixA, matrixB, matrixC: LONGINT; i: LONGINT; val: REAL; atime, time: LONGINT;
  4646. BEGIN
  4647. KernelLog.String( "REAL" ); KernelLog.Ln; KernelLog.String( "M=" ); KernelLog.Int( M, 0 ); KernelLog.Ln;
  4648. KernelLog.String( "N=" ); KernelLog.Int( N, 0 ); KernelLog.Ln; KernelLog.String( "K=" ); KernelLog.Int( K, 0 );
  4649. KernelLog.Ln; KernelLog.String( "L2BlockM=" ); KernelLog.Int( L2BlockM, 0 ); KernelLog.Ln; KernelLog.String( "L2BlockN=" );
  4650. KernelLog.Int( L2BlockN, 0 ); KernelLog.Ln; KernelLog.String( "L2BlockK=" ); KernelLog.Int( L2BlockK, 0 );
  4651. KernelLog.Ln; NEW( A, M, K ); NEW( B, K, N ); NEW( C, M, N ); FillMR( A ); FillMR( B );
  4652. IF debug THEN DispMR( A ); KernelLog.String( "*" ); KernelLog.Ln; DispMR( B );
  4653. END;
  4654. atime := Input.Time(); (* C := 0; *)
  4655. FOR i := 1 TO iter DO
  4656. MultiplyR( ADDRESSOF( A[0, 0] ), ADDRESSOF( B[0, 0] ),
  4657. ADDRESSOF( C[0, 0] ), M, N, K, L2BlockM, L2BlockN, L2BlockK,
  4658. (* 4,
  4659. LEN( A, 1 ) * 4, 4, LEN( B, 1 ) * 4, 4, LEN( C, 1 ) * 4 *)
  4660. SYSTEM.INCR( A, 1 ), SYSTEM.INCR( A, 0 ), SYSTEM.INCR( B, 1 ),
  4661. SYSTEM.INCR( B, 0 ), SYSTEM.INCR( C, 1 ), SYSTEM.INCR( C, 0 )
  4662. );
  4663. END;
  4664. atime := Input.Time() - atime; KernelLog.String( "overall time: " ); KernelLog.Int( atime, 10 ); KernelLog.Ln;
  4665. IF debug THEN
  4666. DispMR( A ); KernelLog.String( " * " ); KernelLog.Ln; DispMR( B ); KernelLog.String( " = " );
  4667. KernelLog.Ln; DispMR( C ); KernelLog.String( " ------------" ); KernelLog.Ln;
  4668. END;
  4669. IF check THEN
  4670. (*
  4671. NEW(D,M,N);
  4672. MatMulARARNaiive(ADDRESSOF( A[0, 0] ), ADDRESSOF( B[0, 0] ),
  4673. ADDRESSOF( D[0, 0] ), M, N, K, SYSTEM.INCR( A, 1 ), SYSTEM.INCR( A, 0 ), SYSTEM.INCR( B, 1 ),
  4674. SYSTEM.INCR( B, 0 ), SYSTEM.INCR( D, 1 ), SYSTEM.INCR( D, 0 ));
  4675. *)
  4676. D := A * B;
  4677. ASSERT ( ENTIER( D + 0.5 ) = ENTIER( C + 0.5 ) );
  4678. END;
  4679. END DoTestR;
  4680. PROCEDURE RandTestR*;
  4681. VAR iter, i, time: LONGINT;
  4682. MinM, MaxM, MinN, MaxN, MinK, MaxK, M, N, K, BM, BN, BK: LONGINT;
  4683. PROCEDURE Ran( Min, Max: LONGINT ): LONGINT;
  4684. BEGIN
  4685. IF Min = Max THEN RETURN Min
  4686. ELSE RETURN ran.Dice( Max - Min ) + Min
  4687. END;
  4688. END Ran;
  4689. BEGIN
  4690. In.Open(); In.LongInt( iter );
  4691. In.LongInt( MinM ); In.LongInt( MaxM ); In.LongInt( MinN );
  4692. In.LongInt( MaxN ); In.LongInt( MinK ); In.LongInt( MaxK );
  4693. FOR i := 1 TO iter DO KernelLog.Int( i, 10 ); KernelLog.Ln; DEC( iter );
  4694. M := Ran( MinM, MaxM ); N := Ran( MinN, MaxN );
  4695. K := Ran( MinK, MaxK );
  4696. IF N < 5 THEN N := 5 END;
  4697. IF K < 4 THEN K := 4 END;
  4698. BM := ran.Dice( M ) + 1; BN := ran.Dice( N ) + 1; BK := ran.Dice( K ) + 1;
  4699. BN := Align( BN, 5 );
  4700. IF BN > N THEN DEC( BN, 5 ) END;
  4701. BK := Align( BK, 4 );
  4702. IF BK > K THEN DEC( BK, 4 ) END;
  4703. DoTestR( M, N, K, BM, BN, BK, TRUE , 1 );
  4704. END;
  4705. END RandTestR;
  4706. PROCEDURE RandTestX*;
  4707. VAR iter, i, time: LONGINT;
  4708. MinM, MaxM, MinN, MaxN, MinK, MaxK, M, N, K, BM, BN, BK: LONGINT;
  4709. PROCEDURE Ran( Min, Max: LONGINT ): LONGINT;
  4710. BEGIN
  4711. IF Min = Max THEN RETURN Min
  4712. ELSE RETURN ran.Dice( Max - Min ) + Min
  4713. END;
  4714. END Ran;
  4715. BEGIN
  4716. In.Open(); In.LongInt( iter );
  4717. In.LongInt( MinM ); In.LongInt( MaxM ); In.LongInt( MinN );
  4718. In.LongInt( MaxN ); In.LongInt( MinK ); In.LongInt( MaxK );
  4719. FOR i := 1 TO iter DO KernelLog.Int( i, 10 ); KernelLog.Ln; DEC( iter );
  4720. M := Ran( MinM, MaxM ); N := Ran( MinN, MaxN );
  4721. K := Ran( MinK, MaxK );
  4722. IF N < 5 THEN N := 5 END;
  4723. IF K < 4 THEN K := 4 END;
  4724. BM := ran.Dice( M ) + 1; BN := ran.Dice( N ) + 1; BK := ran.Dice( K ) + 1;
  4725. BN := Align( BN, 5 );
  4726. IF BN > N THEN DEC( BN, 5 ) END;
  4727. BK := Align( BK, 4 );
  4728. IF BK > K THEN DEC( BK, 4 ) END;
  4729. DoTestX( M, N, K, BM, BN, BK, TRUE , 1 );
  4730. END;
  4731. END RandTestX;
  4732. *)
  4733. (*
  4734. PROCEDURE Times*;
  4735. VAR all: HUGEINT;
  4736. BEGIN
  4737. all := allocT + copyT + zeroT + compT; KernelLog.String( "alloc=" );
  4738. KernelLog.LongRealFix( allocT / 1000000, 0, 20 ); KernelLog.String( "[" );
  4739. KernelLog.Int( ENTIER( 100 * allocT / all + 0.5 ), 5 ); KernelLog.String( "%]" );
  4740. KernelLog.Ln; KernelLog.String( "copy=" );
  4741. KernelLog.LongRealFix( copyT / 1000000, 0, 20 ); KernelLog.String( "[" );
  4742. KernelLog.Int( ENTIER( 100 * copyT / all + 0.5 ), 5 ); KernelLog.String( "%]" );
  4743. KernelLog.Ln; KernelLog.String( "zero=" );
  4744. KernelLog.LongRealFix( zeroT / 1000000, 0, 20 ); KernelLog.String( "[" );
  4745. KernelLog.Int( ENTIER( 100 * zeroT / all + 0.5 ), 5 ); KernelLog.String( "%]" );
  4746. KernelLog.Ln; KernelLog.String( "comp=" );
  4747. KernelLog.LongRealFix( compT / 1000000, 0, 20 ); KernelLog.String( "[" );
  4748. KernelLog.Int( ENTIER( 100 * compT / all + 0.5 ), 5 ); KernelLog.String( "%]" );
  4749. KernelLog.Ln;
  4750. END Times;
  4751. *)
  4752. (*
  4753. PROCEDURE TestRMM*;
  4754. VAR M, N, K, L2BlockM, L2BlockN, L2BlockK: LONGINT; matrixA, matrixB, matrixC: LONGINT;
  4755. check, iter: LONGINT;
  4756. BEGIN
  4757. In.Open; In.LongInt( M ); In.LongInt( N ); In.LongInt( K );
  4758. In.LongInt( L2BlockM ); In.LongInt( L2BlockN ); In.LongInt( L2BlockK );
  4759. In.LongInt( iter ); In.LongInt( check );
  4760. IF L2BlockM = 0 THEN
  4761. MagicBlockR( M DIV nrProcesses, N, K, L2BlockM, L2BlockN, L2BlockK );
  4762. END;
  4763. DoTestR( M, N, K, L2BlockM, L2BlockN, L2BlockK, check = 1, iter ); Times();
  4764. END TestRMM;
  4765. PROCEDURE TestXMM*;
  4766. VAR M, N, K, L2BlockM, L2BlockN, L2BlockK: LONGINT; matrixA, matrixB, matrixC: LONGINT;
  4767. iter, check: LONGINT;
  4768. BEGIN
  4769. In.Open; In.LongInt( M ); In.LongInt( N ); In.LongInt( K );
  4770. In.LongInt( L2BlockM ); In.LongInt( L2BlockN ); In.LongInt( L2BlockK );
  4771. In.LongInt( iter ); In.LongInt( check );
  4772. IF L2BlockM = 0 THEN
  4773. MagicBlockX( M DIV nrProcesses, N, K, L2BlockM, L2BlockN, L2BlockK );
  4774. END;
  4775. DoTestX( M, N, K, L2BlockM, L2BlockN, L2BlockK, check = 1, iter ); Times();
  4776. END TestXMM;
  4777. *)
  4778. (****** matrix multiplication using fast scalar product ******)
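(* Thin wrappers around the scalar-product kernels SPARARLoop* / SPAXAXLoop*: the plain
   MatMul...Loop variants first set the destination element to 0 and then apply the kernel,
   the MatMulInc...Loop variants accumulate into the existing destination value. *)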
  4779. PROCEDURE MatMulAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  4780. BEGIN
  4781. SYSTEM.PUT( dadr, 0.0D0 ); (* initialization of scalar product to 0 *)
  4782. SPAXAXLoopA( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  4783. END MatMulAXAXLoopA;
  4784. PROCEDURE MatMulAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  4785. BEGIN
  4786. SYSTEM.PUT( dadr, 0.0D0 ); (* initialization of scalar product to 0 *)
  4787. SPAXAXLoopSSE( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  4788. END MatMulAXAXLoopSSE;
  4789. PROCEDURE MatMulARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  4790. BEGIN
  4791. SYSTEM.PUT( dadr, 0.0E0 ); (* initialization of scalar product to 0 *)
  4792. SPARARLoopA( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  4793. END MatMulARARLoopA;
  4794. PROCEDURE MatMulARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  4795. BEGIN
  4796. SYSTEM.PUT( dadr, 0.0E0 ); (* initialization of scalar product to 0 *)
  4797. SPARARLoopSSE( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  4798. END MatMulARARLoopSSE;
  4799. PROCEDURE MatMulIncAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  4800. BEGIN
  4801. SPAXAXLoopA( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  4802. END MatMulIncAXAXLoopA;
  4803. PROCEDURE MatMulIncAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  4804. BEGIN
  4805. SPAXAXLoopSSE( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  4806. END MatMulIncAXAXLoopSSE;
  4807. PROCEDURE MatMulIncARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  4808. BEGIN
  4809. SPARARLoopA( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  4810. END MatMulIncARARLoopA;
  4811. PROCEDURE MatMulIncARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  4812. BEGIN
  4813. SPARARLoopSSE( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  4814. END MatMulIncARARLoopSSE;
4815. (****** matrix multiplication over rows with transposition of B ******)
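(* MatMulHBlockR / MatMulHBlockX compute C[i,j] = Sum{k} A[i,k]*B[j,k], i.e. A*B with B
   already stored transposed; both operands use unit element increment and the common row
   stride "Stride". The row ranges of A and of B` are tiled into BlockSize x BlockSize
   blocks (BlockSize derived from L2CacheSize unless overridden by cBlockSize) so that the
   working set stays cache resident; each C entry is one aligned SSE dot product. *)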
  4816. PROCEDURE MatMulHBlockR( MatrixA, MatrixB, MatrixC: ADDRESS;
  4817. (*inc=4*) Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
  4818. add: BOOLEAN );
  4819. VAR fromA, toA, fromB, toB: SIZE; BlockSize: SIZE;
  4820. (*
  4821. computation of C[i,j] = Sum{k=0..Cols-1} A[i,k]*B[j,k], i.e. A*B`
  4822. *)
  4823. PROCEDURE Block( fromA, toA, fromB, toB: SIZE );
  4824. VAR i, j: SIZE; adrA, adrB, adrC: ADDRESS;
  4825. BEGIN
  4826. FOR i := fromA TO toA - 1 DO
  4827. adrA := MatrixA + i * Stride;
  4828. FOR j := fromB TO toB - 1 DO
  4829. adrB := MatrixB + j * Stride;
  4830. adrC := MatrixC + i * StrideC + j * IncC;
  4831. AlignedSPRSSE( adrA, adrB, adrC, Cols, add );
  4832. END;
  4833. END;
  4834. END Block;
  4835. BEGIN
  4836. IF cBlockSize = 0 THEN
  4837. BlockSize := L2CacheSize DIV Stride DIV 4;
  4838. ELSE BlockSize := cBlockSize;
  4839. END;
  4840. lastUsedBlockSize := BlockSize;
  4841. fromA := 0;
  4842. REPEAT
  4843. toA := fromA + BlockSize;
  4844. IF toA > RowsA THEN toA := RowsA END;
  4845. fromB := 0;
  4846. REPEAT
  4847. toB := fromB + BlockSize;
  4848. IF toB > RowsB THEN toB := RowsB END;
  4849. Block( fromA, toA, fromB, toB ); fromB := toB;
  4850. UNTIL toB = RowsB;
  4851. fromA := toA;
  4852. UNTIL toA = RowsA;
  4853. END MatMulHBlockR;
  4854. PROCEDURE MatMulHBlockX( MatrixA, MatrixB, MatrixC: ADDRESS;
4855. (*inc=8*) Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
  4856. add: BOOLEAN );
  4857. VAR fromA, toA, fromB, toB: SIZE; BlockSize: SIZE;
  4858. (*
  4859. computation of C[i,j] = Sum{k=0..Cols-1} A[i,k]*B[j,k], i.e. A*B`
  4860. *)
  4861. PROCEDURE Block( fromA, toA, fromB, toB: SIZE );
  4862. VAR adrA, adrB, adrC: ADDRESS; i, j: SIZE;
  4863. BEGIN
  4864. FOR i := fromA TO toA - 1 DO
  4865. adrA := MatrixA + i * Stride;
  4866. FOR j := fromB TO toB - 1 DO
  4867. adrB := MatrixB + j * Stride;
  4868. adrC := MatrixC + i * StrideC + j * IncC;
  4869. AlignedSPXSSE( adrA, adrB, adrC, Cols, add );
  4870. END;
  4871. END;
  4872. END Block;
  4873. BEGIN
  4874. IF cBlockSize = 0 THEN
  4875. BlockSize := L2CacheSize DIV Stride DIV 8;
  4876. ELSE BlockSize := cBlockSize;
  4877. END;
  4878. lastUsedBlockSize := BlockSize;
  4879. fromA := 0;
  4880. REPEAT
  4881. toA := fromA + BlockSize;
  4882. IF toA > RowsA THEN toA := RowsA END;
  4883. fromB := 0;
  4884. REPEAT
  4885. toB := fromB + BlockSize;
  4886. IF toB > RowsB THEN toB := RowsB END;
  4887. Block( fromA, toA, fromB, toB ); fromB := toB;
  4888. UNTIL toB = RowsB;
  4889. fromA := toA;
  4890. UNTIL toA = RowsA;
  4891. END MatMulHBlockX;
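(* CopyDataR / CopyDataX copy a rows x cols submatrix row by row (Copy4 / Copy8) from an
   arbitrarily strided source layout into an arbitrarily strided destination layout; they are
   used to repack operands into aligned scratch buffers and to copy results back. *)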
  4892. PROCEDURE CopyDataR( src, dest: ADDRESS; incSrc, strideSrc, incDest, strideDest, rows, cols:SIZE ); (*! optimize *)
  4893. VAR i: SIZE; t: HUGEINT;
  4894. BEGIN
  4895. Tic( t );
  4896. FOR i := 0 TO rows - 1 DO
  4897. Copy4( src, dest, incSrc, incDest, cols );
  4898. INC( src, strideSrc ); INC( dest, strideDest );
  4899. END;
  4900. Toc( t, copyT );
  4901. END CopyDataR;
  4902. PROCEDURE CopyDataX( src, dest: ADDRESS; incSrc, strideSrc, incDest, strideDest, rows, cols:SIZE ); (*! optimize *)
  4903. VAR i: SIZE; t: HUGEINT;
  4904. BEGIN
  4905. Tic( t );
  4906. FOR i := 0 TO rows - 1 DO
  4907. Copy8( src, dest, incSrc, incDest, cols );
  4908. INC( src, strideSrc ); INC( dest, strideDest );
  4909. END;
  4910. Toc( t, copyT );
  4911. END CopyDataX;
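(* MatMulARARTransposed: multiplies via dot products of rows of A with rows of B`.
   A is repacked into a 16-byte aligned buffer with row stride Align( ColsA*SIZEOF(REAL), 16 )
   unless it already has that layout; B is copied into the same layout transposed. With more
   than one process the columns of B (rows of B`) are split evenly over MatMulHObjR worker
   objects, otherwise MatMulHBlockR is called directly. MatMulAXAXTransposed below is the
   analogous LONGREAL variant. *)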
  4912. PROCEDURE MatMulARARTransposed( matrixA, matrixB, matrixC: ADDRESS;
  4913. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  4914. add: BOOLEAN ): BOOLEAN;
  4915. VAR stride: SIZE; adrB, adrC: ADDRESS;
  4916. proc: POINTER TO ARRAY OF MatMulHObjR;
  4917. from, to0, i: SIZE; cacheA, cacheB: Cache;
  4918. t: HUGEINT;
  4919. BEGIN
  4920. NEW(proc,nrProcesses);
  4921. ASSERT( ColsA = RowsB );
  4922. (* allocate 128 bit = 16 byte aligned matrix *)
  4923. stride := Align( ColsA * SIZEOF( REAL ), 16 );
  4924. IF (IncA # SIZEOF( REAL )) OR (StrideA # stride) OR
  4925. (matrixA MOD 16 # 0) THEN
  4926. cacheA := cachePool.Acquire( stride * RowsA );
  4927. CopyDataR( matrixA, cacheA.adr, IncA, StrideA,
  4928. SIZEOF( REAL ), stride, RowsA, ColsA ); (* copy to array *)
  4929. matrixA := cacheA.adr;
  4930. ELSE cacheA := NIL;
  4931. END;
  4932. IF (StrideB # SIZEOF( REAL )) OR (IncB # stride) OR
  4933. (matrixB MOD 16 # 0) THEN
  4934. cacheB := cachePool.Acquire( stride * ColsB );
  4935. CopyDataR( matrixB, cacheB.adr, StrideB, IncB,
  4936. SIZEOF( REAL ), stride, ColsB, RowsB ); (* copy to transposed array *)
  4937. matrixB := cacheB.adr;
  4938. ELSE cacheB := NIL;
  4939. END;
  4940. Tic( t );
4941. (*! needs a decision rule whether to split by rows or by columns *)
  4942. IF nrProcesses > 1 THEN
  4943. from := 0;
  4944. FOR i := 0 TO nrProcesses - 1 DO
  4945. (*
  4946. to := RowsA * (i + 1) DIV nrProcesses; adrA := matrixA + from * stride;
  4947. adrC := matrixC + from * StrideC;
  4948. *)
  4949. to0 := ColsB * (i + 1) DIV nrProcesses; (*! move to rows ? *)
  4950. adrB := matrixB + from * stride;
  4951. adrC := matrixC + from * IncC;
  4952. NEW( proc[i], matrixA, adrB, adrC, stride, IncC, StrideC,
  4953. RowsA, to0 - from, RowsB, add );
  4954. from := to0;
  4955. END;
  4956. FOR i := 0 TO nrProcesses - 1 DO proc[i].Wait(); END;
  4957. ELSE
  4958. MatMulHBlockR( matrixA, matrixB, matrixC, stride, IncC,
  4959. StrideC, RowsA, ColsB, RowsB, add );
  4960. END;
  4961. Toc( t, compT ); cachePool.Release( cacheA );
  4962. cachePool.Release( cacheB ); RETURN TRUE;
  4963. END MatMulARARTransposed;
  4964. PROCEDURE MatMulAXAXTransposed( matrixA, matrixB, matrixC: ADDRESS;
  4965. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  4966. add: BOOLEAN ): BOOLEAN;
  4967. VAR stride: SIZE; adrB, adrC: ADDRESS;
  4968. proc: POINTER TO ARRAY OF MatMulHObjX;
  4969. from, to0, i: SIZE; cacheA, cacheB: Cache;
  4970. t: HUGEINT;
  4971. BEGIN
  4972. NEW(proc,nrProcesses);
  4973. ASSERT( ColsA = RowsB );
  4974. stride := Align( ColsA * SIZEOF( LONGREAL ), 16 );
  4975. IF (IncA # SIZEOF( LONGREAL )) OR (StrideA # stride) OR
  4976. (matrixA MOD 16 # 0) THEN
  4977. cacheA := cachePool.Acquire( stride * RowsA );
  4978. CopyDataX( matrixA, cacheA.adr, IncA, StrideA,
  4979. SIZEOF( LONGREAL ), stride, RowsA, ColsA ); (* copy to array *)
  4980. matrixA := cacheA.adr;
  4981. ELSE cacheA := NIL;
  4982. END;
  4983. IF (StrideB # SIZEOF( LONGREAL )) OR (IncB # stride) OR
  4984. (matrixB MOD 16 # 0) THEN
  4985. cacheB := cachePool.Acquire( stride * ColsB );
  4986. CopyDataX( matrixB, cacheB.adr, StrideB, IncB,
  4987. SIZEOF( LONGREAL ), stride, ColsB, RowsB ); (* copy to transposed array *)
  4988. matrixB := cacheB.adr;
  4989. ELSE cacheB := NIL;
  4990. END;
  4991. Tic( t );
  4992. IF nrProcesses > 1 THEN
  4993. from := 0;
  4994. FOR i := 0 TO nrProcesses - 1 DO
  4995. to0 := ColsB * (i + 1) DIV nrProcesses; (*! move to rows ? *)
  4996. adrB := matrixB + from * stride;
  4997. adrC := matrixC + from * IncC;
  4998. NEW( proc[i], matrixA, adrB, adrC, stride, IncC, StrideC,
  4999. RowsA, to0 - from, RowsB, add );
  5000. from := to0;
  5001. END;
  5002. FOR i := 0 TO nrProcesses - 1 DO proc[i].Wait(); END;
  5003. ELSE
  5004. MatMulHBlockX( matrixA, matrixB, matrixC, stride, IncC,
  5005. StrideC, RowsA, ColsB, RowsB, add );
  5006. END;
  5007. Toc( t, compT ); cachePool.Release( cacheA );
  5008. cachePool.Release( cacheB ); RETURN TRUE;
  5009. END MatMulAXAXTransposed;
5010. (****** strided matrix multiplication with restrictions on the increments ******)
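(* MatMulARARSSEStride / MatMulAXAXSSEStride require unit element increments; any operand
   (and, if necessary, the destination) whose increment is not SIZEOF(element) is first
   copied into a densely packed scratch buffer (row strides may remain arbitrary and are
   passed to the kernels). The columns of C are then processed by the widest applicable SSE
   block kernel (24/16/8/4 columns for REAL, 12/8/4/2 for LONGREAL); remaining columns are
   handled by a plain Oberon triple loop, and a packed C is finally copied back to its
   original layout. *)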
  5011. PROCEDURE MatMulARARSSEStride( matrixA, matrixB, matrixC: ADDRESS;
  5012. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  5013. add: BOOLEAN ): BOOLEAN;
  5014. VAR sum: REAL; CbFrom, i, j, k: SIZE; valA, valB: REAL;
  5015. adrA, adrB, adrC: ADDRESS;
  5016. cacheA, cacheB, cacheC: Cache;
  5017. matrixCO, StrideCO, IncCO: SIZE; t: HUGEINT;
  5018. (*VAR fromA, toA: LONGINT; *)
  5019. BEGIN
  5020. IF (IncA # SIZEOF( REAL )) THEN
  5021. cacheA :=
  5022. cachePool.Acquire( RowsA * ColsA * SIZEOF( REAL ) );
  5023. CopyDataR( matrixA, cacheA.adr, IncA, StrideA,
  5024. SIZEOF( REAL ), SIZEOF( REAL ) * ColsA, RowsA,
  5025. ColsA );
  5026. matrixA := cacheA.adr; IncA := SIZEOF( REAL );
  5027. StrideA := SIZEOF( REAL ) * ColsA;
  5028. END;
  5029. IF (IncB # SIZEOF( REAL )) THEN
  5030. cacheB :=
  5031. cachePool.Acquire( RowsB * ColsB * SIZEOF( REAL ) );
  5032. CopyDataR( matrixB, cacheB.adr, IncB, StrideB,
  5033. SIZEOF( REAL ), SIZEOF( REAL ) * ColsB, RowsB,
  5034. ColsB );
  5035. matrixB := cacheB.adr; IncB := SIZEOF( REAL );
  5036. StrideB := SIZEOF( REAL ) * ColsB;
  5037. END;
  5038. IF (IncC # SIZEOF( REAL )) THEN
  5039. cacheC :=
  5040. cachePool.Acquire( RowsA * ColsB * SIZEOF( REAL ) );
  5041. CopyDataR( matrixC, cacheC.adr, IncC, StrideC,
  5042. SIZEOF( REAL ), SIZEOF( REAL ) * ColsB, RowsA,
  5043. ColsB );
  5044. matrixCO := matrixC; StrideCO := StrideC;
  5045. IncCO := IncC; matrixC := cacheC.adr;
  5046. IncC := SIZEOF( REAL ); StrideC := SIZEOF( REAL ) * ColsB;
  5047. END;
  5048. Tic( t );
  5049. CbFrom := 0;
  5050. IF ColsB >= 24 THEN
  5051. SSEMul24BlockR( CbFrom, StrideA, StrideB, StrideC,
  5052. ColsA, RowsA, ColsB, RowsB, matrixA,
  5053. matrixB, matrixC, add );
  5054. END;
  5055. IF ColsB - CbFrom >= 16 THEN
  5056. SSEMul16BlockR( StrideA, StrideB, StrideC, ColsA, RowsA,
  5057. CbFrom, matrixA, matrixB, matrixC, add );
  5058. INC( CbFrom, 16 );
  5059. END;
  5060. IF ColsB - CbFrom >= 8 THEN
  5061. SSEMul8BlockR( StrideA, StrideB, StrideC, ColsA, RowsA,
  5062. CbFrom, matrixA, matrixB, matrixC, add );
  5063. INC( CbFrom, 8 );
  5064. END;
  5065. IF ColsB - CbFrom >= 4 THEN
  5066. SSEMul4BlockR( StrideA, StrideB, StrideC, ColsA, RowsA,
  5067. CbFrom, matrixA, matrixB, matrixC, add );
  5068. INC( CbFrom, 4 );
  5069. END;
  5070. IF ColsB - CbFrom > 0 THEN
  5071. (* do it in Oberon *)
  5072. FOR i := 0 TO RowsA - 1 DO
  5073. adrC := matrixC + i * StrideC + CbFrom * IncC;
  5074. FOR j := CbFrom TO ColsB - 1 DO
  5075. adrA := matrixA + i * StrideA;
  5076. adrB := matrixB + j * IncB;
  5077. IF add THEN SYSTEM.GET( adrC, sum )
  5078. ELSE sum := 0
  5079. END;
  5080. FOR k := 0 TO RowsB - 1 DO
  5081. SYSTEM.GET( adrA, valA );
  5082. SYSTEM.GET( adrB, valB );
  5083. sum := sum + (* A[i, k] * B[k, j] *) valA * valB;
  5084. INC( adrA, IncA ); INC( adrB, StrideB );
  5085. END;
  5086. SYSTEM.PUT( adrC, sum ); INC( adrC, IncC );
  5087. (* C[i, j] := sum; *)
  5088. END;
  5089. END;
  5090. END;
  5091. Toc( t, compT );
  5092. IF cacheC # NIL THEN
  5093. CopyDataR( matrixC, matrixCO, IncC, StrideC, IncCO,
  5094. StrideCO, RowsA, ColsB );
  5095. END;
  5096. cachePool.Release( cacheA );
  5097. cachePool.Release( cacheB );
  5098. cachePool.Release( cacheC );
  5099. RETURN TRUE;
  5100. END MatMulARARSSEStride;
  5101. PROCEDURE MatMulAXAXSSEStride( matrixA, matrixB, matrixC: ADDRESS;
  5102. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  5103. add: BOOLEAN ): BOOLEAN;
  5104. VAR sum: LONGREAL; CbFrom, i, j, k: SIZE;
  5105. valA, valB: LONGREAL; adrA, adrB, adrC: ADDRESS;
  5106. cacheA, cacheB, cacheC: Cache;
  5107. matrixCO, StrideCO, IncCO:SIZE; t: HUGEINT;
  5108. BEGIN
  5109. IF (IncA # SIZEOF( LONGREAL )) THEN
  5110. cacheA :=
  5111. cachePool.Acquire( RowsA * ColsA * SIZEOF( LONGREAL ) );
  5112. CopyDataX( matrixA, cacheA.adr, IncA, StrideA,
  5113. SIZEOF( LONGREAL ),
  5114. SIZEOF( LONGREAL ) * ColsA, RowsA, ColsA );
  5115. matrixA := cacheA.adr;
  5116. StrideA := SIZEOF( LONGREAL ) * ColsA;
  5117. IncA := SIZEOF( LONGREAL );
  5118. END;
  5119. IF (IncB # SIZEOF( LONGREAL )) THEN
  5120. cacheB :=
  5121. cachePool.Acquire( RowsB * ColsB * SIZEOF( LONGREAL ) );
  5122. CopyDataX( matrixB, cacheB.adr, IncB, StrideB,
  5123. SIZEOF( LONGREAL ),
  5124. SIZEOF( LONGREAL ) * ColsB, RowsB, ColsB );
  5125. matrixB := cacheB.adr;
  5126. StrideB := SIZEOF( LONGREAL ) * ColsB;
  5127. IncB := SIZEOF( LONGREAL );
  5128. END;
  5129. IF (IncC # SIZEOF( LONGREAL )) THEN
  5130. cacheC :=
  5131. cachePool.Acquire( RowsA * ColsB * SIZEOF( LONGREAL ) );
  5132. CopyDataX( matrixC, cacheC.adr, IncC, StrideC,
  5133. SIZEOF( LONGREAL ),
  5134. SIZEOF( LONGREAL ) * ColsB, RowsA, ColsB );
  5135. matrixCO := matrixC; StrideCO := StrideC;
  5136. IncCO := IncC; StrideC := SIZEOF( LONGREAL ) * ColsB;
  5137. IncC := SIZEOF( LONGREAL ); matrixC := cacheC.adr;
  5138. END;
  5139. Tic( t );
  5140. CbFrom := 0;
  5141. IF ColsB >= 12 THEN
  5142. SSEMul12BlockX( CbFrom, StrideA, StrideB, StrideC,
  5143. ColsA, RowsA, ColsB, RowsB, matrixA,
  5144. matrixB, matrixC, add );
  5145. END;
  5146. IF ColsB - CbFrom >= 8 THEN
  5147. SSEMul8BlockX( StrideA, StrideB, StrideC, ColsA, RowsA,
  5148. CbFrom, matrixA, matrixB, matrixC, add );
  5149. INC( CbFrom, 8 );
  5150. END;
  5151. IF ColsB - CbFrom >= 4 THEN
  5152. SSEMul4BlockX( StrideA, StrideB, StrideC, ColsA, RowsA,
  5153. CbFrom, matrixA, matrixB, matrixC, add );
  5154. INC( CbFrom, 4 );
  5155. END;
  5156. IF ColsB - CbFrom >= 2 THEN
  5157. SSEMul2BlockX( StrideA, StrideB, StrideC, ColsA, RowsA,
  5158. CbFrom, matrixA, matrixB, matrixC, add );
  5159. INC( CbFrom, 2 );
  5160. END;
  5161. IF ColsB - CbFrom > 0 THEN
  5162. (* do it in Oberon *)
  5163. FOR i := 0 TO RowsA - 1 DO
  5164. adrC := matrixC + i * StrideC + CbFrom * IncC;
  5165. FOR j := CbFrom TO ColsB - 1 DO
  5166. adrA := matrixA + i * StrideA;
  5167. adrB := matrixB + j * IncB;
  5168. IF add THEN SYSTEM.GET( adrC, sum )
  5169. ELSE sum := 0
  5170. END;
  5171. FOR k := 0 TO RowsB - 1 DO
  5172. SYSTEM.GET( adrA, valA );
  5173. SYSTEM.GET( adrB, valB );
  5174. sum := sum + (* A[i, k] * B[k, j] *) valA * valB;
  5175. INC( adrA, IncA ); INC( adrB, StrideB );
  5176. END;
  5177. SYSTEM.PUT( adrC, sum ); INC( adrC, IncC );
  5178. (* C[i, j] := sum; *)
  5179. END;
  5180. END;
  5181. END;
  5182. Toc( t, compT );
  5183. IF cacheC # NIL THEN
  5184. CopyDataX( matrixC, matrixCO, IncC, StrideC, IncCO,
  5185. StrideCO, RowsA, ColsB );
  5186. END;
  5187. cachePool.Release( cacheA );
  5188. cachePool.Release( cacheB );
  5189. cachePool.Release( cacheC );
  5190. RETURN TRUE;
  5191. END MatMulAXAXSSEStride;
5192. (****** naive Oberon matrix multiplication ******)
  5193. PROCEDURE MatMulARARNaiive( matrixA, matrixB, matrixC: ADDRESS;
  5194. IncA, StrideA, IncB, StrideB, IncC, StrideC, M, N, K: SIZE;
  5195. add: BOOLEAN );
  5196. (*
  5197. A is M x K matrix, M=rows (A); K=cols(A);
  5198. B is K x N matrix; K=rows(B); N = cols(B);
  5199. C is M x N matrix; M=rows(C); N=cols(C);
  5200. *)
  5201. VAR adrA, adrB, innerB, adrC: ADDRESS; i, j, k: SIZE;
  5202. val1, val2, sum: REAL; t: HUGEINT;
  5203. BEGIN
  5204. Tic( t );
  5205. FOR i := 1 TO M DO
  5206. adrC := matrixC; adrB := matrixB;
  5207. FOR j := 1 TO N DO
  5208. adrA := matrixA; innerB := adrB;
  5209. IF add THEN SYSTEM.GET( adrC, sum ) ELSE sum := 0 END;
  5210. FOR k := 1 TO K DO
  5211. SYSTEM.GET( adrA, val1 );
  5212. SYSTEM.GET( innerB, val2 );
  5213. sum := sum + val1 * val2; INC( adrA, IncA );
  5214. INC( innerB, StrideB );
  5215. END;
  5216. SYSTEM.PUT( adrC, sum ); INC( adrB, IncB );
  5217. INC( adrC, IncC );
  5218. END;
  5219. INC( matrixA, StrideA ); INC( matrixC, StrideC );
  5220. END;
  5221. Toc( t, compT );
  5222. END MatMulARARNaiive;
  5223. PROCEDURE MatMulAXAXNaiive( matrixA, matrixB, matrixC: ADDRESS;
  5224. IncA, StrideA, IncB, StrideB, IncC, StrideC, M, N, K: SIZE;
  5225. add: BOOLEAN );
  5226. (*
  5227. A is M x K matrix, M=rows (A); K=cols(A);
  5228. B is K x N matrix; K=rows(B); N = cols(B);
  5229. C is M x N matrix; M=rows(C); N=cols(C);
  5230. *)
  5231. VAR adrA, adrB, innerB, adrC: ADDRESS; i, j, k: SIZE;
  5232. val1, val2, sum: LONGREAL; t: HUGEINT;
  5233. BEGIN
  5234. Tic( t );
  5235. FOR i := 1 TO M DO
  5236. adrC := matrixC; adrB := matrixB;
  5237. FOR j := 1 TO N DO
  5238. adrA := matrixA; innerB := adrB;
  5239. IF add THEN SYSTEM.GET( adrC, sum ) ELSE sum := 0 END;
  5240. FOR k := 1 TO K DO
  5241. SYSTEM.GET( adrA, val1 );
  5242. SYSTEM.GET( innerB, val2 );
  5243. sum := sum + val1 * val2; INC( adrA, IncA );
  5244. INC( innerB, StrideB );
  5245. END;
  5246. SYSTEM.PUT( adrC, sum ); INC( adrB, IncB );
  5247. INC( adrC, IncC );
  5248. END;
  5249. INC( matrixA, StrideA ); INC( matrixC, StrideC );
  5250. END;
  5251. Toc( t, compT );
  5252. END MatMulAXAXNaiive;
  5253. (*
  5254. PROCEDURE Toggle( VAR A, B: LONGINT );
  5255. VAR temp: LONGINT;
  5256. BEGIN
  5257. temp := A; A := B; B := temp;
  5258. END Toggle;
  5259. PROCEDURE Transpose( VAR matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, M, N, K: LONGINT );
  5260. (*
  5261. prepare computation of C=A*B via C = (B` * A`)`
  5262. *)
  5263. BEGIN
  5264. Toggle( matrixA, matrixB ); Toggle( IncA, StrideB ); Toggle( StrideA, IncB );
  5265. Toggle( IncC, StrideC ); Toggle( M, N );
  5266. END Transpose;
  5267. *)
  5268. (*
  5269. *)
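(* BestMethod: heuristic selection of the multiplication strategy from the problem shape
   M x K times K x N. Roughly: vector-shaped and small problems use the generic scalar
   product or the naive loop (very long N=1 cases use the transposed method), medium sizes
   use the SSE stride kernels, and large M, N and K (>= 256) use the cache-blocked code. *)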
  5270. PROCEDURE BestMethod( M, N, K: SIZE ): LONGINT;
  5271. BEGIN
  5272. IF M = 1 THEN
  5273. IF N < 32 THEN RETURN cMatMulScalarProduct
  5274. ELSIF N < 256 THEN
  5275. IF K < 256 THEN RETURN cMatMulScalarProduct
  5276. ELSE RETURN cMatMulStride
  5277. END;
  5278. ELSE RETURN cMatMulStride
  5279. END;
  5280. ELSIF N = 1 THEN
  5281. IF (M > 1024) & (K > 1024) THEN
  5282. RETURN cMatMulTransposed
  5283. ELSE RETURN cMatMulScalarProduct
  5284. END;
  5285. ELSIF K = 1 THEN
  5286. IF N < 32 THEN
  5287. IF M < 256 THEN RETURN cMatMulNaive
  5288. ELSE RETURN cMatMulStride
  5289. END;
  5290. ELSIF N < 256 THEN
  5291. IF M < 32 THEN RETURN cMatMulNaive
  5292. ELSE RETURN cMatMulStride
  5293. END;
  5294. ELSE RETURN cMatMulStride
  5295. END;
  5296. ELSIF M < 32 THEN
  5297. IF N < 32 THEN RETURN cMatMulScalarProduct
  5298. ELSIF N < 256 THEN
  5299. IF K < 32 THEN RETURN cMatMulScalarProduct
  5300. ELSE RETURN cMatMulStride
  5301. END;
  5302. ELSE RETURN cMatMulStride
  5303. END;
  5304. ELSIF M < 256 THEN
  5305. IF N < 32 THEN
  5306. IF K < 32 THEN RETURN cMatMulScalarProduct
  5307. ELSE RETURN cMatMulStride
  5308. END;
  5309. ELSE
  5310. IF K < 256 THEN RETURN cMatMulStride
  5311. ELSE RETURN cMatMulBlocked
  5312. END;
  5313. END;
  5314. ELSE
  5315. IF N < 32 THEN RETURN cMatMulStride ELSE
  5316. IF K < 256 THEN RETURN cMatMulStride
  5317. ELSE RETURN cMatMulBlocked
  5318. END;
  5319. END;
  5320. END;
  5321. RETURN cMatMulStride;
  5322. END BestMethod;
5323. (*
5324.            (N)               (K)             (N)
5325.          CCCCCC            AAAAA           BBBBB
5326.          CCCCCC            AAAAA           BBBBB
5327.    (M)   CCCCCC  =  (M)    AAAAA  *  (K)   BBBBB
5328.          CCCCCC            AAAAA           BBBBB
5329.          CCCCCC            AAAAA           BBBBB
5330. *)
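(* MatMulR / MatMulX: dispatchers installed as ArrayBase.matMulR / matMulX by
   SetMatMulMethod. They pick an implementation via BestMethod; returning FALSE tells
   ArrayBase to fall back to its own scalar-product based multiplication. *)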
  5331. PROCEDURE MatMulR( matrixA, matrixB, matrixC: ADDRESS;
  5332. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
5333. (*! the heuristics for choosing among the methods need improvement *)
  5334. (*! transpose if superior*)
  5335. (*! provide special variant for small [up to 4x4] matrices *)
  5336. VAR M, N, K: SIZE;
  5337. BEGIN
  5338. ASSERT( ColsA = RowsB );
  5339. M := RowsA; N := ColsB; K := ColsA;
  5340. CASE BestMethod( M, N, K ) OF
  5341. | cMatMulScalarProduct:
  5342. RETURN FALSE;
  5343. | cMatMulNaive:
  5344. RETURN MatMulRNaive( matrixA, matrixB, matrixC, IncA,
  5345. StrideA, IncB, StrideB, IncC,
  5346. StrideC, RowsA, ColsA, RowsB,
  5347. ColsB );
  5348. | cMatMulTransposed:
  5349. RETURN MatMulARARTransposed( matrixA, matrixB,
  5350. matrixC, IncA,
  5351. StrideA, IncB,
  5352. StrideB, IncC,
  5353. StrideC, RowsA,
  5354. ColsA, RowsB,
  5355. ColsB, FALSE );
  5356. | cMatMulStride:
  5357. RETURN MatMulARARSSEStride( matrixA, matrixB,
  5358. matrixC, IncA, StrideA,
  5359. IncB, StrideB, IncC,
  5360. StrideC, RowsA,
  5361. ColsA, RowsB, ColsB,
  5362. FALSE );
  5363. | cMatMulBlocked:
  5364. RETURN MatMulARARBlocked( matrixA, matrixB,
  5365. matrixC, IncA, StrideA,
  5366. IncB, StrideB, IncC,
  5367. StrideC, RowsA, ColsA,
  5368. RowsB, ColsB, FALSE );
  5369. ELSE
  5370. RETURN FALSE (* use scalar product for each row and column *)
  5371. END;
  5372. END MatMulR;
  5373. PROCEDURE MatMulX( matrixA, matrixB, matrixC: ADDRESS;
  5374. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5375. VAR M, N, K: SIZE;
  5376. BEGIN
  5377. ASSERT( ColsA = RowsB ); M := RowsA; N := ColsB;
  5378. K := ColsA;
  5379. (*
  5380. KernelLog.String("MatMulX, M,N,K = "); KernelLog.Int(M,10); KernelLog.Int(N,10); KernelLog.Int(K,10); KernelLog.Ln;
  5381. KernelLog.String("Method= "); KernelLog.Int( BestMethod(M,N,K),10); KernelLog.Ln;
  5382. *)
  5383. CASE BestMethod( M, N, K ) OF
  5384. | cMatMulScalarProduct:
  5385. RETURN FALSE;
  5386. | cMatMulNaive:
  5387. RETURN MatMulXNaive( matrixA, matrixB, matrixC, IncA,
  5388. StrideA, IncB, StrideB, IncC,
  5389. StrideC, RowsA, ColsA, RowsB,
  5390. ColsB );
  5391. | cMatMulTransposed:
  5392. RETURN MatMulAXAXTransposed( matrixA, matrixB,
  5393. matrixC, IncA,
  5394. StrideA, IncB, StrideB,
  5395. IncC, StrideC, RowsA,
  5396. ColsA, RowsB, ColsB,
  5397. FALSE );
  5398. | cMatMulStride:
  5399. RETURN MatMulAXAXSSEStride( matrixA, matrixB,
  5400. matrixC, IncA, StrideA,
  5401. IncB, StrideB, IncC,
  5402. StrideC, RowsA, ColsA,
  5403. RowsB, ColsB,
  5404. FALSE );
  5405. | cMatMulBlocked:
  5406. RETURN MatMulAXAXBlocked( matrixA, matrixB,
  5407. matrixC, IncA, StrideA,
  5408. IncB, StrideB, IncC,
  5409. StrideC, RowsA, ColsA,
  5410. RowsB, ColsB, FALSE );
  5411. ELSE
  5412. RETURN FALSE (* use scalar product for each row and column *)
  5413. END;
  5414. END MatMulX;
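(* MatMulIncR / MatMulIncX: same dispatch as MatMulR / MatMulX, but with add = TRUE, i.e.
   the product is accumulated into C instead of overwriting it. *)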
  5415. PROCEDURE MatMulIncR( matrixA, matrixB, matrixC: ADDRESS;
  5416. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
5417. (*! the heuristics for choosing among the methods need improvement *)
  5418. (*! transpose if superior*)
  5419. (*! provide special variant for small [up to 4x4] matrices *)
  5420. VAR M, N, K: SIZE;
  5421. BEGIN
  5422. ASSERT( ColsA = RowsB ); M := RowsA; N := ColsB;
  5423. K := ColsA;
  5424. CASE BestMethod( M, N, K ) OF
  5425. | cMatMulScalarProduct:
  5426. RETURN FALSE;
  5427. | cMatMulNaive:
  5428. RETURN MatMulIncRNaive( matrixA, matrixB, matrixC,
  5429. IncA, StrideA, IncB, StrideB,
  5430. IncC, StrideC, RowsA, ColsA,
  5431. RowsB, ColsB );
  5432. | cMatMulTransposed:
  5433. RETURN MatMulARARTransposed( matrixA, matrixB,
  5434. matrixC, IncA,
  5435. StrideA, IncB,
  5436. StrideB, IncC,
  5437. StrideC, RowsA,
  5438. ColsA, RowsB,
  5439. ColsB, TRUE );
  5440. | cMatMulStride:
  5441. RETURN MatMulARARSSEStride( matrixA, matrixB,
  5442. matrixC, IncA, StrideA,
  5443. IncB, StrideB, IncC,
  5444. StrideC, RowsA,
  5445. ColsA, RowsB, ColsB,
  5446. TRUE );
  5447. | cMatMulBlocked:
  5448. RETURN MatMulARARBlocked( matrixA, matrixB,
  5449. matrixC, IncA, StrideA,
  5450. IncB, StrideB, IncC,
  5451. StrideC, RowsA, ColsA,
  5452. RowsB, ColsB, TRUE );
  5453. ELSE
  5454. RETURN FALSE (* use scalar product for each row and column *)
  5455. END;
  5456. END MatMulIncR;
  5457. PROCEDURE MatMulIncX( matrixA, matrixB, matrixC: ADDRESS;
  5458. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5459. VAR M, N, K: SIZE;
  5460. BEGIN
  5461. ASSERT( ColsA = RowsB ); M := RowsA; N := ColsB;
  5462. K := ColsA;
  5463. CASE BestMethod( M, N, K ) OF
  5464. | cMatMulScalarProduct:
  5465. RETURN FALSE;
  5466. | cMatMulNaive:
  5467. RETURN MatMulIncXNaive( matrixA, matrixB, matrixC,
  5468. IncA, StrideA, IncB, StrideB,
  5469. IncC, StrideC, RowsA, ColsA,
  5470. RowsB, ColsB );
  5471. | cMatMulTransposed:
  5472. RETURN MatMulAXAXTransposed( matrixA, matrixB,
  5473. matrixC, IncA,
  5474. StrideA, IncB, StrideB,
  5475. IncC, StrideC, RowsA,
  5476. ColsA, RowsB, ColsB,
  5477. TRUE );
  5478. | cMatMulStride:
  5479. RETURN MatMulAXAXSSEStride( matrixA, matrixB,
  5480. matrixC, IncA, StrideA,
  5481. IncB, StrideB, IncC,
  5482. StrideC, RowsA, ColsA,
  5483. RowsB, ColsB, TRUE );
  5484. | cMatMulBlocked:
  5485. RETURN MatMulAXAXBlocked( matrixA, matrixB,
  5486. matrixC, IncA, StrideA,
  5487. IncB, StrideB, IncC,
  5488. StrideC, RowsA, ColsA,
  5489. RowsB, ColsB, TRUE );
  5490. ELSE
  5491. RETURN FALSE (* use scalar product for each row and column *)
  5492. END;
  5493. END MatMulIncX;
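(* MatMulARARBlocked / MatMulAXAXBlocked: derive the L2 block sizes with MagicBlockR /
   MagicBlockX and run the copy-and-block multiplication MultiplyR / MultiplyX. *)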
  5494. PROCEDURE MatMulARARBlocked( matrixA, matrixB, matrixC: ADDRESS;
  5495. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  5496. add: BOOLEAN ): BOOLEAN;
  5497. VAR M, N, K, L2M, L2N, L2K: SIZE;
  5498. BEGIN
  5499. ASSERT( ColsA = RowsB ); M := RowsA; N := ColsB;
  5500. K := ColsA; MagicBlockR( M, N, K, L2M, L2N, L2K );
  5501. (*
  5502. MatMulARARNaiive( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB,
  5503. IncC, StrideC, RowsA, ColsB, ColsA );
  5504. *)
  5505. MultiplyR( matrixA, matrixB, matrixC, RowsA, ColsB, ColsA,
  5506. L2M, L2N, L2K, IncA, StrideA, IncB, StrideB, IncC,
  5507. StrideC, add );
  5508. RETURN TRUE;
  5509. END MatMulARARBlocked;
  5510. PROCEDURE MatMulAXAXBlocked( matrixA, matrixB, matrixC: ADDRESS;
  5511. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  5512. add: BOOLEAN ): BOOLEAN;
  5513. VAR M, N, K, L2M, L2N, L2K: SIZE;
  5514. BEGIN
  5515. ASSERT( ColsA = RowsB ); M := RowsA; N := ColsB;
  5516. K := ColsA; MagicBlockX( M, N, K, L2M, L2N, L2K );
  5517. (*
  5518. MatMulARARNaiive( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB,
  5519. IncC, StrideC, RowsA, ColsB, ColsA );
  5520. *)
  5521. MultiplyX( matrixA, matrixB, matrixC, RowsA, ColsB, ColsA,
  5522. L2M, L2N, L2K, IncA, StrideA, IncB, StrideB, IncC,
  5523. StrideC, add );
  5524. RETURN TRUE;
  5525. END MatMulAXAXBlocked;
  5526. PROCEDURE MatMulRNaive( matrixA, matrixB, matrixC: ADDRESS;
  5527. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5528. BEGIN
  5529. MatMulARARNaiive( matrixA, matrixB, matrixC, IncA, StrideA,
  5530. IncB, StrideB, IncC, StrideC, RowsA,
  5531. ColsB, ColsA, FALSE );
  5532. RETURN TRUE;
  5533. END MatMulRNaive;
  5534. PROCEDURE MatMulXNaive( matrixA, matrixB, matrixC: ADDRESS;
  5535. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5536. BEGIN
  5537. MatMulAXAXNaiive( matrixA, matrixB, matrixC, IncA, StrideA,
  5538. IncB, StrideB, IncC, StrideC, RowsA,
  5539. ColsB, ColsA, FALSE );
  5540. RETURN TRUE;
  5541. END MatMulXNaive;
  5542. PROCEDURE MatMulIncRNaive( matrixA, matrixB, matrixC: ADDRESS;
  5543. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5544. BEGIN
  5545. MatMulARARNaiive( matrixA, matrixB, matrixC, IncA, StrideA,
  5546. IncB, StrideB, IncC, StrideC, RowsA,
  5547. ColsB, ColsA, TRUE );
  5548. RETURN TRUE;
  5549. END MatMulIncRNaive;
  5550. PROCEDURE MatMulIncXNaive( matrixA, matrixB, matrixC: ADDRESS;
  5551. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5552. BEGIN
  5553. MatMulAXAXNaiive( matrixA, matrixB, matrixC, IncA, StrideA,
  5554. IncB, StrideB, IncC, StrideC, RowsA,
  5555. ColsB, ColsA, TRUE );
  5556. RETURN TRUE;
  5557. END MatMulIncXNaive;
  5558. PROCEDURE MatMulXTransposed( matrixA, matrixB, matrixC: ADDRESS;
  5559. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5560. BEGIN
  5561. RETURN MatMulAXAXTransposed( matrixA, matrixB, matrixC,
  5562. IncA, StrideA, IncB,
  5563. StrideB, IncC, StrideC,
  5564. RowsA, ColsA, RowsB,
  5565. ColsB, FALSE );
  5566. END MatMulXTransposed;
  5567. PROCEDURE MatMulIncXTransposed( matrixA, matrixB, matrixC: ADDRESS;
  5568. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5569. BEGIN
  5570. RETURN MatMulAXAXTransposed( matrixA, matrixB, matrixC,
  5571. IncA, StrideA, IncB,
  5572. StrideB, IncC, StrideC,
  5573. RowsA, ColsA, RowsB,
  5574. ColsB, TRUE )
  5575. END MatMulIncXTransposed;
  5576. PROCEDURE MatMulRTransposed( matrixA, matrixB, matrixC: ADDRESS;
  5577. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5578. BEGIN
  5579. RETURN MatMulARARTransposed( matrixA, matrixB, matrixC,
  5580. IncA, StrideA, IncB,
  5581. StrideB, IncC, StrideC,
  5582. RowsA, ColsA, RowsB,
  5583. ColsB, FALSE );
  5584. END MatMulRTransposed;
  5585. PROCEDURE MatMulIncRTransposed( matrixA, matrixB, matrixC: ADDRESS;
  5586. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5587. BEGIN
  5588. RETURN MatMulARARTransposed( matrixA, matrixB, matrixC,
  5589. IncA, StrideA, IncB,
  5590. StrideB, IncC, StrideC,
  5591. RowsA, ColsA, RowsB,
  5592. ColsB, TRUE )
  5593. END MatMulIncRTransposed;
  5594. PROCEDURE MatMulXSSEStride( matrixA, matrixB, matrixC: ADDRESS;
  5595. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5596. BEGIN
  5597. RETURN MatMulAXAXSSEStride( matrixA, matrixB, matrixC,
  5598. IncA, StrideA, IncB, StrideB,
  5599. IncC, StrideC, RowsA,
  5600. ColsA, RowsB, ColsB,
  5601. FALSE );
  5602. END MatMulXSSEStride;
  5603. PROCEDURE MatMulIncXSSEStride( matrixA, matrixB, matrixC: ADDRESS;
  5604. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5605. BEGIN
  5606. RETURN MatMulAXAXSSEStride( matrixA, matrixB, matrixC,
  5607. IncA, StrideA, IncB, StrideB,
  5608. IncC, StrideC, RowsA,
  5609. ColsA, RowsB, ColsB,
  5610. TRUE );
  5611. END MatMulIncXSSEStride;
  5612. PROCEDURE MatMulRSSEStride( matrixA, matrixB, matrixC: ADDRESS;
  5613. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5614. BEGIN
  5615. RETURN MatMulARARSSEStride( matrixA, matrixB, matrixC,
  5616. IncA, StrideA, IncB, StrideB,
  5617. IncC, StrideC, RowsA,
  5618. ColsA, RowsB, ColsB,
  5619. FALSE );
  5620. END MatMulRSSEStride;
  5621. PROCEDURE MatMulIncRSSEStride( matrixA, matrixB, matrixC: ADDRESS;
  5622. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5623. BEGIN
  5624. RETURN MatMulARARSSEStride( matrixA, matrixB, matrixC,
  5625. IncA, StrideA, IncB, StrideB,
  5626. IncC, StrideC, RowsA,
  5627. ColsA, RowsB, ColsB,
  5628. TRUE )
  5629. END MatMulIncRSSEStride;
  5630. PROCEDURE MatMulRBlocked( matrixA, matrixB, matrixC: ADDRESS;
  5631. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5632. BEGIN
  5633. RETURN MatMulARARBlocked( matrixA, matrixB, matrixC,
  5634. IncA, StrideA, IncB, StrideB,
  5635. IncC, StrideC, RowsA, ColsA,
  5636. RowsB, ColsB, FALSE )
  5637. END MatMulRBlocked;
  5638. PROCEDURE MatMulIncRBlocked( matrixA, matrixB, matrixC: ADDRESS;
  5639. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5640. BEGIN
  5641. RETURN MatMulARARBlocked( matrixA, matrixB, matrixC,
  5642. IncA, StrideA, IncB, StrideB,
  5643. IncC, StrideC, RowsA, ColsA,
  5644. RowsB, ColsB, TRUE )
  5645. END MatMulIncRBlocked;
  5646. PROCEDURE MatMulXBlocked( matrixA, matrixB, matrixC: ADDRESS;
  5647. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5648. BEGIN
  5649. RETURN MatMulAXAXBlocked( matrixA, matrixB, matrixC,
  5650. IncA, StrideA, IncB, StrideB,
  5651. IncC, StrideC, RowsA, ColsA,
  5652. RowsB, ColsB, FALSE )
  5653. END MatMulXBlocked;
  5654. PROCEDURE MatMulIncXBlocked( matrixA, matrixB, matrixC: ADDRESS;
  5655. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5656. BEGIN
  5657. RETURN MatMulAXAXBlocked( matrixA, matrixB, matrixC,
  5658. IncA, StrideA, IncB, StrideB,
  5659. IncC, StrideC, RowsA, ColsA,
  5660. RowsB, ColsB, TRUE )
  5661. END MatMulIncXBlocked;
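(* SetMatMulMethod: installs one family of multiplication entry points into the ArrayBase
   hooks matMulR/X and matMulIncR/X. cMatMulDynamic selects the BestMethod based dispatchers,
   cMatMulScalarProduct clears the hooks so that the generic ArrayBase code is used;
   e.g. SetMatMulMethod( cMatMulBlocked ) forces the blocked kernels. *)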
  5662. PROCEDURE SetMatMulMethod*( i: LONGINT );
  5663. BEGIN
  5664. KernelLog.String("ArrayBaseOptimized, method = ");
  5665. IF i = cMatMulDynamic THEN
  5666. KernelLog.String("dynamic.");
  5667. ArrayBase.matMulIncR := MatMulIncR;
  5668. ArrayBase.matMulIncX := MatMulIncX;
  5669. ArrayBase.matMulR := MatMulR;
  5670. ArrayBase.matMulX := MatMulX;
  5671. ELSIF i = cMatMulScalarProduct THEN
  5672. KernelLog.String("scalarproduct.");
  5673. ArrayBase.matMulIncR := NIL;
  5674. ArrayBase.matMulIncX := NIL;
  5675. ArrayBase.matMulR := NIL; ArrayBase.matMulX := NIL;
  5676. ELSIF i = cMatMulNaive THEN
  5677. KernelLog.String("naiive.");
  5678. ArrayBase.matMulR := MatMulRNaive;
  5679. ArrayBase.matMulX := MatMulXNaive;
  5680. ArrayBase.matMulIncR := MatMulIncRNaive;
  5681. ArrayBase.matMulIncX := MatMulIncXNaive;
  5682. ELSIF i = cMatMulTransposed THEN
  5683. KernelLog.String("transposed.");
  5684. ArrayBase.matMulR := MatMulRTransposed;
  5685. ArrayBase.matMulX := MatMulXTransposed; (* KernelLog.String( "transposed" ); KernelLog.Ln; *)
  5686. ArrayBase.matMulIncR := MatMulIncRTransposed;
  5687. ArrayBase.matMulIncX := MatMulIncXTransposed; (* KernelLog.String( "transposed" ); KernelLog.Ln; *)
  5688. ELSIF i = cMatMulStride THEN
  5689. KernelLog.String("stride.");
  5690. ArrayBase.matMulR := MatMulRSSEStride;
  5691. ArrayBase.matMulX := MatMulXSSEStride; (* KernelLog.String( "stride" ); KernelLog.Ln; *)
  5692. ArrayBase.matMulIncR := MatMulIncRSSEStride;
  5693. ArrayBase.matMulIncX := MatMulIncXSSEStride; (* KernelLog.String( "stride" ); KernelLog.Ln; *)
  5694. ELSIF i = cMatMulBlocked THEN
  5695. KernelLog.String("blocked.");
  5696. ArrayBase.matMulR := MatMulRBlocked;
  5697. ArrayBase.matMulX := MatMulXBlocked; (* KernelLog.String( "blocked" ); KernelLog.Ln; *)
  5698. ArrayBase.matMulIncR := MatMulIncRBlocked;
  5699. ArrayBase.matMulIncX := MatMulIncXBlocked; (* KernelLog.String( "blocked" ); KernelLog.Ln; *)
  5700. END;
  5701. KernelLog.Ln;
  5702. END SetMatMulMethod;
  5703. (* optimizations for small arrays (Alexey Morozov) *)
  5704. (* assumes that all arrays do not overlap *)
  5705. (* FIXME: use MOVAPS when Felix fixes problems with alignment!!! *)
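(* MatMulR2x2 keeps each 2x2 operand in one XMM register, A = [a00,a01,a10,a11] and
   B = [b00,b01,b10,b11], and forms
   C = [a00*b00+a01*b10, a00*b01+a01*b11, a10*b00+a11*b10, a10*b01+a11*b11]
   with two SHUFPS/MULPS pairs and one ADDPS; the shuffle immediates 204, 177 and 102
   produce the lane orderings noted in the per-instruction comments. *)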
  5706. PROCEDURE MatMulR2x2(dadr, ladr, radr: ADDRESS);
  5707. CODE{SYSTEM.i386, SYSTEM.SSE2}
  5708. MOV EBX, [EBP+radr] ; EBX := ADDR(right)
  5709. MOV EAX, [EBP+ladr] ; EAX := ADDR(left)
  5710. MOV ECX, [EBP+dadr] ; ECX := ADDR(dest)
  5711. MOVUPS XMM0, [EAX] ; [a00,a01,a10,a11]
  5712. MOVUPS XMM1, [EBX] ; [b00,b01,b10,b11]
  5713. MOVAPS XMM2, XMM1
  5714. SHUFPS XMM2, XMM1, 204 ; XMM2 := [b00,b11,b00,b11]
  5715. MULPS XMM2, XMM0
  5716. SHUFPS XMM0, XMM0, 177 ; XMM0 := [a01,a00,a11,a10]
  5717. SHUFPS XMM1, XMM1, 102 ; XMM1 := [b10,b01,b10,b01]
  5718. MULPS XMM1, XMM0
  5719. ADDPS XMM1, XMM2
  5720. MOVUPS [ECX], XMM1
  5721. END MatMulR2x2;
  5722. (* based on weighted sum of rows (Alexey Morozov) *)
  5723. (* FIXME: use MOVAPS when Felix fixes problems with alignment!!! *)
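(* Each row i of C is accumulated as a weighted sum of the rows of B:
   C[i,*] = a[i,0]*B[0,*] + a[i,1]*B[1,*] + a[i,2]*B[2,*], the scalar a[i,k] being broadcast
   across an XMM register with SHUFPS ...,0. The last row is stored with MOVLPS/MOVSS so that
   no write goes past the end of the 3x3 destination. *)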
  5724. PROCEDURE MatMulR3x3(dadr, ladr, radr: ADDRESS);
  5725. CODE{SYSTEM.i386, SYSTEM.SSE2}
  5726. MOV EBX, [EBP+radr] ; EBX := ADDR(right)
  5727. MOV EAX, [EBP+ladr] ; EAX := ADDR(left)
  5728. MOV ECX, [EBP+dadr] ; ECX := ADDR(dest)
  5729. MOVUPS XMM0, [EBX] ; XMM0 := [b00,b01,b02,-]
  5730. MOVUPS XMM1, [EBX+12] ; XMM1 := [b10,b11,b12,-]
  5731. ; last element is out of range, is it still OK?
  5732. MOVUPS XMM2, [EBX+24] ; XMM2 := [b20,b21,b22,-]
  5733. ;MOVLPS XMM2, [EBX+24]
  5734. ;MOVSS XMM3, [EBX+32]
  5735. ;MOVLHPS XMM2, XMM3
  5736. MOVSS XMM3, [EAX]
  5737. SHUFPS XMM3, XMM3, 0; XMM3 := [a00,a00,a00,-]
  5738. MOVAPS XMM4, XMM0
  5739. MULPS XMM4, XMM3
  5740. MOVSS XMM3, [EAX+4]
  5741. SHUFPS XMM3, XMM3, 0; XMM3 := [a01,a01,a01,-]
  5742. MULPS XMM3, XMM1
  5743. ADDPS XMM4, XMM3
  5744. MOVSS XMM3, [EAX+8]
  5745. SHUFPS XMM3, XMM3, 0; XMM3 := [a02,a02,a02,-]
  5746. MULPS XMM3, XMM2
  5747. ADDPS XMM4, XMM3
  5748. MOVUPS [ECX], XMM4
  5749. ;***************************************************;
  5750. MOVSS XMM3, [EAX+12]
  5751. SHUFPS XMM3, XMM3, 0; XMM3 := [a10,a10,a10,-]
  5752. MOVAPS XMM4, XMM0
  5753. MULPS XMM4, XMM3
  5754. MOVSS XMM3, [EAX+16]
  5755. SHUFPS XMM3, XMM3, 0; XMM3 := [a11,a11,a11,-]
  5756. MULPS XMM3, XMM1
  5757. ADDPS XMM4, XMM3
  5758. MOVSS XMM3, [EAX+20]
  5759. SHUFPS XMM3, XMM3, 0; XMM3 := [a12,a12,a12,-]
  5760. MULPS XMM3, XMM2
  5761. ADDPS XMM4, XMM3
  5762. MOVUPS [ECX+12], XMM4
  5763. ;***************************************************;
  5764. MOVSS XMM3, [EAX+24]
  5765. SHUFPS XMM3, XMM3, 0; XMM3 := [a20,a20,a20,-]
  5766. MOVAPS XMM4, XMM0
  5767. MULPS XMM4, XMM3
  5768. MOVSS XMM3, [EAX+28]
  5769. SHUFPS XMM3, XMM3, 0; XMM3 := [a21,a21,a21,-]
  5770. MULPS XMM3, XMM1
  5771. ADDPS XMM4, XMM3
  5772. MOVSS XMM3, [EAX+32]
  5773. SHUFPS XMM3, XMM3, 0; XMM3 := [a22,a22,a22,-]
  5774. MULPS XMM3, XMM2
  5775. ADDPS XMM4, XMM3
  5776. ;MOVUPS [ECX+24], XMM4
  5777. MOVLPS [ECX+24], XMM4
  5778. MOVHLPS XMM4, XMM4
  5779. MOVSS [ECX+32], XMM4
  5780. END MatMulR3x3;
5781. (* based on a 2x2 block decomposition, reusing the MatMulR2x2 scheme for each of the eight 2x2 sub-products (Alexey Morozov) *)
  5782. (* FIXME: use MOVAPS when Felix fixes issues with alignment!!! *)
  5783. PROCEDURE MatMulR4x4(dadr, ladr, radr: ADDRESS);
  5784. CODE{SYSTEM.i386, SYSTEM.SSE2}
  5785. MOV EBX, [EBP+radr] ; EBX := ADDR(right)
  5786. MOV EAX, [EBP+ladr] ; EAX := ADDR(left)
  5787. MOV ECX, [EBP+dadr] ; ECX := ADDR(dest)
  5788. ; load A00
  5789. MOVLPS XMM0, [EAX] ; XMM0 := [a00,a01,-,-]
  5790. MOVHPS XMM0, [EAX+16] ; XMM0 := [a00,a01,a10,a11]
  5791. ; load A01
  5792. MOVLPS XMM1, [EAX+8] ; XMM1 := [a02,a03,-,-]
  5793. MOVHPS XMM1, [EAX+24] ; XMM1 := [a02,a03,a12,a13]
  5794. ; load B00
  5795. MOVLPS XMM2, [EBX] ; XMM2 := [b00,b01,-,-]
  5796. MOVHPS XMM2, [EBX+16] ; XMM2 := [b00,b01,b10,b11]
  5797. ; load B01
5798. MOVLPS XMM3, [EBX+8] ; XMM3 := [b02,b03,-,-]
5799. MOVHPS XMM3, [EBX+24] ; XMM3 := [b02,b03,b12,b13]
  5800. ; load B10
  5801. MOVLPS XMM4, [EBX+32] ; XMM4 := [b20,b21,-,-]
  5802. MOVHPS XMM4, [EBX+48] ; XMM4 := [b20,b21,b30,b31]
  5803. ; load B11
  5804. MOVLPS XMM5, [EBX+40] ; XMM5 := [b22,b23,-,-]
  5805. MOVHPS XMM5, [EBX+56] ; XMM5 := [b22,b23,b32,b33]
  5806. ;****************************************************;
  5807. ; multiply A00(D)*B00(E) (use MatMulR2x2 code)
MOVAPS XMM6, XMM2
SHUFPS XMM6, XMM6, 204 ; XMM6 := [e00,e11,e00,e11]
MULPS XMM6, XMM0
SHUFPS XMM0, XMM0, 177 ; XMM0 := [d01,d00,d11,d10]
MOVAPS XMM7, XMM2
SHUFPS XMM7, XMM7, 102 ; XMM7 := [e10,e01,e10,e01]
MULPS XMM7, XMM0
ADDPS XMM7, XMM6
; multiply A01(D)*B10(E)
MOVAPS XMM0, XMM4
SHUFPS XMM0, XMM0, 204 ; XMM0 := [e00,e11,e00,e11]
MULPS XMM0, XMM1
SHUFPS XMM1, XMM1, 177 ; XMM1 := [d01,d00,d11,d10]
MOVAPS XMM6, XMM4
SHUFPS XMM6, XMM6, 102 ; XMM6 := [e10,e01,e10,e01]
MULPS XMM6, XMM1
ADDPS XMM6, XMM0
ADDPS XMM7, XMM6
MOVLPS [ECX], XMM7
MOVHPS [ECX+16], XMM7
;****************************************************;
; load A00
MOVLPS XMM0, [EAX] ; XMM0 := [a00,a01,-,-]
MOVHPS XMM0, [EAX+16] ; XMM0 := [a00,a01,a10,a11]
; load A01
MOVLPS XMM1, [EAX+8] ; XMM1 := [a02,a03,-,-]
MOVHPS XMM1, [EAX+24] ; XMM1 := [a02,a03,a12,a13]
; multiply A00(D)*B01(E) (use MatMulR2x2 code)
MOVAPS XMM6, XMM3
SHUFPS XMM6, XMM6, 204 ; XMM6 := [e00,e11,e00,e11]
MULPS XMM6, XMM0
SHUFPS XMM0, XMM0, 177 ; XMM0 := [d01,d00,d11,d10]
MOVAPS XMM7, XMM3
SHUFPS XMM7, XMM7, 102 ; XMM7 := [e10,e01,e10,e01]
MULPS XMM7, XMM0
ADDPS XMM7, XMM6
; multiply A01(D)*B11(E)
MOVAPS XMM0, XMM5
SHUFPS XMM0, XMM0, 204 ; XMM0 := [e00,e11,e00,e11]
MULPS XMM0, XMM1
SHUFPS XMM1, XMM1, 177 ; XMM1 := [d01,d00,d11,d10]
MOVAPS XMM6, XMM5
SHUFPS XMM6, XMM6, 102 ; XMM6 := [e10,e01,e10,e01]
MULPS XMM6, XMM1
ADDPS XMM6, XMM0
ADDPS XMM7, XMM6
MOVLPS [ECX+8], XMM7
MOVHPS [ECX+24], XMM7
;****************************************************;
; load A10
MOVLPS XMM0, [EAX+32] ; XMM0 := [a20,a21,-,-]
MOVHPS XMM0, [EAX+48] ; XMM0 := [a20,a21,a30,a31]
; load A11
MOVLPS XMM1, [EAX+40] ; XMM1 := [a22,a23,-,-]
MOVHPS XMM1, [EAX+56] ; XMM1 := [a22,a23,a32,a33]
; multiply A10(D)*B00(E) (use MatMulR2x2 code)
MOVAPS XMM6, XMM2
SHUFPS XMM6, XMM6, 204 ; XMM6 := [e00,e11,e00,e11]
MULPS XMM6, XMM0
SHUFPS XMM0, XMM0, 177 ; XMM0 := [d01,d00,d11,d10]
MOVAPS XMM7, XMM2
SHUFPS XMM7, XMM7, 102 ; XMM7 := [e10,e01,e10,e01]
MULPS XMM7, XMM0
ADDPS XMM7, XMM6
; multiply A11(D)*B10(E)
MOVAPS XMM0, XMM4
SHUFPS XMM0, XMM0, 204 ; XMM0 := [e00,e11,e00,e11]
MULPS XMM0, XMM1
SHUFPS XMM1, XMM1, 177 ; XMM1 := [d01,d00,d11,d10]
MOVAPS XMM6, XMM4
SHUFPS XMM6, XMM6, 102 ; XMM6 := [e10,e01,e10,e01]
MULPS XMM6, XMM1
ADDPS XMM6, XMM0
ADDPS XMM7, XMM6
MOVLPS [ECX+32], XMM7
MOVHPS [ECX+48], XMM7
;****************************************************;
; load A10
MOVLPS XMM0, [EAX+32] ; XMM0 := [a20,a21,-,-]
MOVHPS XMM0, [EAX+48] ; XMM0 := [a20,a21,a30,a31]
; load A11
MOVLPS XMM1, [EAX+40] ; XMM1 := [a22,a23,-,-]
MOVHPS XMM1, [EAX+56] ; XMM1 := [a22,a23,a32,a33]
; multiply A10(D)*B01(E) (use MatMulR2x2 code)
MOVAPS XMM6, XMM3
SHUFPS XMM6, XMM6, 204 ; XMM6 := [e00,e11,e00,e11]
MULPS XMM6, XMM0
SHUFPS XMM0, XMM0, 177 ; XMM0 := [d01,d00,d11,d10]
MOVAPS XMM7, XMM3
SHUFPS XMM7, XMM7, 102 ; XMM7 := [e10,e01,e10,e01]
MULPS XMM7, XMM0
ADDPS XMM7, XMM6
; multiply A11(D)*B11(E)
MOVAPS XMM0, XMM5
SHUFPS XMM0, XMM0, 204 ; XMM0 := [e00,e11,e00,e11]
MULPS XMM0, XMM1
SHUFPS XMM1, XMM1, 177 ; XMM1 := [d01,d00,d11,d10]
MOVAPS XMM6, XMM5
SHUFPS XMM6, XMM6, 102 ; XMM6 := [e10,e01,e10,e01]
MULPS XMM6, XMM1
ADDPS XMM6, XMM0
ADDPS XMM7, XMM6
MOVLPS [ECX+40], XMM7
MOVHPS [ECX+56], XMM7
END MatMulR4x4;
(* FIXME: use MOVAPS when Felix fixes issues with alignment!!! *)
(* FIXME: speed it up when horizontal add is available!!! *)
PROCEDURE MatVecMulR2x2(dadr, ladr, radr: ADDRESS);
CODE{SYSTEM.i386, SYSTEM.SSE2}
MOV EBX, [EBP+radr] ; EBX := ADDR(right)
MOV EAX, [EBP+ladr] ; EAX := ADDR(left)
MOV ECX, [EBP+dadr] ; ECX := ADDR(dest)
; load the whole matrix
MOVUPS XMM0, [EAX] ; XMM0 := [a00,a01,a10,a11]
MOVLPS XMM1, [EBX] ; XMM1 := [b00,b10,-,-]
MOVLHPS XMM1, XMM1 ; XMM1 := [b00,b10,b00,b10]
MULPS XMM0, XMM1 ; XMM0 := [a00*b00,a01*b10,a10*b00,a11*b10]
MOVAPS XMM1, XMM0
SHUFPS XMM0, XMM0, 8; XMM0 := [a00*b00,a10*b00,-,-]
SHUFPS XMM1, XMM1, 13; XMM1 := [a01*b10,a11*b10,-,-]
ADDPS XMM0, XMM1
MOVLPS [ECX], XMM0
END MatVecMulR2x2;
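(* Sketch of the computation above: after MULPS, XMM0 = [a00*b00, a01*b10, a10*b00, a11*b10];
the SHUFPS masks 8 and 13 regroup this into [a00*b00, a10*b00, -, -] and [a01*b10, a11*b10, -, -],
so ADDPS/MOVLPS store dest = [a00*b00 + a01*b10, a10*b00 + a11*b10], i.e. the product of a
2x2 matrix with a 2-component column vector. *)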
(* PH *)
(* to do: use MOVAPS when Felix fixes issues with alignment *)
PROCEDURE MatVecMulR4x4(dadr, ladr, radr: ADDRESS);
CODE{SYSTEM.i386, SYSTEM.SSE3}
MOV EBX, [EBP+radr] ; EBX := ADDR(right)
MOV EAX, [EBP+ladr] ; EAX := ADDR(left)
MOV ECX, [EBP+dadr] ; ECX := ADDR(dest)
MOVUPS XMM0, [EBX] ; XMM0 := [b0,b1,b2,b3]
MOVUPS XMM1, [EAX] ; XMM1 := [a00,a01,a02,a03]
MOVUPS XMM2, [EAX+16] ; XMM2 := [a10,a11,a12,a13]
MOVUPS XMM3, [EAX+32] ; XMM3 := [a20,a21,a22,a23]
MOVUPS XMM4, [EAX+48] ; XMM4 := [a30,a31,a32,a33]
MULPS XMM1, XMM0
MULPS XMM2, XMM0
HADDPS XMM1, XMM2 ; adjacent pairs are horizontally added
MULPS XMM3, XMM0
MULPS XMM4, XMM0
HADDPS XMM3, XMM4 ; adjacent pairs are horizontally added
HADDPS XMM1, XMM3 ; adjacent pairs are horizontally added
MOVUPS [ECX], XMM1
END MatVecMulR4x4;
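(* Sketch of the computation above: after the four MULPS, XMM1..XMM4 hold the element-wise
products of the rows of left with the vector right. Each HADDPS adds adjacent pairs, so the
three HADDPS instructions form a reduction tree that leaves XMM1 with the four dot products
of the rows of left with right, i.e. the destination vector. *)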
PROCEDURE InstallMatMul*(context: Commands.Context);
VAR type: LONGINT; string: ARRAY 32 OF CHAR;
BEGIN
context.arg.String(string);
IF string = "dynamic" THEN
type := cMatMulDynamic;
ELSIF string = "scalarproduct" THEN
type := cMatMulScalarProduct
ELSIF string = "naive" THEN
type := cMatMulNaive
ELSIF string = "transposed" THEN
type := cMatMulTransposed
ELSIF string = "stride" THEN
type := cMatMulStride
ELSIF string = "blocked" THEN
type := cMatMulBlocked
ELSE
KernelLog.String("unknown method: "); KernelLog.String(string); KernelLog.Ln;
type := cMatMulDynamic;
END;
SetMatMulMethod( type );
END InstallMatMul;
PROCEDURE InstallAsm*;
BEGIN
KernelLog.String( "ASM " );
ArrayBase.loopSPAXAX := SPAXAXLoopA;
ArrayBase.loopSPARAR := SPARARLoopA;
ArrayBase.loopAddAXAX := AddAXAXLoopA;
ArrayBase.loopAddARAR := AddARARLoopA;
ArrayBase.loopMatMulAXAX := MatMulAXAXLoopA;
ArrayBase.loopMatMulARAR := MatMulARARLoopA;
ArrayBase.loopMulAXSX := MulAXSXLoopA;
ArrayBase.loopIncMulAXSX := IncMulAXSXLoopA;
ArrayBase.loopMulARSR := MulARSRLoopA;
ArrayBase.loopIncMulARSR := IncMulARSRLoopA;
ArrayBase.loopMatMulIncAXAX := MatMulIncAXAXLoopA;
ArrayBase.loopMatMulIncARAR := MatMulIncARARLoopA;
ArrayBase.transpose4 := Transpose4;
ArrayBase.transpose8 := Transpose8;
END InstallAsm;
PROCEDURE InstallSSE*;
BEGIN
IF Machine.SSESupport THEN
KernelLog.String( "SSE " );
ArrayBase.loopSPARAR := SPARARLoopSSE;
ArrayBase.loopAddARAR := AddARARLoopSSE;
ArrayBase.loopMulARSR := MulARSRLoopSSE;
ArrayBase.loopIncMulARSR := IncMulARSRLoopSSE;
ArrayBase.loopMatMulARAR := MatMulARARLoopSSE;
ArrayBase.matMulR := MatMulR;
ArrayBase.loopMatMulIncARAR := MatMulIncARARLoopSSE;
ArrayBase.matMulIncR := MatMulIncR;
(* optimizations for small matrices (Alexey Morozov) *)
ArrayBase.matMulR2x2 := MatMulR2x2;
ArrayBase.matMulR3x3 := MatMulR3x3;
ArrayBase.matMulR4x4 := MatMulR4x4;
ArrayBase.matVecMulR2x2 := MatVecMulR2x2;
END;
END InstallSSE;
PROCEDURE InstallSSE2*; (* extra for testing, will be merged with Install in later versions *)
BEGIN
IF Machine.SSE2Support THEN
KernelLog.String( "SSE2 " );
ArrayBase.loopSPAXAX := SPAXAXLoopSSE;
ArrayBase.loopAddAXAX := AddAXAXLoopSSE;
ArrayBase.loopMulAXSX := MulAXSXLoopSSE;
ArrayBase.loopIncMulAXSX := IncMulAXSXLoopSSE;
ArrayBase.loopMatMulAXAX := MatMulAXAXLoopSSE;
ArrayBase.matMulX := MatMulX;
ArrayBase.loopMatMulIncAXAX := MatMulIncAXAXLoopSSE;
ArrayBase.matMulIncX := MatMulIncX;
END;
END InstallSSE2;
(*! to do: currently this only works for Win, not for native, because SSE3Support is not yet implemented in I386.Machine.Mod *)
PROCEDURE InstallSSE3*; (* extra for testing, will be merged with Install in later versions *)
BEGIN
IF Machine.SSE3Support THEN
KernelLog.String( "SSE3 " );
(* optimizations for small matrices *)
ArrayBase.matVecMulR4x4 := MatVecMulR4x4;
END;
END InstallSSE3;
PROCEDURE Install*;
BEGIN
KernelLog.String( "ArrayBaseOptimized: installing runtime library optimizations:" );
InstallAsm; InstallSSE; InstallSSE2; InstallSSE3;
KernelLog.String( " done." ); KernelLog.Ln;
END Install;
PROCEDURE SetParameters*( context: Commands.Context );
BEGIN
context.arg.SkipWhitespace; context.arg.Int(cBlockSize,TRUE);
context.arg.SkipWhitespace; context.arg.Int(nrProcesses,TRUE);
IF nrProcesses > maxProcesses THEN
nrProcesses := maxProcesses
ELSIF nrProcesses = 0 THEN nrProcesses := Machine.NumberOfProcessors();
END;
KernelLog.String( "BlockSize=" ); KernelLog.Int( cBlockSize, 0 );
KernelLog.String( ", NrProcesses = " ); KernelLog.Int( nrProcesses, 0 ); KernelLog.Ln;
END SetParameters;
BEGIN
cBlockSize := 0; (* automatic *)
nrProcesses := Machine.NumberOfProcessors(); (* automatic *)
allocT := 0; copyT := 0; compT := 0;
NEW( cachePool );
END FoxArrayBaseOptimized.
SystemTools.Free ArrayBaseOptimized ~
ArrayBaseOptimized.Install ~
ArrayBaseOptimized.InstallSSE2 ~
ArrayBaseOptimized.InstallSSE ~
ArrayBaseOptimized.InstallAsm ~
ArrayBaseOptimized.InstallMatMul dynamic ~
ArrayBaseOptimized.InstallMatMul scalarproduct ~
ArrayBaseOptimized.InstallMatMul transposed ~
ArrayBaseOptimized.InstallMatMul naive ~
ArrayBaseOptimized.InstallMatMul stride ~
ArrayBaseOptimized.InstallMatMul blocked ~
ArrayBaseOptimized.SetParameters 0 1 ~ (* BlockSize, NrProcesses *)