I386.FoxArrayBaseOptimized.Mod

  1. MODULE FoxArrayBaseOptimized; (** AUTHOR "fof"; PURPOSE ""; **)
  2. IMPORT SYSTEM, ArrayBase := FoxArrayBase, Machine, KernelLog, Commands ;
  3. CONST
  4. L2CacheSize = 512 * 1024; (* L1CacheSize = 16 * 1024; *)
  5. (* parameters for blocking matrix multiplication *)
  6. L1BlockN = 5; (* L1 block size -> nr of columns in a block that can be processed using the L1 cache *)
  7. L2BARatio = 1;
  8. L0BlockKR = 4; (* L0 block size -> nr of elements that can be processed at once for type REAL *)
  9. L1MaxBlockKR = 336; (* L1CacheSize/SIZEOF(REAL)/2/6*)
  10. L2BlockSize = 81920;
  11. L0BlockKX = 2; (* L0 block size -> nr of elements that can be processed at once for type LONGREAL *)
  12. L1MaxBlockKX = 256; (* > L1CacheSize/SIZEOF(LONGREAL)/2/6*)
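(* The constants above define the blocking hierarchy of the matrix multiplication:
   L0BlockKR/L0BlockKX give the number of elements per 128-bit SSE register (4 REALs, 2 LONGREALs),
   L1BlockN = 5 is the number of result columns computed per call of the L1Block5 kernels below,
   and L1MaxBlockKR/L1MaxBlockKX cap the inner dimension K so that one L1 block fits in the L1 cache,
   e.g. 16*1024 DIV SIZEOF(REAL) DIV 2 DIV 6 = 341, apparently rounded down to 336. *)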
  13. (*
  14. DefaultL2CacheSize = 81920;
  15. L2SizeR = L2CacheSize DIV 8; MaxBlockKR = 336; (* ca L1CacheSize/SIZEOF(REAL)/2/6*) (* nr of elements that can be processed using L2 cache *)
  16. L2SizeX = L2CacheSize DIV 8; MaxBlockKX = 256; (* a bit more than L1CacheSize/SIZEOF(LONGREAL)/2/6 *) (* nr of elements that can be processed using L2 cache *)
  17. *)
  18. debug = FALSE; parallel = TRUE; SSE = TRUE;
  19. MaxCachePoolSize = 0 (* disabled *) (* 646*1024*1024 *) (* enabled *) ;
  20. maxProcesses = 48;
  21. cMatMulDynamic* = -1; cMatMulScalarProduct* = 0;
  22. cMatMulNaive* = 1; cMatMulTransposed* = 2;
  23. cMatMulStride* = 3; cMatMulBlocked* = 4;
  24. VAR
  25. cBlockSize*: LONGINT; nrProcesses*: LONGINT;
  26. lastUsedBlockSize*: SIZE;
  27. allocT-, copyT-, zeroT-, compT-: HUGEINT;
  28. TYPE
  29. Cache = POINTER TO RECORD
  30. p: ANY;
  31. adr: ADDRESS; size: SIZE;
  32. prev, next: Cache;
  33. END;
  34. CachePool = OBJECT
  35. (*! provide heuristics for overall size *)
  36. VAR first, last: Cache;
  37. PROCEDURE & Init*;
  38. BEGIN
  39. NEW( first ); first.size := 0; (* sentinel *)
  40. NEW( last ); last.size := MAX( SIZE ); (* sentinel *)
  41. first.next := last; first.prev := NIL; last.prev := first; last.next := NIL;
  42. END Init;
  43. PROCEDURE Acquire( size: SIZE ): Cache;
  44. VAR c: Cache; t: HUGEINT;
  45. BEGIN {EXCLUSIVE}
  46. IF size = 0 THEN RETURN first END;
  47. Tic( t );
  48. c := last;
  49. WHILE (c.prev.size >= size) DO
  50. c := c.prev;
  51. END;
  52. IF c = last THEN
  53. NEW( c ); SYSTEM.NEW( c.p, size + 16 );
  54. c.adr := Align( c.p , 16 );
  55. c.size := size;
  56. ELSE
  57. c.prev.next := c.next;
  58. c.next.prev := c.prev;
  59. c.prev := NIL; c.next := NIL;
  60. END;
  61. Toc( t, allocT ); RETURN c;
  62. END Acquire;
  63. PROCEDURE Release( c: Cache );
  64. VAR t: Cache;
  65. BEGIN {EXCLUSIVE}
  66. IF (c=first) OR (c=NIL) THEN RETURN END;
  67. ASSERT(c.size > 0);
  68. IF c.size > MaxCachePoolSize THEN RETURN END;
  69. t := first;
  70. WHILE (t.size <= c.size) DO t := t.next; END;
  71. c.prev := t.prev; c.next := t; t.prev := c; c.prev.next := c;
  72. END Release;
  73. END CachePool;
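(* CachePool keeps previously allocated scratch buffers in a doubly linked list ordered by size and
   delimited by two sentinel nodes. Acquire returns the smallest cached buffer of at least the requested
   size (16-byte aligned via Align) or allocates a new one; Release reinserts a buffer in sorted order
   unless its size exceeds MaxCachePoolSize (currently 0, i.e. pooling is disabled).
   Typical use, as a sketch: c := cachePool.Acquire( size ); ... use c.adr ... ; cachePool.Release( c ). *)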
  74. ComputationObj = OBJECT
  75. VAR done: BOOLEAN;
  76. PROCEDURE & Init*;
  77. BEGIN
  78. done := FALSE;
  79. END Init;
  80. PROCEDURE Compute; (*abstract*)
  81. END Compute;
  82. PROCEDURE Wait;
  83. BEGIN {EXCLUSIVE}
  84. AWAIT( done );
  85. END Wait;
  86. BEGIN {ACTIVE, EXCLUSIVE}
  87. Compute; done := TRUE;
  88. END ComputationObj;
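(* ComputationObj is the common base of the worker objects below: the active body runs Compute
   (overridden in each subclass) in its own thread and then sets done; Wait blocks a client until
   the computation has finished. *)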
  89. MatMulHObjR = OBJECT (ComputationObj)
  90. VAR MatrixA, MatrixB, MatrixC: ADDRESS; Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
  91. add: BOOLEAN;
  92. PROCEDURE & InitR*( MatrixA, MatrixB, MatrixC: ADDRESS; Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
  93. add: BOOLEAN );
  94. BEGIN
  95. Init; SELF.MatrixA := MatrixA; SELF.MatrixB := MatrixB;
  96. SELF.MatrixC := MatrixC; SELF.Stride := Stride;
  97. SELF.IncC := IncC; SELF.StrideC := StrideC;
  98. SELF.RowsA := RowsA; SELF.RowsB := RowsB;
  99. SELF.Cols := Cols; SELF.add := add;
  100. END InitR;
  101. PROCEDURE Compute;
  102. BEGIN
  103. MatMulHBlockR( MatrixA, MatrixB, MatrixC, Stride, IncC,
  104. StrideC, RowsA, RowsB, Cols, add );
  105. END Compute;
  106. END MatMulHObjR;
  107. MatMulHObjX = OBJECT (ComputationObj)
  108. VAR MatrixA, MatrixB, MatrixC: ADDRESS; Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
  109. add: BOOLEAN;
  110. PROCEDURE & InitX*( MatrixA, MatrixB, MatrixC: ADDRESS; Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
  111. add: BOOLEAN );
  112. BEGIN
  113. Init; SELF.MatrixA := MatrixA; SELF.MatrixB := MatrixB;
  114. SELF.MatrixC := MatrixC; SELF.Stride := Stride;
  115. SELF.IncC := IncC; SELF.StrideC := StrideC;
  116. SELF.RowsA := RowsA; SELF.RowsB := RowsB;
  117. SELF.Cols := Cols; SELF.add := add;
  118. END InitX;
  119. PROCEDURE Compute;
  120. BEGIN
  121. MatMulHBlockX( MatrixA, MatrixB, MatrixC, Stride, IncC,
  122. StrideC, RowsA, RowsB, Cols, add );
  123. END Compute;
  124. END MatMulHObjX;
  125. MultiplyObjectR = OBJECT (ComputationObj);
  126. VAR adrA, adrB: ADDRESS; C, M, N, K, IncC, StrideC, L2BlockM, L2BlockN, L2BlockK:SIZE;
  127. start, finished: BOOLEAN;
  128. PROCEDURE & InitR*( adrA, adrB: ADDRESS; C, M, N, K, IncC, StrideC, L2BlockM, L2BlockN, L2BlockK: SIZE );
  129. BEGIN
  130. Init; start := FALSE; finished := FALSE;
  131. SELF.adrA := adrA; SELF.adrB := adrB; SELF.C := C;
  132. SELF.M := M; SELF.N := N; SELF.K := K;
  133. SELF.IncC := IncC; SELF.StrideC := StrideC;
  134. SELF.L2BlockM := L2BlockM;
  135. SELF.L2BlockN := L2BlockN;
  136. SELF.L2BlockK := L2BlockK;
  137. END InitR;
  138. PROCEDURE Compute;
  139. BEGIN
  140. L3BlockR( adrA, adrB, C, M, N, K, IncC, StrideC, L2BlockM,
  141. L2BlockN, L2BlockK );
  142. END Compute;
  143. END MultiplyObjectR;
  144. MultiplyObjectX = OBJECT (ComputationObj);
  145. VAR adrA, adrB:ADDRESS; C, M, N, K, IncC, StrideC, L2BlockM, L2BlockN, L2BlockK: SIZE;
  146. start, finished: BOOLEAN;
  147. PROCEDURE & InitX*( adrA, adrB: ADDRESS; C, M, N, K, IncC, StrideC, L2BlockM, L2BlockN, L2BlockK: SIZE );
  148. BEGIN
  149. Init; start := FALSE; finished := FALSE;
  150. SELF.adrA := adrA; SELF.adrB := adrB; SELF.C := C;
  151. SELF.M := M; SELF.N := N; SELF.K := K;
  152. SELF.IncC := IncC; SELF.StrideC := StrideC;
  153. SELF.L2BlockM := L2BlockM;
  154. SELF.L2BlockN := L2BlockN;
  155. SELF.L2BlockK := L2BlockK;
  156. END InitX;
  157. PROCEDURE Compute;
  158. BEGIN
  159. L3BlockX( adrA, adrB, C, M, N, K, IncC, StrideC, L2BlockM,
  160. L2BlockN, L2BlockK );
  161. END Compute;
  162. END MultiplyObjectX;
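(* The four object types above only capture the parameters of one part of a multiplication and run the
   corresponding kernel (MatMulHBlockR/X or L3BlockR/X) in Compute, so that several parts can be
   processed concurrently by parallel active objects (see nrProcesses and maxProcesses). *)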
  163. VAR
  164. (* ran: Random.Generator; (* testing *)*)
  165. cachePool: CachePool;
  166. (*********** Part 0: assembler routines ***************)
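(* Naming convention of the kernels below: L1Block1... adds a single dot product to one element of C,
   L1Block5... handles five result elements at once; suffix A denotes plain x87 FPU code, SSE denotes
   packed SSE/SSE2 code; R operates on REAL (32 bit), X on LONGREAL (64 bit) data. *)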
  167. PROCEDURE -L1Block1XA( adrA, adrB, adrC: ADDRESS; K: SIZE );
  168. CODE {SYSTEM.i386, SYSTEM.FPU}
  169. MOV EAX, [ESP+K] ; EAX IS counter
  170. MOV EDX, [ESP+adrC]
  171. MOV ECX, [ESP+adrB] ; ECX IS POINTER TO data OF matrix B
  172. MOV EBX, [ESP+adrA] ; EBX IS POINTER TO data OF matrix A
  173. FLD QWORD [EDX] ; S.GET(dadr, x)
  174. loop8:
  175. CMP EAX, 8
  176. JL loop1
  177. FLD QWORD[EBX] ; S.GET(ladr, x)
  178. ADD EBX, 8 ; INC(ladr, incl)
  179. FLD QWORD[ECX] ; S.GET(ladr, y)
  180. ADD ECX, 8 ; INC(radr, incr)
  181. FMULP ; x := x*y
  182. FADDP ; z := z+x
  183. FLD QWORD[EBX] ; S.GET(ladr, x)
  184. ADD EBX, 8 ; INC(ladr, incl)
  185. FLD QWORD[ECX] ; S.GET(ladr, y)
  186. ADD ECX, 8 ; INC(radr, incr)
  187. FMULP ; x := x*y
  188. FADDP ; z := z+x
  189. FLD QWORD[EBX] ; S.GET(ladr, x)
  190. ADD EBX, 8 ; INC(ladr, incl)
  191. FLD QWORD[ECX] ; S.GET(ladr, y)
  192. ADD ECX, 8 ; INC(radr, incr)
  193. FMULP ; x := x*y
  194. FADDP ; z := z+x
  195. FLD QWORD[EBX] ; S.GET(ladr, x)
  196. ADD EBX, 8 ; INC(ladr, incl)
  197. FLD QWORD[ECX] ; S.GET(ladr, y)
  198. ADD ECX, 8 ; INC(radr, incr)
  199. FMULP ; x := x*y
  200. FADDP ; z := z+x
  201. FLD QWORD[EBX] ; S.GET(ladr, x)
  202. ADD EBX, 8 ; INC(ladr, incl)
  203. FLD QWORD[ECX] ; S.GET(ladr, y)
  204. ADD ECX, 8 ; INC(radr, incr)
  205. FMULP ; x := x*y
  206. FADDP ; z := z+x
  207. FLD QWORD[EBX] ; S.GET(ladr, x)
  208. ADD EBX, 8 ; INC(ladr, incl)
  209. FLD QWORD[ECX] ; S.GET(ladr, y)
  210. ADD ECX, 8 ; INC(radr, incr)
  211. FMULP ; x := x*y
  212. FADDP ; z := z+x
  213. FLD QWORD[EBX] ; S.GET(ladr, x)
  214. ADD EBX, 8 ; INC(ladr, incl)
  215. FLD QWORD[ECX] ; S.GET(ladr, y)
  216. ADD ECX, 8 ; INC(radr, incr)
  217. FMULP ; x := x*y
  218. FADDP ; z := z+x
  219. FLD QWORD[EBX] ; S.GET(ladr, x)
  220. ADD EBX, 8 ; INC(ladr, incl)
  221. FLD QWORD[ECX] ; S.GET(ladr, y)
  222. ADD ECX, 8 ; INC(radr, incr)
  223. FMULP ; x := x*y
  224. FADDP ; z := z+x
  225. SUB EAX, 8 ; DEC(len)
  226. JMP loop8 ;
  227. loop1:
  228. CMP EAX, 0 ; WHILE len > 0 DO
  229. JLE endL
  230. FLD QWORD[EBX] ; S.GET(ladr, x)
  231. ADD EBX, 8 ; INC(ladr, incl)
  232. FLD QWORD[ECX] ; S.GET(ladr, y)
  233. ADD ECX, 8 ; INC(radr, incr)
  234. FMULP ; x := x*y
  235. FADDP ; z := z+x
  236. DEC EAX ; DEC(len)
  237. JMP loop1 ;
  238. endL:
  239. FSTP QWORD[EDX] ; S.PUT(dadr, x)
  240. FWAIT ;
  241. ADD ESP, 16 ;
  242. END L1Block1XA;
  243. PROCEDURE -L1Block1XSSE( adrA, adrB, adrC: ADDRESS; K: SIZE );
  244. (*
  245. matrixA, matrixB must be stored in special format
  246. K>0 guaranteed
  247. *)
  248. CODE {SYSTEM.i386, SYSTEM.SSE2}
  249. MOV EBX, [ESP+adrA] ; EBX IS POINTER TO data OF matrix A
  250. MOV ECX, [ESP+adrB] ; ECX IS POINTER TO data OF matrix B
  251. MOV EDX, [ESP+K] ; EDX IS counter
  252. XORPD XMM2, XMM2 ;
  253. kLoop8: ;
  254. CMP EDX, 8 ;
  255. JL kLoop2 ;
  256. MOVAPD XMM7, [EBX] ;
  257. MOVAPD XMM0, [ECX] ;
  258. ADD ECX, 16 ;
  259. ADD EBX, 16 ;
  260. MOVAPD XMM6, [EBX] ;
  261. MOVAPD XMM1, [ECX] ;
  262. ADD ECX, 16 ;
  263. ADD EBX, 16 ;
  264. MULPD XMM0, XMM7 ;
  265. ADDPD XMM2, XMM0 ;
  266. MOVAPD XMM5, [EBX] ;
  267. MOVAPD XMM3, [ECX] ;
  268. ADD ECX, 16 ;
  269. ADD EBX, 16 ;
  270. MULPD XMM1, XMM6 ;
  271. ADDPD XMM2, XMM1 ;
  272. MOVAPD XMM7, [EBX] ;
  273. MOVAPD XMM0, [ECX] ;
  274. ADD ECX, 16 ;
  275. ADD EBX, 16 ;
  276. MULPD XMM3, XMM5 ;
  277. ADDPD XMM2, XMM3 ;
  278. MULPD XMM0, XMM7 ;
  279. ADDPD XMM2, XMM0 ;
  280. SUB EDX, 8 ;
  281. JMP kLoop8 ;
  282. kLoop2: ;
  283. CMP EDX, 0 ;
  284. JLE horizontalAdd ;
  285. MOVAPD XMM7, [EBX] ;
  286. MOVAPD XMM0, [ECX] ;
  287. ADD ECX, 16 ;
  288. ADD EBX, 16 ;
  289. MULPD XMM0, XMM7 ;
  290. ADDPD XMM2, XMM0 ;
  291. SUB EDX, 2
  292. JMP kLoop2 ;
  293. horizontalAdd:
  294. MOV EDI, [ESP+adrC] ;
  295. MOVAPD XMM1, XMM2 ;
  296. SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
  297. ADDPD XMM2, XMM1 ;
  298. ADDSD XMM2, [EDI] ;
  299. MOVSD [EDI], XMM2 ;
  300. endL:
  301. ADD ESP, 16 ;
  302. END L1Block1XSSE;
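(* L1Block1XSSE is the SSE2 counterpart of L1Block1XA: it accumulates the products in the packed
   register XMM2 (the loop steps K by 2, so K is apparently assumed to be even) and folds the two halves
   of XMM2 into the single result element of C at horizontalAdd. *)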
  303. PROCEDURE -L1Block5XSSE( adrA, adrB, adrC: ADDRESS; IncC, K: SIZE );
  304. (*
  305. matrixA and matrix B are stored in special format !
  306. K > 0 is guaranteed
  307. *)
  308. CODE {SYSTEM.i386, SYSTEM.SSE2}
  309. MOV EBX, [ESP+adrA] ; EBX IS POINTER TO data OF matrix A
  310. MOV ECX, [ESP+adrB] ; ECX IS POINTER TO data OF matrix B
  311. MOV EDX, [ESP+K] ; EDX IS counter
  312. XORPD XMM2, XMM2 ;
  313. XORPD XMM3, XMM3 ;
  314. XORPD XMM4, XMM4 ;
  315. XORPD XMM5, XMM5 ;
  316. XORPD XMM6, XMM6 ;
  317. kLoop8: ;
  318. CMP EDX, 8 ;
  319. JL kLoop2
  320. ; (*-- 0 -- *) ;
  321. MOVAPD XMM7, [EBX] ; get 2 elements OF A
  322. ADD EBX, 16 ;
  323. MOVAPD XMM0, [ECX] ; get 2 elements OF B
  324. ADD ECX, 16 ;
  325. MOVAPD XMM1, [ECX] ; get 2 elements OF B
  326. ADD ECX, 16 ;
  327. MULPD XMM0, XMM7 ;
  328. ADDPD XMM2, XMM0 ;
  329. MOVAPD XMM0, [ECX] ;
  330. ADD ECX, 16 ;
  331. MULPD XMM1, XMM7 ;
  332. ADDPD XMM3, XMM1 ;
  333. MOVAPD XMM1, [ECX] ;
  334. ADD ECX, 16 ;
  335. MULPD XMM0, XMM7 ;
  336. ADDPD XMM4, XMM0 ;
  337. MOVAPD XMM0, [ECX] ;
  338. ADD ECX, 16 ;
  339. MULPD XMM1, XMM7 ;
  340. ADDPD XMM5, XMM1 ;
  341. MOVAPD XMM1, [ECX] ;
  342. ADD ECX, 16 ;
  343. MULPD XMM0, XMM7 ;
  344. ADDPD XMM6, XMM0
  345. ; (*-- 2 -- *) ;
  346. MOVAPD XMM7, [EBX] ;
  347. ADD EBX, 16 ;
  348. MOVAPD XMM0, [ECX] ;
  349. ADD ECX, 16 ;
  350. MULPD XMM1, XMM7 ;
  351. ADDPD XMM2, XMM1 ;
  352. MOVAPD XMM1, [ECX] ;
  353. ADD ECX, 16 ;
  354. MULPD XMM0, XMM7 ;
  355. ADDPD XMM3, XMM0 ;
  356. MOVAPD XMM0, [ECX] ;
  357. ADD ECX, 16 ;
  358. MULPD XMM1, XMM7 ;
  359. ADDPD XMM4, XMM1 ;
  360. MOVAPD XMM1, [ECX] ;
  361. ADD ECX, 16 ;
  362. MULPD XMM0, XMM7 ;
  363. ADDPD XMM5, XMM0 ;
  364. MOVAPD XMM0, [ECX] ;
  365. ADD ECX, 16 ;
  366. MULPD XMM1, XMM7 ;
  367. ADDPD XMM6, XMM1
  368. ; (*-- 4 -- *) ;
  369. MOVAPD XMM7, [EBX] ;
  370. ADD EBX, 16 ;
  371. MOVAPD XMM1, [ECX] ;
  372. ADD ECX, 16 ;
  373. MULPD XMM0, XMM7 ;
  374. ADDPD XMM2, XMM0 ;
  375. MOVAPD XMM0, [ECX] ;
  376. ADD ECX, 16 ;
  377. MULPD XMM1, XMM7 ;
  378. ADDPD XMM3, XMM1 ;
  379. MOVAPD XMM1, [ECX] ;
  380. ADD ECX, 16 ;
  381. MULPD XMM0, XMM7 ;
  382. ADDPD XMM4, XMM0 ;
  383. MOVAPD XMM0, [ECX] ;
  384. ADD ECX, 16 ;
  385. MULPD XMM1, XMM7 ;
  386. ADDPD XMM5, XMM1 ;
  387. MOVAPD XMM1, [ECX] ;
  388. ADD ECX, 16 ;
  389. MULPD XMM0, XMM7 ;
  390. ADDPD XMM6, XMM0
  391. ; (*-- 6 -- *) ;
  392. MOVAPD XMM7, [EBX] ;
  393. ADD EBX, 16 ;
  394. MOVAPD XMM0, [ECX] ;
  395. ADD ECX, 16 ;
  396. MULPD XMM1, XMM7 ;
  397. ADDPD XMM2, XMM1 ;
  398. MOVAPD XMM1, [ECX] ;
  399. ADD ECX, 16 ;
  400. MULPD XMM0, XMM7 ;
  401. ADDPD XMM3, XMM0 ;
  402. MOVAPD XMM0, [ECX] ;
  403. ADD ECX, 16 ;
  404. MULPD XMM1, XMM7 ;
  405. ADDPD XMM4, XMM1 ;
  406. MOVAPD XMM1, [ECX] ;
  407. ADD ECX, 16 ;
  408. MULPD XMM0, XMM7 ;
  409. ADDPD XMM5, XMM0 ;
  410. MULPD XMM1, XMM7 ;
  411. ADDPD XMM6, XMM1 ;
  412. SUB EDX, 8
  413. JMP kLoop8 ;
  414. kLoop2: ;
  415. CMP EDX, 0 ;
  416. JLE horizontalAdd ;
  417. MOVAPD XMM7, [EBX] ;
  418. ADD EBX, 16 ;
  419. MOVAPD XMM0, [ECX] ;
  420. ADD ECX, 16 ;
  421. MOVAPD XMM1, [ECX] ;
  422. ADD ECX, 16 ;
  423. MULPD XMM0, XMM7 ;
  424. ADDPD XMM2, XMM0 ;
  425. MOVAPD XMM0, [ECX] ;
  426. ADD ECX, 16 ;
  427. MULPD XMM1, XMM7 ;
  428. ADDPD XMM3, XMM1 ;
  429. MOVAPD XMM1, [ECX] ;
  430. ADD ECX, 16 ;
  431. MULPD XMM0, XMM7 ;
  432. ADDPD XMM4, XMM0 ;
  433. MOVAPD XMM0, [ECX] ;
  434. ADD ECX, 16 ;
  435. MULPD XMM1, XMM7 ;
  436. ADDPD XMM5, XMM1 ;
  437. MULPD XMM0, XMM7 ;
  438. ADDPD XMM6, XMM0 ;
  439. SUB EDX, 2
  440. JMP kLoop2 ;
  441. horizontalAdd: ; add and store
  442. MOV EDI, [ESP+adrC] ;
  443. MOV EAX, [ESP+IncC] ;
  444. MOVAPD XMM1, XMM2 ;
  445. SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
  446. ADDPD XMM2, XMM1 ;
  447. ADDSD XMM2, [EDI] ;
  448. MOVSD [EDI], XMM2 ;
  449. ADD EDI, EAX ;
  450. MOVAPD XMM1, XMM3 ;
  451. SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
  452. ADDPD XMM3, XMM1 ;
  453. ADDSD XMM3, [EDI] ;
  454. MOVSD [EDI], XMM3 ;
  455. ADD EDI, EAX ;
  456. MOVAPD XMM1, XMM4 ;
  457. SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
  458. ADDPD XMM4, XMM1 ;
  459. ADDSD XMM4, [EDI] ;
  460. MOVSD [EDI], XMM4 ;
  461. ADD EDI, EAX ;
  462. MOVAPD XMM1, XMM5 ;
  463. SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
  464. ADDPD XMM5, XMM1 ;
  465. ADDSD XMM5, [EDI] ;
  466. MOVSD [EDI], XMM5 ;
  467. ADD EDI, EAX ;
  468. MOVAPD XMM1, XMM6 ;
  469. SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
  470. ADDPD XMM6, XMM1 ;
  471. ADDSD XMM6, [EDI] ;
  472. MOVSD [EDI], XMM6 ;
  473. endL:
  474. ADD ESP, 20 ;
  475. END L1Block5XSSE;
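(* L1Block5XSSE keeps five packed accumulators (XMM2..XMM6), one per column of B: each step loads two
   elements of A into XMM7 and multiply-adds them with five pairs of B elements; at horizontalAdd each
   accumulator is reduced to a scalar and added to the corresponding element of C, stepping by IncC. *)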
  476. PROCEDURE -L1Block1RA( adrA, adrB, adrC: ADDRESS; K: SIZE );
  477. CODE {SYSTEM.i386, SYSTEM.FPU}
  478. MOV EAX, [ESP+K] ; EAX IS counter
  479. MOV EDX, [ESP+adrC]
  480. MOV ECX, [ESP+adrB] ; ECX IS POINTER TO data OF matrix B
  481. MOV EBX, [ESP+adrA] ; EBX IS POINTER TO data OF matrix A
  482. FLD DWORD [EDX] ; S.GET(dadr, x)
  483. loop16:
  484. CMP EAX, 16
  485. JL loop1
  486. FLD DWORD[EBX] ; S.GET(ladr, x)
  487. ADD EBX, 4 ; INC(ladr, incl)
  488. FLD DWORD[ECX] ; S.GET(ladr, y)
  489. ADD ECX, 4 ; INC(radr, incr)
  490. FMULP ; x := x*y
  491. FADDP ; z := z+x
  492. FLD DWORD[EBX] ; S.GET(ladr, x)
  493. ADD EBX, 4 ; INC(ladr, incl)
  494. FLD DWORD[ECX] ; S.GET(ladr, y)
  495. ADD ECX, 4 ; INC(radr, incr)
  496. FMULP ; x := x*y
  497. FADDP ; z := z+x
  498. FLD DWORD[EBX] ; S.GET(ladr, x)
  499. ADD EBX, 4 ; INC(ladr, incl)
  500. FLD DWORD[ECX] ; S.GET(ladr, y)
  501. ADD ECX, 4 ; INC(radr, incr)
  502. FMULP ; x := x*y
  503. FADDP ; z := z+x
  504. FLD DWORD[EBX] ; S.GET(ladr, x)
  505. ADD EBX, 4 ; INC(ladr, incl)
  506. FLD DWORD[ECX] ; S.GET(ladr, y)
  507. ADD ECX, 4 ; INC(radr, incr)
  508. FMULP ; x := x*y
  509. FADDP ; z := z+x
  510. FLD DWORD[EBX] ; S.GET(ladr, x)
  511. ADD EBX, 4 ; INC(ladr, incl)
  512. FLD DWORD[ECX] ; S.GET(ladr, y)
  513. ADD ECX, 4 ; INC(radr, incr)
  514. FMULP ; x := x*y
  515. FADDP ; z := z+x
  516. FLD DWORD[EBX] ; S.GET(ladr, x)
  517. ADD EBX, 4 ; INC(ladr, incl)
  518. FLD DWORD[ECX] ; S.GET(ladr, y)
  519. ADD ECX, 4 ; INC(radr, incr)
  520. FMULP ; x := x*y
  521. FADDP ; z := z+x
  522. FLD DWORD[EBX] ; S.GET(ladr, x)
  523. ADD EBX, 4 ; INC(ladr, incl)
  524. FLD DWORD[ECX] ; S.GET(ladr, y)
  525. ADD ECX, 4 ; INC(radr, incr)
  526. FMULP ; x := x*y
  527. FADDP ; z := z+x
  528. FLD DWORD[EBX] ; S.GET(ladr, x)
  529. ADD EBX, 4 ; INC(ladr, incl)
  530. FLD DWORD[ECX] ; S.GET(ladr, y)
  531. ADD ECX, 4 ; INC(radr, incr)
  532. FMULP ; x := x*y
  533. FADDP ; z := z+x
  534. FLD DWORD[EBX] ; S.GET(ladr, x)
  535. ADD EBX, 4 ; INC(ladr, incl)
  536. FLD DWORD[ECX] ; S.GET(ladr, y)
  537. ADD ECX, 4 ; INC(radr, incr)
  538. FMULP ; x := x*y
  539. FADDP ; z := z+x
  540. FLD DWORD[EBX] ; S.GET(ladr, x)
  541. ADD EBX, 4 ; INC(ladr, incl)
  542. FLD DWORD[ECX] ; S.GET(ladr, y)
  543. ADD ECX, 4 ; INC(radr, incr)
  544. FMULP ; x := x*y
  545. FADDP ; z := z+x
  546. FLD DWORD[EBX] ; S.GET(ladr, x)
  547. ADD EBX, 4 ; INC(ladr, incl)
  548. FLD DWORD[ECX] ; S.GET(ladr, y)
  549. ADD ECX, 4 ; INC(radr, incr)
  550. FMULP ; x := x*y
  551. FADDP ; z := z+x
  552. FLD DWORD[EBX] ; S.GET(ladr, x)
  553. ADD EBX, 4 ; INC(ladr, incl)
  554. FLD DWORD[ECX] ; S.GET(ladr, y)
  555. ADD ECX, 4 ; INC(radr, incr)
  556. FMULP ; x := x*y
  557. FADDP ; z := z+x
  558. FLD DWORD[EBX] ; S.GET(ladr, x)
  559. ADD EBX, 4 ; INC(ladr, incl)
  560. FLD DWORD[ECX] ; S.GET(ladr, y)
  561. ADD ECX, 4 ; INC(radr, incr)
  562. FMULP ; x := x*y
  563. FADDP ; z := z+x
  564. FLD DWORD[EBX] ; S.GET(ladr, x)
  565. ADD EBX, 4 ; INC(ladr, incl)
  566. FLD DWORD[ECX] ; S.GET(ladr, y)
  567. ADD ECX, 4 ; INC(radr, incr)
  568. FMULP ; x := x*y
  569. FADDP ; z := z+x
  570. FLD DWORD[EBX] ; S.GET(ladr, x)
  571. ADD EBX, 4 ; INC(ladr, incl)
  572. FLD DWORD[ECX] ; S.GET(ladr, y)
  573. ADD ECX, 4 ; INC(radr, incr)
  574. FMULP ; x := x*y
  575. FADDP ; z := z+x
  576. FLD DWORD[EBX] ; S.GET(ladr, x)
  577. ADD EBX, 4 ; INC(ladr, incl)
  578. FLD DWORD[ECX] ; S.GET(ladr, y)
  579. ADD ECX, 4 ; INC(radr, incr)
  580. FMULP ; x := x*y
  581. FADDP ; z := z+x
  582. SUB EAX, 16 ; DEC(len)
  583. JMP loop16 ;
  584. loop1:
  585. CMP EAX, 0 ; WHILE len > 0 DO
  586. JLE endL
  587. FLD DWORD[EBX] ; S.GET(ladr, x)
  588. ADD EBX, 4 ; INC(ladr, incl)
  589. FLD DWORD[ECX] ; S.GET(ladr, y)
  590. ADD ECX, 4 ; INC(radr, incr)
  591. FMULP ; x := x*y
  592. FADDP ; z := z+x
  593. DEC EAX ; DEC(len)
  594. JMP loop1 ;
  595. endL:
  596. FSTP DWORD[EDX] ; S.PUT(dadr, x)
  597. FWAIT ;
  598. ADD ESP, 16 ;
  599. END L1Block1RA;
  600. PROCEDURE -L1Block1RSSE( adrA, adrB, adrC: ADDRESS; K: SIZE );
  601. (*
  602. matrixA, matrixB must be stored in special format
  603. K>0 guaranteed
  604. *)
  605. CODE {SYSTEM.i386, SYSTEM.SSE}
  606. MOV EBX, [ESP+adrA] ; EBX IS POINTER TO data OF matrix A
  607. MOV ECX, [ESP+adrB] ; ECX IS POINTER TO data OF matrix B
  608. MOV EDX, [ESP+K] ; EDX IS counter
  609. XORPS XMM2, XMM2 ;
  610. kLoop16: ;
  611. CMP EDX, 16 ;
  612. JL kLoop4 ;
  613. MOVAPS XMM7, [EBX] ;
  614. MOVAPS XMM0, [ECX] ;
  615. ADD ECX, 16 ;
  616. ADD EBX, 16 ;
  617. MOVAPS XMM6, [EBX] ;
  618. MOVAPS XMM1, [ECX] ;
  619. ADD ECX, 16 ;
  620. ADD EBX, 16 ;
  621. MULPS XMM0, XMM7 ;
  622. ADDPS XMM2, XMM0 ;
  623. MOVAPS XMM5, [EBX] ;
  624. MOVAPS XMM3, [ECX] ;
  625. ADD ECX, 16 ;
  626. ADD EBX, 16 ;
  627. MULPS XMM1, XMM6 ;
  628. ADDPS XMM2, XMM1 ;
  629. MOVAPS XMM7, [EBX] ;
  630. MOVAPS XMM0, [ECX] ;
  631. ADD ECX, 16 ;
  632. ADD EBX, 16 ;
  633. MULPS XMM3, XMM5 ;
  634. ADDPS XMM2, XMM3 ;
  635. MULPS XMM0, XMM7 ;
  636. ADDPS XMM2, XMM0 ;
  637. SUB EDX, 16 ;
  638. JMP kLoop16 ;
  639. kLoop4: ;
  640. CMP EDX, 0 ;
  641. JLE horizontalAdd ;
  642. MOVAPS XMM7, [EBX] ;
  643. MOVAPS XMM0, [ECX] ;
  644. ADD ECX, 16 ;
  645. ADD EBX, 16 ;
  646. MULPS XMM0, XMM7 ;
  647. ADDPS XMM2, XMM0 ;
  648. SUB EDX, 4
  649. JMP kLoop4 ;
  650. horizontalAdd:
  651. MOV EDI, [ESP+adrC] ;
  652. MOVLHPS XMM1, XMM2 ;
  653. ADDPS XMM1, XMM2 ;
  654. SHUFPS XMM2, XMM1, 48 ;
  655. ADDPS XMM2, XMM1 ;
  656. MOVHLPS XMM2, XMM2 ;
  657. ADDSS XMM2, [EDI] ;
  658. MOVSS [EDI], XMM2 ;
  659. endL:
  660. ADD ESP, 16 ;
  661. END L1Block1RSSE;
  662. PROCEDURE -L1Block5RSSE( adrA, adrB, adrC: ADDRESS; IncC, K: SIZE );
  663. (*
  664. matrixA and matrix B are stored in special format !
  665. K > 0 is guaranteed
  666. *)
  667. CODE {SYSTEM.i386, SYSTEM.SSE}
  668. MOV EBX, [ESP+adrA] ; EBX IS POINTER TO data OF matrix A
  669. MOV ECX, [ESP+adrB] ; ECX IS POINTER TO data OF matrix B
  670. MOV EDX, [ESP+K] ; EDX IS counter
  671. XORPS XMM2, XMM2 ;
  672. XORPS XMM3, XMM3 ;
  673. XORPS XMM4, XMM4 ;
  674. XORPS XMM5, XMM5 ;
  675. XORPS XMM6, XMM6 ;
  676. kLoop16: ;
  677. CMP EDX, 16 ;
  678. JL kLoop4 ; (*-- 0 -- *)
  679. MOVAPS XMM7, [EBX] ; get 4 elements OF A
  680. ADD EBX, 16 ;
  681. MOVAPS XMM0, [ECX] ; get 4 elements OF B
  682. ADD ECX, 16 ;
  683. MOVAPS XMM1, [ECX] ; get 4 elements OF B
  684. ADD ECX, 16 ;
  685. MULPS XMM0, XMM7 ;
  686. ADDPS XMM2, XMM0 ;
  687. MOVAPS XMM0, [ECX] ;
  688. ADD ECX, 16 ;
  689. MULPS XMM1, XMM7 ;
  690. ADDPS XMM3, XMM1 ;
  691. MOVAPS XMM1, [ECX] ;
  692. ADD ECX, 16 ;
  693. MULPS XMM0, XMM7 ;
  694. ADDPS XMM4, XMM0 ;
  695. MOVAPS XMM0, [ECX] ;
  696. ADD ECX, 16 ;
  697. MULPS XMM1, XMM7 ;
  698. ADDPS XMM5, XMM1 ;
  699. MOVAPS XMM1, [ECX] ;
  700. ADD ECX, 16 ;
  701. MULPS XMM0, XMM7 ;
  702. ADDPS XMM6, XMM0
  703. ; (*-- 4 -- *) ;
  704. MOVAPS XMM7, [EBX] ;
  705. ADD EBX, 16 ;
  706. MOVAPS XMM0, [ECX] ;
  707. ADD ECX, 16 ;
  708. MULPS XMM1, XMM7 ;
  709. ADDPS XMM2, XMM1 ;
  710. MOVAPS XMM1, [ECX] ;
  711. ADD ECX, 16 ;
  712. MULPS XMM0, XMM7 ;
  713. ADDPS XMM3, XMM0 ;
  714. MOVAPS XMM0, [ECX] ;
  715. ADD ECX, 16 ;
  716. MULPS XMM1, XMM7 ;
  717. ADDPS XMM4, XMM1 ;
  718. MOVAPS XMM1, [ECX] ;
  719. ADD ECX, 16 ;
  720. MULPS XMM0, XMM7 ;
  721. ADDPS XMM5, XMM0 ;
  722. MOVAPS XMM0, [ECX] ;
  723. ADD ECX, 16 ;
  724. MULPS XMM1, XMM7 ;
  725. ADDPS XMM6, XMM1
  726. ; (*-- 8 -- *) ;
  727. MOVAPS XMM7, [EBX] ;
  728. ADD EBX, 16 ;
  729. MOVAPS XMM1, [ECX] ;
  730. ADD ECX, 16 ;
  731. MULPS XMM0, XMM7 ;
  732. ADDPS XMM2, XMM0 ;
  733. MOVAPS XMM0, [ECX] ;
  734. ADD ECX, 16 ;
  735. MULPS XMM1, XMM7 ;
  736. ADDPS XMM3, XMM1 ;
  737. MOVAPS XMM1, [ECX] ;
  738. ADD ECX, 16 ;
  739. MULPS XMM0, XMM7 ;
  740. ADDPS XMM4, XMM0 ;
  741. MOVAPS XMM0, [ECX] ;
  742. ADD ECX, 16 ;
  743. MULPS XMM1, XMM7 ;
  744. ADDPS XMM5, XMM1 ;
  745. MOVAPS XMM1, [ECX] ;
  746. ADD ECX, 16 ;
  747. MULPS XMM0, XMM7 ;
  748. ADDPS XMM6, XMM0
  749. ; (*-- 12 -- *) ;
  750. MOVAPS XMM7, [EBX] ;
  751. ADD EBX, 16 ;
  752. MOVAPS XMM0, [ECX] ;
  753. ADD ECX, 16 ;
  754. MULPS XMM1, XMM7 ;
  755. ADDPS XMM2, XMM1 ;
  756. MOVAPS XMM1, [ECX] ;
  757. ADD ECX, 16 ;
  758. MULPS XMM0, XMM7 ;
  759. ADDPS XMM3, XMM0 ;
  760. MOVAPS XMM0, [ECX] ;
  761. ADD ECX, 16 ;
  762. MULPS XMM1, XMM7 ;
  763. ADDPS XMM4, XMM1 ;
  764. MOVAPS XMM1, [ECX] ;
  765. ADD ECX, 16 ;
  766. MULPS XMM0, XMM7 ;
  767. ADDPS XMM5, XMM0 ;
  768. MULPS XMM1, XMM7 ;
  769. ADDPS XMM6, XMM1 ;
  770. SUB EDX, 16
  771. JMP kLoop16 ;
  772. kLoop4: ;
  773. CMP EDX, 0 ;
  774. JLE horizontalAdd ;
  775. MOVAPS XMM7, [EBX] ;
  776. ADD EBX, 16 ;
  777. MOVAPS XMM0, [ECX] ;
  778. ADD ECX, 16 ;
  779. MOVAPS XMM1, [ECX] ;
  780. ADD ECX, 16 ;
  781. MULPS XMM0, XMM7 ;
  782. ADDPS XMM2, XMM0 ;
  783. MOVAPS XMM0, [ECX] ;
  784. ADD ECX, 16 ;
  785. MULPS XMM1, XMM7 ;
  786. ADDPS XMM3, XMM1 ;
  787. MOVAPS XMM1, [ECX] ;
  788. ADD ECX, 16 ;
  789. MULPS XMM0, XMM7 ;
  790. ADDPS XMM4, XMM0 ;
  791. MOVAPS XMM0, [ECX] ;
  792. ADD ECX, 16 ;
  793. MULPS XMM1, XMM7 ;
  794. ADDPS XMM5, XMM1 ;
  795. MULPS XMM0, XMM7 ;
  796. ADDPS XMM6, XMM0 ;
  797. SUB EDX, 4
  798. JMP kLoop4 ;
  799. horizontalAdd: ; add and store
  800. MOV EDI, [ESP+adrC] ;
  801. MOV EAX, [ESP+IncC] ;
  802. MOVLHPS XMM1, XMM2 ;
  803. ADDPS XMM1, XMM2 ;
  804. SHUFPS XMM2, XMM1, 48 ;
  805. ADDPS XMM2, XMM1 ;
  806. MOVHLPS XMM2, XMM2 ;
  807. ADDSS XMM2, [EDI] ;
  808. MOVSS [EDI], XMM2 ;
  809. ADD EDI, EAX ;
  810. MOVLHPS XMM1, XMM3 ;
  811. ADDPS XMM1, XMM3 ;
  812. SHUFPS XMM3, XMM1, 48 ;
  813. ADDPS XMM3, XMM1 ;
  814. MOVHLPS XMM3, XMM3 ;
  815. ADDSS XMM3, [EDI] ;
  816. MOVSS [EDI], XMM3 ;
  817. ADD EDI, EAX ;
  818. MOVLHPS XMM1, XMM4 ;
  819. ADDPS XMM1, XMM4 ;
  820. SHUFPS XMM4, XMM1, 48 ;
  821. ADDPS XMM4, XMM1 ;
  822. MOVHLPS XMM4, XMM4 ;
  823. ADDSS XMM4, [EDI] ;
  824. MOVSS [EDI], XMM4 ;
  825. ADD EDI, EAX ;
  826. MOVLHPS XMM1, XMM5 ;
  827. ADDPS XMM1, XMM5 ;
  828. SHUFPS XMM5, XMM1, 48 ;
  829. ADDPS XMM5, XMM1 ;
  830. MOVHLPS XMM5, XMM5 ;
  831. ADDSS XMM5, [EDI] ;
  832. MOVSS [EDI], XMM5 ;
  833. ADD EDI, EAX ;
  834. MOVLHPS XMM1, XMM6 ;
  835. ADDPS XMM1, XMM6 ;
  836. SHUFPS XMM6, XMM1, 48 ;
  837. ADDPS XMM6, XMM1 ;
  838. MOVHLPS XMM6, XMM6 ;
  839. ADDSS XMM6, [EDI] ;
  840. MOVSS [EDI], XMM6 ;
  841. endL:
  842. ADD ESP, 20 ;
  843. END L1Block5RSSE;
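(* L1Block5RSSE follows the same five-accumulator scheme as L1Block5XSSE, but with four REALs per
   128-bit register, hence the 16-fold unrolled main loop and the SHUFPS/MOVHLPS reduction at the end. *)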
  844. PROCEDURE -Align4( adr: ADDRESS ): ADDRESS;
  845. CODE {SYSTEM.i386}
  846. MOV EAX, [ESP+adr] ;
  847. NEG EAX ;
  848. AND EAX, 3H ;
  849. ADD EAX, [ESP+adr] ;
  850. ADD ESP, 4
  851. END Align4;
  852. PROCEDURE -Align2( adr: ADDRESS ): ADDRESS;
  853. CODE {SYSTEM.i386}
  854. MOV EAX, [ESP+adr] ;
  855. NEG EAX ;
  856. AND EAX, 1H ;
  857. ADD EAX, [ESP+adr] ;
  858. ADD ESP, 4
  859. END Align2;
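(* Align4 and Align2 round an address up to the next multiple of 4 and 2, respectively; the result is
   adr + ((-adr) AND mask). *)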
  860. PROCEDURE -ZeroR( adr: ADDRESS; count: SIZE );
  861. (** For 32 bit types *)
  862. CODE {SYSTEM.i386}
  863. MOV EDI, [ESP+adr] ; address OF dest index
  864. MOV ECX, [ESP+count] ; counter
  865. MOV EAX, 0 ; value
  866. CLD ; incremental
  867. REP ;
  868. STOSD ;
  869. ADD ESP, 8 ;
  870. END ZeroR;
  871. PROCEDURE -ZeroX( adr: ADDRESS; count: SIZE );
  872. (** For 64 bit types *)
  873. CODE {SYSTEM.i386}
  874. MOV EDI, [ESP+adr] ; address OF dest index
  875. MOV ECX, [ESP+count] ; counter
  876. SHL ECX, 1 ;
  877. MOV EAX, 0 ; value
  878. CLD ; incremental
  879. REP ;
  880. STOSD ;
  881. ADD ESP, 8 ;
  882. END ZeroX;
  883. PROCEDURE -ZeroRI( adr: ADDRESS; inc, count: SIZE );
  884. (** For 32 bit types *)
  885. CODE {SYSTEM.i386}
  886. MOV EDI, [ESP+adr] ; address OF dest index
  887. MOV EBX, [ESP+inc] ;
  888. MOV ECX, [ESP+count] ; counter
  889. CMP EBX, 4 ;
  890. JE fastzero ;
  891. MOV EAX, 0 ;
  892. loopL:
  893. CMP ECX, 0 ;
  894. JLE endL ;
  895. MOV [EDI], EAX ;
  896. ADD EDI, EBX ;
  897. DEC ECX ;
  898. JMP loopL ;
  899. fastzero:
  900. MOV EAX, 0 ; value
  901. CLD ; incremental
  902. REP ;
  903. STOSD ;
  904. endL:
  905. ADD ESP, 12 ;
  906. END ZeroRI;
  907. PROCEDURE -ZeroXI( adr: ADDRESS; inc, count: SIZE );
  908. (** For 64 bit types *)
  909. CODE {SYSTEM.i386}
  910. MOV EDI, [ESP+adr] ; address OF dest index
  911. MOV EBX, [ESP+inc] ;
  912. MOV ECX, [ESP+count] ; counter
  913. MOV EAX, 0 ;
  914. CMP EBX, 8 ;
  915. JE fastzero ;
  916. loopL:
  917. CMP ECX, 0 ;
  918. JLE endL ;
  919. MOV [EDI], EAX ;
  920. MOV [EDI+4], EAX ;
  921. ADD EDI, EBX ;
  922. DEC ECX ;
  923. JMP loopL ;
  924. fastzero:
  925. SHL ECX, 1 ;
  926. CLD ; incremental
  927. REP ;
  928. STOSD ;
  929. endL:
  930. ADD ESP, 12 ;
  931. END ZeroXI;
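(* The Zero routines clear count elements: ZeroR/ZeroX handle contiguous 32/64 bit data with a single
   REP STOSD, while ZeroRI/ZeroXI accept an arbitrary increment and fall back to REP STOSD when the
   increment equals the element size. *)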
  932. PROCEDURE -MovR( from, to0: ADDRESS; frominc, count: SIZE );
  933. CODE {SYSTEM.i386}
  934. MOV EDI, [ESP+to0] ; TO
  935. MOV ESI, [ESP+from] ; from
  936. MOV ECX, [ESP+count] ; count
  937. MOV EBX, [ESP+frominc] ; inc
  938. CMP EBX, 4 ;
  939. JE fastmove ;
  940. loopL:
  941. CMP ECX, 0 ;
  942. JLE endL ;
  943. MOV EAX, [ESI] ;
  944. MOV [EDI], EAX ;
  945. ADD ESI, EBX ;
  946. ADD EDI, 4 ;
  947. DEC ECX ;
  948. JMP loopL ;
  949. fastmove:
  950. CLD ; incremental
  951. REP ;
  952. MOVSD ; move rest IN one byte steps
  953. endL:
  954. ADD ESP, 16 ;
  955. END MovR;
  956. PROCEDURE -MovX( from, to0: ADDRESS; frominc, count:SIZE );
  957. CODE {SYSTEM.i386}
  958. MOV EDI, [ESP+to0] ; TO
  959. MOV ESI, [ESP+from] ; from
  960. MOV ECX, [ESP+count] ; count
  961. MOV EBX, [ESP+frominc] ; inc
  962. CMP EBX, 8 ;
  963. JE fastmove ;
  964. loopL:
  965. CMP ECX, 0 ;
  966. JLE endL ;
  967. MOV EAX, [ESI] ;
  968. MOV [EDI], EAX ;
  969. MOV EAX, [ESI+4] ;
  970. MOV [EDI+4], EAX ;
  971. ADD ESI, EBX ;
  972. ADD EDI, 8 ;
  973. DEC ECX ;
  974. JMP loopL ;
  975. fastmove:
  976. SHL ECX, 1 ;
  977. CLD ; incremental
  978. REP ;
  979. MOVSD ; move rest IN one byte steps
  980. endL:
  981. ADD ESP, 16 ;
  982. END MovX;
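(* MovR and MovX copy count 32/64 bit elements from a source with increment frominc into a contiguous
   destination; if the source is contiguous as well, the copy degenerates to a single REP MOVSD. *)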
  983. PROCEDURE -MovR5( src: ADDRESS; inc, stride: SIZE; dest: ADDRESS; count: SIZE);
  984. CODE {SYSTEM.i386}
  985. MOV ESI, [ESP+src] ; src
  986. MOV EBX, [ESP+inc] ; inc
  987. MOV ECX, [ESP+stride] ; stride
  988. MOV EDI, [ESP+dest] ; dest
  989. loopL:
  990. MOV EAX, [ESP+count] ; count
  991. CMP EAX, 0 ;
  992. JLE endL ;
  993. SUB EAX, 4 ;
  994. MOV [ESP+count], EAX ;
  995. MOV EDX, ESI ;
  996. MOV EAX, [EDX] ;
  997. MOV [EDI], EAX ;
  998. ADD EDX, EBX ;
  999. MOV EAX, [EDX] ;
  1000. MOV [EDI+16], EAX ;
  1001. ADD EDX, EBX ;
  1002. MOV EAX, [EDX] ;
  1003. MOV [EDI+32], EAX ;
  1004. ADD EDX, EBX ;
  1005. MOV EAX, [EDX] ;
  1006. MOV [EDI+48], EAX ;
  1007. ADD EDX, EBX ;
  1008. MOV EAX, [EDX] ;
  1009. MOV [EDI+64], EAX ;
  1010. ADD ESI, ECX ;
  1011. ADD EDI, 4 ;
  1012. MOV EDX, ESI ;
  1013. MOV EAX, [EDX] ;
  1014. MOV [EDI], EAX ;
  1015. ADD EDX, EBX ;
  1016. MOV EAX, [EDX] ;
  1017. MOV [EDI+16], EAX ;
  1018. ADD EDX, EBX ;
  1019. MOV EAX, [EDX] ;
  1020. MOV [EDI+32], EAX ;
  1021. ADD EDX, EBX ;
  1022. MOV EAX, [EDX] ;
  1023. MOV [EDI+48], EAX ;
  1024. ADD EDX, EBX ;
  1025. MOV EAX, [EDX] ;
  1026. MOV [EDI+64], EAX ;
  1027. ADD ESI, ECX ;
  1028. ADD EDI, 4 ;
  1029. MOV EDX, ESI ;
  1030. MOV EAX, [EDX] ;
  1031. MOV [EDI], EAX ;
  1032. ADD EDX, EBX ;
  1033. MOV EAX, [EDX] ;
  1034. MOV [EDI+16], EAX ;
  1035. ADD EDX, EBX ;
  1036. MOV EAX, [EDX] ;
  1037. MOV [EDI+32], EAX ;
  1038. ADD EDX, EBX ;
  1039. MOV EAX, [EDX] ;
  1040. MOV [EDI+48], EAX ;
  1041. ADD EDX, EBX ;
  1042. MOV EAX, [EDX] ;
  1043. MOV [EDI+64], EAX ;
  1044. ADD ESI, ECX ;
  1045. ADD EDI, 4 ;
  1046. MOV EDX, ESI ;
  1047. MOV EAX, [EDX] ;
  1048. MOV [EDI], EAX ;
  1049. ADD EDX, EBX ;
  1050. MOV EAX, [EDX] ;
  1051. MOV [EDI+16], EAX ;
  1052. ADD EDX, EBX ;
  1053. MOV EAX, [EDX] ;
  1054. MOV [EDI+32], EAX ;
  1055. ADD EDX, EBX ;
  1056. MOV EAX, [EDX] ;
  1057. MOV [EDI+48], EAX ;
  1058. ADD EDX, EBX ;
  1059. MOV EAX, [EDX] ;
  1060. MOV [EDI+64], EAX ;
  1061. ADD ESI, ECX ;
  1062. ADD EDI, 4 ;
  1063. ADD EDI, 64 ;
  1064. JMP loopL ;
  1065. endL:
  1066. ADD ESP, 20 ;
  1067. END MovR5;
  1068. (* *)
  1069. PROCEDURE AddAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1070. CODE {SYSTEM.i386, SYSTEM.FPU}
  1071. MOV EAX, [EBP+len] ;
  1072. MOV EBX, [EBP+ladr] ;
  1073. MOV ECX, [EBP+radr] ;
  1074. MOV EDX, [EBP+dadr] ;
  1075. start:
  1076. CMP EAX, 0 ;
  1077. JLE endL ;
  1078. FLD QWORD [EBX] ;
  1079. ADD EBX, [EBP+linc] ;
  1080. FLD QWORD [ECX] ;
  1081. ADD ECX, [EBP+rinc] ;
  1082. FADDP ;
  1083. FSTP QWORD [EDX] ;
  1084. ADD EDX, [EBP+dinc] ;
  1085. DEC EAX ;
  1086. JMP start ;
  1087. endL:
  1088. FWAIT ;
  1089. END AddAXAXLoopA;
  1090. PROCEDURE AddARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1091. CODE {SYSTEM.i386, SYSTEM.FPU}
  1092. MOV EAX, [EBP+len] ;
  1093. MOV EBX, [EBP+ladr] ;
  1094. MOV ECX, [EBP+radr] ;
  1095. MOV EDX, [EBP+dadr] ;
  1096. start:
  1097. CMP EAX, 0 ;
  1098. JLE endL ;
  1099. FLD DWORD [EBX] ;
  1100. ADD EBX, [EBP+linc] ;
  1101. FLD DWORD [ECX] ;
  1102. ADD ECX, [EBP+rinc] ;
  1103. FADDP ;
  1104. FSTP DWORD [EDX] ;
  1105. ADD EDX, [EBP+dinc] ;
  1106. DEC EAX ;
  1107. JMP start ;
  1108. endL:
  1109. FWAIT ;
  1110. END AddARARLoopA;
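(* The SSE/SSE2 loops below share one structure: if any increment differs from the element size, the data are not contiguous and only the scalar tail ("single"/"singlepieces") is used; otherwise the operand addresses are compared for a common 16 byte alignment, one or more elements are peeled off to reach it, and a 64 byte unrolled main loop (MOVAPD/MOVAPS when aligned, MOVUPD/MOVUPS otherwise) runs before a 16 byte loop and the scalar tail. A minimal sketch of that dispatch in plain Oberon, for illustration only (the names are not part of this module):
	IF (linc # size) OR (rinc # size) OR (dinc # size) THEN ScalarTail
	ELSIF (ladr MOD 16 = radr MOD 16) & (ladr MOD 16 = dadr MOD 16) THEN
		PeelToAlignment; AlignedBlocks64; AlignedBlock16; ScalarTail
	ELSE UnalignedBlocks64; UnalignedBlock16; ScalarTail
	END
*)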
  1111. PROCEDURE AddAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1112. CODE {SYSTEM.i386, SYSTEM.SSE2}
  1113. MOV EAX, [EBP+len] ;
  1114. CMP EAX, 0 ;
  1115. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  1116. MOV EBX, [EBP+ladr] ;
  1117. MOV ECX, [EBP+radr] ;
  1118. MOV EDX, [EBP+dadr] ;
  1119. ; check IF data are contiguous IN memory
1120. CMP [EBP+linc], 8 ; check left FOR continuity
1121. JNE single ; not continuous- > simplest method
1122. CMP [EBP+rinc], 8 ; check right FOR continuity
1123. JNE single ; not continuous- > simplest method
1124. CMP [EBP+dinc], 8 ; check destination FOR continuity
  1125. JNE single ; not continuous- > simplest method
  1126. ; check FOR alignment
  1127. MOV ESI, EBX ;
  1128. AND ESI, 7 ; ladr MOD 8
  1129. CMP ESI, 0 ; = 0- > 64 Bit alignment
  1130. JNE unaligned ; not 64 bit aligned
  1131. MOV ESI, ECX ;
  1132. AND ESI, 7 ; radr MOD 8
  1133. CMP ESI, 0 ; = 0- > 64 Bit alignment
  1134. JNE unaligned ; not 64 bit aligned
  1135. MOV ESI, EDX ;
  1136. AND ESI, 7 ; dadr MOD 8
  1137. CMP ESI, 0 ; = 0- > 64 Bit alignment
  1138. JNE unaligned ; not 64 bit aligned
  1139. MOV ESI, EBX ;
  1140. AND ESI, 8 ; 16 byte alignment
  1141. MOV EDI, ECX ;
  1142. AND EDI, 8 ; 16 byte alignment
  1143. CMP ESI, EDI ;
  1144. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and radr
  1145. MOV EDI, EDX ;
  1146. AND EDI, 8 ; 16 byte alignment
  1147. CMP ESI, EDI ;
  1148. JNE unaligned ; different 16 byte = 128 bit alignment OF dadr and radr
  1149. CMP ESI, 8 ;
1150. JNE aligned ; ladr, radr and dadr already 128 bit aligned
1151. ; one single element processing TO achieve 128 bit alignment
  1152. MOVSD XMM1, [EBX] ;
  1153. MOVSD XMM0, [ECX] ;
  1154. ADDSD XMM0, XMM1 ;
  1155. MOVSD [EDX], XMM0 ;
  1156. ADD EBX, 8 ; now EBX IS 16 byte aligned
1157. ADD ECX, 8 ; now ECX IS 16 byte aligned ;
  1158. ADD EDX, 8 ; now EDX IS 16 byte aligned ;
  1159. DEC EAX ; one element has been processed
  1160. aligned:
  1161. aligned8:
  1162. CMP EAX, 8 ;
1163. JL aligned2 ; len < 8- > EXIT TO aligned2
  1164. MOVAPD XMM0, [EBX] ;
  1165. MOVAPD XMM1, [EBX+16] ;
  1166. MOVAPD XMM2, [EBX+32] ;
  1167. MOVAPD XMM3, [EBX+48] ;
  1168. ADD EBX, 64 ;
  1169. MOVAPD XMM4, [ECX] ;
  1170. MOVAPD XMM5, [ECX+16] ;
  1171. MOVAPD XMM6, [ECX+32] ;
  1172. MOVAPD XMM7, [ECX+48] ;
  1173. ADD ECX, 64 ;
  1174. ADDPD XMM0, XMM4 ;
  1175. ADDPD XMM1, XMM5 ;
  1176. ADDPD XMM2, XMM6 ;
  1177. ADDPD XMM3, XMM7 ;
  1178. MOVAPD [EDX], XMM0 ;
  1179. MOVAPD [EDX+16], XMM1 ;
  1180. MOVAPD [EDX+32], XMM2 ;
  1181. MOVAPD [EDX+48], XMM3 ;
  1182. ADD EDX, 64 ;
  1183. SUB EAX, 8 ;
  1184. JMP aligned8 ;
  1185. ; LOOP FOR 2 pieces aligned
  1186. aligned2: ;
  1187. CMP EAX, 2 ;
  1188. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1189. MOVAPD XMM0, [EBX] ;
  1190. ADD EBX, 16 ;
  1191. MOVAPD XMM1, [ECX] ;
  1192. ADD ECX, 16 ;
  1193. ADDPD XMM0, XMM1 ;
  1194. MOVAPD [EDX], XMM0 ;
  1195. ADD EDX, 16 ;
  1196. SUB EAX, 2 ;
  1197. JMP aligned2 ;
  1198. ; LOOP FOR 8 unaligned pieces(14 pieces not better!)
  1199. unaligned: ;
  1200. unaligned8: ;
  1201. CMP EAX, 8 ;
1202. JL unaligned2 ; len < 8- > EXIT TO unaligned2
  1203. MOVUPD XMM0, [EBX] ;
  1204. MOVUPD XMM1, [EBX+16] ;
  1205. MOVUPD XMM2, [EBX+32] ;
  1206. MOVUPD XMM3, [EBX+48] ;
  1207. ADD EBX, 64 ;
  1208. MOVUPD XMM4, [ECX] ;
  1209. MOVUPD XMM5, [ECX+16] ;
  1210. MOVUPD XMM6, [ECX+32] ;
  1211. MOVUPD XMM7, [ECX+48] ;
  1212. ADD ECX, 64 ;
  1213. ADDPD XMM0, XMM4 ;
  1214. ADDPD XMM1, XMM5 ;
  1215. ADDPD XMM2, XMM6 ;
  1216. ADDPD XMM3, XMM7 ;
  1217. MOVUPD [EDX], XMM0 ;
  1218. MOVUPD [EDX+16], XMM1 ;
  1219. MOVUPD [EDX+32], XMM2 ;
  1220. MOVUPD [EDX+48], XMM3 ;
  1221. ADD EDX, 64 ;
  1222. SUB EAX, 8 ;
  1223. JMP unaligned8 ;
1224. ; LOOP FOR 2 pieces unaligned
  1225. unaligned2: ;
  1226. CMP EAX, 2 ;
  1227. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1228. MOVUPD XMM0, [EBX] ;
  1229. ADD EBX, 16 ;
  1230. MOVUPD XMM1, [ECX] ;
  1231. ADD ECX, 16 ;
  1232. ADDPD XMM0, XMM1 ;
  1233. MOVUPD [EDX], XMM0 ;
  1234. ADD EDX, 16 ;
  1235. SUB EAX, 2 ;
  1236. JMP unaligned2 ;
  1237. ; one piece left OR non-contiguous data
  1238. single:
  1239. singlepieces: ;
  1240. CMP EAX, 0 ;
  1241. JLE endL ; len <= 0- > EXIT
  1242. MOVSD XMM0, [EBX]
  1243. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  1244. MOVSD XMM1, [ECX]
1245. ADD ECX, [EBP+rinc] ; INC(radr, incr)
1246. ADDSD XMM0, XMM1 ;
1247. MOVSD [EDX], XMM0
1248. ADD EDX, [EBP+dinc] ; INC(dadr, incd)
  1249. DEC EAX ; DEC(len)
  1250. JMP singlepieces ;
  1251. endL:
  1252. END AddAXAXLoopSSE;
  1253. PROCEDURE AddARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1254. CODE {SYSTEM.i386, SYSTEM.SSE2}
  1255. MOV EAX, [EBP+len] ;
  1256. CMP EAX, 0 ;
  1257. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  1258. MOV EBX, [EBP+ladr] ;
  1259. MOV ECX, [EBP+radr] ;
  1260. MOV EDX, [EBP+dadr] ;
  1261. ; check IF data are contiguous IN memory
1262. CMP [EBP+linc], 4 ; check left FOR continuity
1263. JNE single ; not continuous- > simplest method
1264. CMP [EBP+rinc], 4 ; check right FOR continuity
1265. JNE single ; not continuous- > simplest method
1266. CMP [EBP+dinc], 4 ; check destination FOR continuity
  1267. JNE single ; not continuous- > simplest method
  1268. ; check FOR alignment
  1269. MOV ESI, EBX ;
  1270. AND ESI, 3 ; ladr MOD 4
  1271. CMP ESI, 0 ; = 0- > 32 Bit alignment
  1272. JNE unaligned ; not 32 bit aligned
  1273. MOV ESI, ECX ;
  1274. AND ESI, 3 ; radr MOD 4
  1275. CMP ESI, 0 ; = 0- > 32 Bit alignment
  1276. JNE unaligned ; not 32 bit aligned
  1277. MOV ESI, EDX ;
  1278. AND ESI, 3 ; dadr MOD 4
  1279. CMP ESI, 0 ; = 0- > 32 Bit alignment
  1280. JNE unaligned ; not 32 bit aligned
  1281. MOV ESI, EBX ;
  1282. AND ESI, 8+4 ; 16 byte alignment?
  1283. MOV EDI, ECX ;
  1284. AND EDI, 8+4 ; 16 byte alignment?
  1285. CMP ESI, EDI ;
  1286. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and radr
  1287. MOV EDI, EDX ;
  1288. AND EDI, 8+4 ; 16 byte alignment
  1289. CMP ESI, EDI ;
  1290. JNE unaligned ; different 16 byte = 128 bit alignment OF dadr and radr
  1291. CMP ESI, 0 ;
  1292. JE aligned ; already aligned
  1293. align:
1294. ; one single element processing UNTIL 128 bit alignment achieved
  1295. MOVSS XMM1, [EBX] ;
  1296. MOVSS XMM0, [ECX] ;
  1297. ADDSS XMM0, XMM1 ;
  1298. MOVSS [EDX], XMM0 ;
  1299. ADD EBX, 4 ;
  1300. ADD ECX, 4 ;
  1301. ADD EDX, 4 ;
  1302. DEC EAX ; one element has been processed ;
  1303. CMP EAX, 0 ; all elements already processed?
  1304. JLE single ;
  1305. MOV ESI, EBX ;
  1306. AND ESI, 8+4 ;
  1307. CMP ESI, 0 ;
  1308. JNE align ;
  1309. aligned:
  1310. aligned16:
  1311. CMP EAX, 16 ;
1312. JL aligned4 ; len < 16- > EXIT TO aligned4
  1313. MOVAPS XMM0, [EBX] ;
  1314. MOVAPS XMM1, [EBX+16] ;
  1315. MOVAPS XMM2, [EBX+32] ;
  1316. MOVAPS XMM3, [EBX+48] ;
  1317. ADD EBX, 64 ;
  1318. MOVAPS XMM4, [ECX] ;
  1319. MOVAPS XMM5, [ECX+16] ;
  1320. MOVAPS XMM6, [ECX+32] ;
  1321. MOVAPS XMM7, [ECX+48] ;
  1322. ADD ECX, 64 ;
  1323. ADDPS XMM0, XMM4 ;
  1324. ADDPS XMM1, XMM5 ;
  1325. ADDPS XMM2, XMM6 ;
  1326. ADDPS XMM3, XMM7 ;
  1327. MOVAPS [EDX], XMM0 ;
  1328. MOVAPS [EDX+16], XMM1 ;
  1329. MOVAPS [EDX+32], XMM2 ;
  1330. MOVAPS [EDX+48], XMM3 ;
  1331. ADD EDX, 64 ;
  1332. SUB EAX, 16 ;
  1333. JMP aligned16 ;
1334. ; LOOP FOR 4 pieces aligned
1335. aligned4: ;
1336. CMP EAX, 4 ;
1337. JL singlepieces ; len < 4- > EXIT TO singlepieces
  1338. MOVAPS XMM0, [EBX] ;
  1339. ADD EBX, 16 ;
  1340. MOVAPS XMM1, [ECX] ;
  1341. ADD ECX, 16 ;
  1342. ADDPS XMM0, XMM1 ;
  1343. MOVAPS [EDX], XMM0 ;
  1344. ADD EDX, 16 ;
  1345. SUB EAX, 4 ;
  1346. JMP aligned4 ;
1347. ; LOOP FOR 16 unaligned pieces
1348. unaligned: ;
1349. unaligned16: ;
1350. CMP EAX, 16 ;
1351. JL unaligned4 ; len < 16- > EXIT TO unaligned4
  1352. MOVUPS XMM0, [EBX] ;
  1353. MOVUPS XMM1, [EBX+16] ;
  1354. MOVUPS XMM2, [EBX+32] ;
  1355. MOVUPS XMM3, [EBX+48] ;
  1356. ADD EBX, 64 ;
  1357. MOVUPS XMM4, [ECX] ;
  1358. MOVUPS XMM5, [ECX+16] ;
  1359. MOVUPS XMM6, [ECX+32] ;
  1360. MOVUPS XMM7, [ECX+48] ;
  1361. ADD ECX, 64 ;
  1362. ADDPS XMM0, XMM4 ;
  1363. ADDPS XMM1, XMM5 ;
  1364. ADDPS XMM2, XMM6 ;
  1365. ADDPS XMM3, XMM7 ;
  1366. MOVUPS [EDX], XMM0 ;
  1367. MOVUPS [EDX+16], XMM1 ;
  1368. MOVUPS [EDX+32], XMM2 ;
  1369. MOVUPS [EDX+48], XMM3 ;
  1370. ADD EDX, 64 ;
  1371. SUB EAX, 16 ;
  1372. JMP unaligned16 ;
1373. ; LOOP FOR 4 pieces unaligned
1374. unaligned4: ;
1375. CMP EAX, 4 ;
1376. JL singlepieces ; len < 4- > EXIT TO singlepieces
  1377. MOVUPS XMM0, [EBX] ;
  1378. ADD EBX, 16 ;
  1379. MOVUPS XMM1, [ECX] ;
  1380. ADD ECX, 16 ;
  1381. ADDPS XMM0, XMM1 ;
  1382. MOVUPS [EDX], XMM0 ;
  1383. ADD EDX, 16 ;
  1384. SUB EAX, 4 ;
  1385. JMP unaligned4 ;
  1386. ; one piece left OR non-contiguous data
  1387. single:
  1388. singlepieces: ;
  1389. CMP EAX, 0 ;
  1390. JLE endL ; len <= 0- > EXIT
  1391. MOVSS XMM0, [EBX]
  1392. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  1393. MOVSS XMM1, [ECX]
1394. ADD ECX, [EBP+rinc] ; INC(radr, incr)
1395. ADDSS XMM0, XMM1 ;
1396. MOVSS [EDX], XMM0
1397. ADD EDX, [EBP+dinc] ; INC(dadr, incd)
  1398. DEC EAX ; DEC(len)
  1399. JMP singlepieces ;
  1400. endL:
  1401. END AddARARLoopSSE;
  1402. (* *)
  1403. PROCEDURE SubAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1404. CODE {SYSTEM.i386, SYSTEM.FPU}
  1405. MOV EAX, [EBP+len] ;
  1406. MOV EBX, [EBP+ladr] ;
  1407. MOV ECX, [EBP+radr] ;
  1408. MOV EDX, [EBP+dadr] ;
  1409. start:
  1410. CMP EAX, 0 ;
  1411. JLE endL ;
  1412. FLD QWORD [EBX] ;
  1413. ADD EBX, [EBP+linc] ;
  1414. FLD QWORD [ECX] ;
  1415. ADD ECX, [EBP+rinc] ;
  1416. FSUBP ;
  1417. FSTP QWORD [EDX] ;
  1418. ADD EDX, [EBP+dinc] ;
  1419. DEC EAX ;
  1420. JMP start ;
  1421. endL:
  1422. FWAIT ;
  1423. END SubAXAXLoopA;
  1424. PROCEDURE SubARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1425. CODE {SYSTEM.i386, SYSTEM.FPU}
  1426. MOV EAX, [EBP+len] ;
  1427. MOV EBX, [EBP+ladr] ;
  1428. MOV ECX, [EBP+radr] ;
  1429. MOV EDX, [EBP+dadr] ;
  1430. start:
  1431. CMP EAX, 0 ;
  1432. JLE endL ;
  1433. FLD DWORD [EBX] ;
  1434. ADD EBX, [EBP+linc] ;
  1435. FLD DWORD [ECX] ;
  1436. ADD ECX, [EBP+rinc] ;
  1437. FSUBP ;
  1438. FSTP DWORD [EDX] ;
  1439. ADD EDX, [EBP+dinc] ;
  1440. DEC EAX ;
  1441. JMP start ;
  1442. endL:
  1443. FWAIT ;
  1444. END SubARARLoopA;
  1445. PROCEDURE SubAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1446. CODE {SYSTEM.i386, SYSTEM.SSE2}
  1447. MOV EAX, [EBP+len] ;
  1448. CMP EAX, 0 ;
  1449. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  1450. MOV EBX, [EBP+ladr] ;
  1451. MOV ECX, [EBP+radr] ;
  1452. MOV EDX, [EBP+dadr] ;
  1453. ; check IF data are contiguous IN memory
1454. CMP [EBP+linc], 8 ; check left FOR continuity
1455. JNE single ; not continuous- > simplest method
1456. CMP [EBP+rinc], 8 ; check right FOR continuity
1457. JNE single ; not continuous- > simplest method
1458. CMP [EBP+dinc], 8 ; check destination FOR continuity
  1459. JNE single ; not continuous- > simplest method
  1460. ; check FOR alignment
  1461. MOV ESI, EBX ;
  1462. AND ESI, 7 ; ladr MOD 8
  1463. CMP ESI, 0 ; = 0- > 64 Bit alignment
  1464. JNE unaligned ; not 64 bit aligned
  1465. MOV ESI, ECX ;
  1466. AND ESI, 7 ; radr MOD 8
  1467. CMP ESI, 0 ; = 0- > 64 Bit alignment
  1468. JNE unaligned ; not 64 bit aligned
  1469. MOV ESI, EDX ;
  1470. AND ESI, 7 ; dadr MOD 8
  1471. CMP ESI, 0 ; = 0- > 64 Bit alignment
  1472. JNE unaligned ; not 64 bit aligned
  1473. MOV ESI, EBX ;
  1474. AND ESI, 8 ; 16 byte alignment
  1475. MOV EDI, ECX ;
  1476. AND EDI, 8 ; 16 byte alignment
  1477. CMP ESI, EDI ;
  1478. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and radr
  1479. MOV EDI, EDX ;
  1480. AND EDI, 8 ; 16 byte alignment
  1481. CMP ESI, EDI ;
  1482. JNE unaligned ; different 16 byte = 128 bit alignment OF dadr and radr
  1483. CMP ESI, 8 ;
1484. JNE aligned ; ladr, radr and dadr already 128 bit aligned
1485. ; one single element processing TO achieve 128 bit alignment
1486. MOVSD XMM0, [EBX] ;
1487. MOVSD XMM1, [ECX] ;
1488. SUBSD XMM0, XMM1 ; left - right, consistent with the loops below
  1489. MOVSD [EDX], XMM0 ;
  1490. ADD EBX, 8 ; now EBX IS 16 byte aligned
1491. ADD ECX, 8 ; now ECX IS 16 byte aligned ;
  1492. ADD EDX, 8 ; now EDX IS 16 byte aligned ;
  1493. DEC EAX ; one element has been processed
  1494. aligned:
  1495. aligned8:
  1496. CMP EAX, 8 ;
1497. JL aligned2 ; len < 8- > EXIT TO aligned2
  1498. MOVAPD XMM0, [EBX] ;
  1499. MOVAPD XMM1, [EBX+16] ;
  1500. MOVAPD XMM2, [EBX+32] ;
  1501. MOVAPD XMM3, [EBX+48] ;
  1502. ADD EBX, 64 ;
  1503. MOVAPD XMM4, [ECX] ;
  1504. MOVAPD XMM5, [ECX+16] ;
  1505. MOVAPD XMM6, [ECX+32] ;
  1506. MOVAPD XMM7, [ECX+48] ;
  1507. ADD ECX, 64 ;
  1508. SUBPD XMM0, XMM4 ;
  1509. SUBPD XMM1, XMM5 ;
  1510. SUBPD XMM2, XMM6 ;
  1511. SUBPD XMM3, XMM7 ;
  1512. MOVAPD [EDX], XMM0 ;
  1513. MOVAPD [EDX+16], XMM1 ;
  1514. MOVAPD [EDX+32], XMM2 ;
  1515. MOVAPD [EDX+48], XMM3 ;
  1516. ADD EDX, 64 ;
  1517. SUB EAX, 8 ;
  1518. JMP aligned8 ;
  1519. ; LOOP FOR 2 pieces aligned
  1520. aligned2: ;
  1521. CMP EAX, 2 ;
  1522. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1523. MOVAPD XMM0, [EBX] ;
  1524. ADD EBX, 16 ;
  1525. MOVAPD XMM1, [ECX] ;
  1526. ADD ECX, 16 ;
  1527. SUBPD XMM0, XMM1 ;
  1528. MOVAPD [EDX], XMM0 ;
  1529. ADD EDX, 16 ;
  1530. SUB EAX, 2 ;
  1531. JMP aligned2 ;
  1532. ; LOOP FOR 8 unaligned pieces(14 pieces not better!)
  1533. unaligned: ;
  1534. unaligned8: ;
  1535. CMP EAX, 8 ;
1536. JL unaligned2 ; len < 8- > EXIT TO unaligned2
  1537. MOVUPD XMM0, [EBX] ;
  1538. MOVUPD XMM1, [EBX+16] ;
  1539. MOVUPD XMM2, [EBX+32] ;
  1540. MOVUPD XMM3, [EBX+48] ;
  1541. ADD EBX, 64 ;
  1542. MOVUPD XMM4, [ECX] ;
  1543. MOVUPD XMM5, [ECX+16] ;
  1544. MOVUPD XMM6, [ECX+32] ;
  1545. MOVUPD XMM7, [ECX+48] ;
  1546. ADD ECX, 64 ;
  1547. SUBPD XMM0, XMM4 ;
  1548. SUBPD XMM1, XMM5 ;
  1549. SUBPD XMM2, XMM6 ;
  1550. SUBPD XMM3, XMM7 ;
  1551. MOVUPD [EDX], XMM0 ;
  1552. MOVUPD [EDX+16], XMM1 ;
  1553. MOVUPD [EDX+32], XMM2 ;
  1554. MOVUPD [EDX+48], XMM3 ;
  1555. ADD EDX, 64 ;
  1556. SUB EAX, 8 ;
  1557. JMP unaligned8 ;
1558. ; LOOP FOR 2 pieces unaligned
  1559. unaligned2: ;
  1560. CMP EAX, 2 ;
  1561. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1562. MOVUPD XMM0, [EBX] ;
  1563. ADD EBX, 16 ;
  1564. MOVUPD XMM1, [ECX] ;
  1565. ADD ECX, 16 ;
  1566. SUBPD XMM0, XMM1 ;
  1567. MOVUPD [EDX], XMM0 ;
  1568. ADD EDX, 16 ;
  1569. SUB EAX, 2 ;
  1570. JMP unaligned2 ;
  1571. ; one piece left OR non-contiguous data
  1572. single:
  1573. singlepieces: ;
  1574. CMP EAX, 0 ;
  1575. JLE endL ; len <= 0- > EXIT
  1576. MOVSD XMM0, [EBX]
  1577. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  1578. MOVSD XMM1, [ECX]
1579. ADD ECX, [EBP+rinc] ; INC(radr, incr)
1580. SUBSD XMM0, XMM1 ;
1581. MOVSD [EDX], XMM0
1582. ADD EDX, [EBP+dinc] ; INC(dadr, incd)
  1583. DEC EAX ; DEC(len)
  1584. JMP singlepieces ;
  1585. endL:
  1586. END SubAXAXLoopSSE;
  1587. PROCEDURE SubARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1588. CODE {SYSTEM.i386, SYSTEM.SSE2}
  1589. MOV EAX, [EBP+len] ;
  1590. CMP EAX, 0 ;
  1591. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  1592. MOV EBX, [EBP+ladr] ;
  1593. MOV ECX, [EBP+radr] ;
  1594. MOV EDX, [EBP+dadr] ;
  1595. ; check IF data are contiguous IN memory
1596. CMP [EBP+linc], 4 ; check left FOR continuity
1597. JNE single ; not continuous- > simplest method
1598. CMP [EBP+rinc], 4 ; check right FOR continuity
1599. JNE single ; not continuous- > simplest method
1600. CMP [EBP+dinc], 4 ; check destination FOR continuity
  1601. JNE single ; not continuous- > simplest method
  1602. ; check FOR alignment
  1603. MOV ESI, EBX ;
  1604. AND ESI, 3 ; ladr MOD 4
  1605. CMP ESI, 0 ; = 0- > 32 Bit alignment
  1606. JNE unaligned ; not 32 bit aligned
  1607. MOV ESI, ECX ;
  1608. AND ESI, 3 ; radr MOD 4
  1609. CMP ESI, 0 ; = 0- > 32 Bit alignment
  1610. JNE unaligned ; not 32 bit aligned
  1611. MOV ESI, EDX ;
  1612. AND ESI, 3 ; dadr MOD 4
  1613. CMP ESI, 0 ; = 0- > 32 Bit alignment
  1614. JNE unaligned ; not 32 bit aligned
  1615. MOV ESI, EBX ;
  1616. AND ESI, 8+4 ; 16 byte alignment?
  1617. MOV EDI, ECX ;
  1618. AND EDI, 8+4 ; 16 byte alignment?
  1619. CMP ESI, EDI ;
  1620. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and radr
  1621. MOV EDI, EDX ;
  1622. AND EDI, 8+4 ; 16 byte alignment
  1623. CMP ESI, EDI ;
  1624. JNE unaligned ; different 16 byte = 128 bit alignment OF dadr and radr
  1625. CMP ESI, 0 ;
  1626. JE aligned ; already aligned
  1627. align:
1628. ; one single element processing UNTIL 128 bit alignment achieved
1629. MOVSS XMM0, [EBX] ;
1630. MOVSS XMM1, [ECX] ;
1631. SUBSS XMM0, XMM1 ; left - right, consistent with the loops below
  1632. MOVSS [EDX], XMM0 ;
  1633. ADD EBX, 4 ;
  1634. ADD ECX, 4 ;
  1635. ADD EDX, 4 ;
  1636. DEC EAX ; one element has been processed ;
  1637. CMP EAX, 0 ; all elements already processed?
  1638. JLE single ;
  1639. MOV ESI, EBX ;
  1640. AND ESI, 8+4 ;
  1641. CMP ESI, 0 ;
  1642. JNE align ;
  1643. aligned:
  1644. aligned16:
  1645. CMP EAX, 16 ;
1646. JL aligned4 ; len < 16- > EXIT TO aligned4
  1647. MOVAPS XMM0, [EBX] ;
  1648. MOVAPS XMM1, [EBX+16] ;
  1649. MOVAPS XMM2, [EBX+32] ;
  1650. MOVAPS XMM3, [EBX+48] ;
  1651. ADD EBX, 64 ;
  1652. MOVAPS XMM4, [ECX] ;
  1653. MOVAPS XMM5, [ECX+16] ;
  1654. MOVAPS XMM6, [ECX+32] ;
  1655. MOVAPS XMM7, [ECX+48] ;
  1656. ADD ECX, 64 ;
  1657. SUBPS XMM0, XMM4 ;
  1658. SUBPS XMM1, XMM5 ;
  1659. SUBPS XMM2, XMM6 ;
  1660. SUBPS XMM3, XMM7 ;
  1661. MOVAPS [EDX], XMM0 ;
  1662. MOVAPS [EDX+16], XMM1 ;
  1663. MOVAPS [EDX+32], XMM2 ;
  1664. MOVAPS [EDX+48], XMM3 ;
  1665. ADD EDX, 64 ;
  1666. SUB EAX, 16 ;
  1667. JMP aligned16 ;
1668. ; LOOP FOR 4 pieces aligned
1669. aligned4: ;
1670. CMP EAX, 4 ;
1671. JL singlepieces ; len < 4- > EXIT TO singlepieces
  1672. MOVAPS XMM0, [EBX] ;
  1673. ADD EBX, 16 ;
  1674. MOVAPS XMM1, [ECX] ;
  1675. ADD ECX, 16 ;
  1676. SUBPS XMM0, XMM1 ;
  1677. MOVAPS [EDX], XMM0 ;
  1678. ADD EDX, 16 ;
  1679. SUB EAX, 4 ;
  1680. JMP aligned4 ;
1681. ; LOOP FOR 16 unaligned pieces
1682. unaligned: ;
1683. unaligned16: ;
1684. CMP EAX, 16 ;
1685. JL unaligned4 ; len < 16- > EXIT TO unaligned4
  1686. MOVUPS XMM0, [EBX] ;
  1687. MOVUPS XMM1, [EBX+16] ;
  1688. MOVUPS XMM2, [EBX+32] ;
  1689. MOVUPS XMM3, [EBX+48] ;
  1690. ADD EBX, 64 ;
  1691. MOVUPS XMM4, [ECX] ;
  1692. MOVUPS XMM5, [ECX+16] ;
  1693. MOVUPS XMM6, [ECX+32] ;
  1694. MOVUPS XMM7, [ECX+48] ;
  1695. ADD ECX, 64 ;
  1696. SUBPS XMM0, XMM4 ;
  1697. SUBPS XMM1, XMM5 ;
  1698. SUBPS XMM2, XMM6 ;
  1699. SUBPS XMM3, XMM7 ;
  1700. MOVUPS [EDX], XMM0 ;
  1701. MOVUPS [EDX+16], XMM1 ;
  1702. MOVUPS [EDX+32], XMM2 ;
  1703. MOVUPS [EDX+48], XMM3 ;
  1704. ADD EDX, 64 ;
  1705. SUB EAX, 16 ;
  1706. JMP unaligned16 ;
1707. ; LOOP FOR 4 pieces unaligned
1708. unaligned4: ;
1709. CMP EAX, 4 ;
1710. JL singlepieces ; len < 4- > EXIT TO singlepieces
  1711. MOVUPS XMM0, [EBX] ;
  1712. ADD EBX, 16 ;
  1713. MOVUPS XMM1, [ECX] ;
  1714. ADD ECX, 16 ;
  1715. SUBPS XMM0, XMM1 ;
  1716. MOVUPS [EDX], XMM0 ;
  1717. ADD EDX, 16 ;
  1718. SUB EAX, 4 ;
  1719. JMP unaligned4 ;
  1720. ; one piece left OR non-contiguous data
  1721. single:
  1722. singlepieces: ;
  1723. CMP EAX, 0 ;
  1724. JLE endL ; len <= 0- > EXIT
  1725. MOVSS XMM0, [EBX]
  1726. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  1727. MOVSS XMM1, [ECX]
1728. ADD ECX, [EBP+rinc] ; INC(radr, incr)
1729. SUBSS XMM0, XMM1 ;
1730. MOVSS [EDX], XMM0
1731. ADD EDX, [EBP+dinc] ; INC(dadr, incd)
  1732. DEC EAX ; DEC(len)
  1733. JMP singlepieces ;
  1734. endL:
  1735. END SubARARLoopSSE;
  1736. PROCEDURE SPAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  1737. CODE {SYSTEM.i386, SYSTEM.FPU}
  1738. MOV EAX, [EBP+len] ; eax := len
  1739. MOV EBX, [EBP+ladr] ; ebx := ladr
  1740. MOV ECX, [EBP+radr] ; ecx := radr
  1741. MOV EDX, [EBP+dadr] ; edx := dadr
  1742. FLD QWORD [EDX] ; S.GET(dadr, x)
  1743. start:
  1744. CMP EAX, 0 ; WHILE len > 0 DO
  1745. JLE endL
  1746. FLD QWORD [EBX] ; S.GET(ladr, x)
  1747. ADD EBX, [EBP+linc] ; INC(ladr, incl)
1748. FLD QWORD [ECX] ; S.GET(radr, y)
  1749. FMULP ; x := x*y
  1750. ADD ECX, [EBP+rinc] ; INC(radr, incr)
  1751. FADDP ; z := z+x
  1752. DEC EAX ; DEC(len)
  1753. JMP start ;
  1754. endL:
  1755. FSTP QWORD [EDX] ; S.PUT(dadr, x)
  1756. FWAIT ;
  1757. END SPAXAXLoopA;
  1758. PROCEDURE SPARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  1759. CODE {SYSTEM.i386, SYSTEM.FPU}
  1760. MOV EAX, [EBP+len] ; eax := len
  1761. MOV EBX, [EBP+ladr] ; ebx := ladr
  1762. MOV ECX, [EBP+radr] ; ecx := radr
  1763. MOV EDX, [EBP+dadr] ; edx := dadr
  1764. FLD DWORD [EDX] ; S.GET(dadr, x)
  1765. start:
  1766. CMP EAX, 0 ; WHILE len > 0 DO
  1767. JLE endL
  1768. FLD DWORD [EBX] ; S.GET(ladr, x)
  1769. ADD EBX, [EBP+linc] ; INC(ladr, incl)
1770. FLD DWORD [ECX] ; S.GET(radr, y)
  1771. FMULP ; x := x*y
  1772. ADD ECX, [EBP+rinc] ; INC(radr, incr)
  1773. FADDP ; z := z+x
  1774. DEC EAX ; DEC(len)
  1775. JMP start ;
  1776. endL:
  1777. FSTP DWORD [EDX] ; S.PUT(dadr, x)
  1778. FWAIT ;
  1779. END SPARARLoopA;
  1780. (* sse version of scalar product *)
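(* SPAXAXLoopSSE accumulates the scalar product of two LONGREAL vectors onto the value already stored at dadr, roughly dadr^ := dadr^ + SUM( ladr[i]*radr[i], i = 0..len-1 ); the partial sums are kept in both halves of XMM0 and folded together at "horizontaladd". *)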
  1781. PROCEDURE SPAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  1782. CODE {SYSTEM.i386, SYSTEM.SSE2}
  1783. ; register initialization
1784. MOV EAX, [EBP+len] ; EAX reserved FOR length
  1785. CMP EAX, 0 ;
  1786. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  1787. MOV EBX, [EBP+ladr] ; EBX reserved FOR ladr
  1788. MOV ECX, [EBP+radr] ; ECX reserved FOR radr
  1789. MOV EDX, [EBP+dadr] ; EDX reserved FOR dadr
  1790. XORPD XMM0, XMM0 ;
  1791. MOVSD XMM0, [EDX] ; destination- > low bytes OF xmm0
1792. CMP [EBP+linc], 8 ; check left FOR continuity
1793. JNE single ; not continuous- > simplest method
1794. CMP [EBP+rinc], 8 ; check right FOR continuity
1795. JNE single ; not continuous- > simplest method
1796. ; check FOR alignment
1797. MOV ESI, EBX ;
1798. AND ESI, 7 ; ladr MOD 8
1799. CMP ESI, 0 ; = 0- > 64 Bit alignment
  1800. JNE unaligned ; not 64 bit aligned
  1801. MOV ESI, ECX ;
  1802. AND ESI, 7 ; radr MOD 8
  1803. CMP ESI, 0 ; = 0- > 64 Bit alignment
  1804. JNE unaligned ; not 64 bit aligned
  1805. MOV ESI, EBX ;
  1806. AND ESI, 8 ; 16 byte alignment
  1807. MOV EDI, ECX ;
  1808. AND EDI, 8 ; 16 byte alignment
  1809. CMP ESI, EDI ;
1810. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and radr
1811. CMP ESI, 8 ;
1812. JNE aligned ; ladr and radr already 128 bit aligned
1813. ; one single element processing TO achieve 128 bit alignment
  1814. MOVSD XMM1, [EBX] ;
  1815. MOVSD XMM2, [ECX] ;
  1816. MULSD XMM1, XMM2 ;
  1817. ADDSD XMM0, XMM1 ;
  1818. ADD EBX, 8 ; now EBX IS 16 byte aligned
1819. ADD ECX, 8 ; now ECX IS 16 byte aligned ;
1820. DEC EAX ; one element has been processed
1821. ; LOOP FOR 6 pieces aligned
1822. aligned:
1823. aligned6:
1824. CMP EAX, 6 ;
1825. JL aligned2 ; len < 6- > EXIT TO aligned2
  1826. MOVAPD XMM1, [EBX] ;
  1827. MOVAPD XMM2, [EBX+16] ;
  1828. MOVAPD XMM3, [EBX+32] ;
  1829. MOVAPD XMM4, [ECX] ;
  1830. MOVAPD XMM5, [ECX+16] ;
  1831. MOVAPD XMM6, [ECX+32] ;
  1832. MULPD XMM1, XMM4 ;
  1833. ADDPD XMM0, XMM1 ;
  1834. MULPD XMM2, XMM5 ;
  1835. ADDPD XMM0, XMM2 ;
  1836. MULPD XMM3, XMM6 ;
  1837. ADDPD XMM0, XMM3 ;
  1838. ADD EBX, 48 ;
  1839. ADD ECX, 48 ;
  1840. SUB EAX, 6 ;
  1841. JMP aligned6 ;
  1842. ; LOOP FOR 2 pieces aligned
  1843. aligned2:
  1844. CMP EAX, 2 ;
1845. JL horizontaladd ; len < 2- > EXIT TO horizontaladd
  1846. MOVAPD XMM1, [EBX] ;
  1847. MOVAPD XMM2, [ECX] ;
  1848. MULPD XMM1, XMM2 ;
  1849. ADDPD XMM0, XMM1 ;
  1850. ADD EBX, 16 ;
  1851. ADD ECX, 16 ;
  1852. SUB EAX, 2 ;
  1853. JMP aligned2 ;
  1854. unaligned:
  1855. unaligned6:
  1856. CMP EAX, 6 ;
1857. JL unaligned2 ; len < 6- > EXIT TO unaligned2
  1858. MOVUPD XMM1, [EBX] ;
  1859. MOVUPD XMM2, [EBX+16] ;
  1860. MOVUPD XMM3, [EBX+32] ;
  1861. MOVUPD XMM4, [ECX] ;
  1862. MOVUPD XMM5, [ECX+16] ;
  1863. MOVUPD XMM6, [ECX+32] ;
  1864. MULPD XMM1, XMM4 ;
  1865. ADDPD XMM0, XMM1 ;
  1866. MULPD XMM2, XMM5 ;
  1867. ADDPD XMM0, XMM2 ;
  1868. MULPD XMM3, XMM6 ;
  1869. ADDPD XMM0, XMM3 ;
  1870. ADD EBX, 48 ;
  1871. ADD ECX, 48 ;
  1872. SUB EAX, 6 ;
  1873. JMP unaligned6 ;
1874. ; LOOP FOR 2 pieces unaligned
1875. unaligned2:
1876. CMP EAX, 2 ;
1877. JL horizontaladd ; len < 2- > EXIT TO horizontaladd
  1878. MOVUPD XMM1, [EBX] ;
  1879. MOVUPD XMM2, [ECX] ;
  1880. MULPD XMM1, XMM2 ;
  1881. ADDPD XMM0, XMM1 ;
  1882. ADD EBX, 16 ;
  1883. ADD ECX, 16 ;
  1884. SUB EAX, 2 ;
  1885. JMP unaligned2 ;
  1886. horizontaladd: ;
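; XMM0 holds two partial sums (low, high); SHUFPD copies them swapped INTO XMM1,
; so the following ADDPD leaves the complete sum IN the low half, which "store" writes with MOVSD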
  1887. MOVAPD XMM1, XMM0 ;
  1888. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  1889. ADDPD XMM0, XMM1 ;
  1890. JMP singlepieces ;
  1891. single:
  1892. singlepieces: ;
  1893. CMP EAX, 0 ;
  1894. JLE store ; len <= 0- > EXIT
  1895. MOVSD XMM1, [EBX]
  1896. MOVSD XMM2, [ECX]
  1897. MULSD XMM1, XMM2
  1898. ADDSD XMM0, XMM1
  1899. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  1900. ADD ECX, [EBP+rinc] ; INC(radr, incr)
  1901. DEC EAX ; DEC(len)
  1902. JMP singlepieces ;
  1903. store:
  1904. MOVSD [EDX], XMM0 ;
  1905. endL:
  1906. END SPAXAXLoopSSE;
  1907. (* sse version of scalar product *)
  1908. PROCEDURE SPARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  1909. CODE {SYSTEM.i386, SYSTEM.SSE}
  1910. ; register initialization
1911. MOV EAX, [EBP+len] ; EAX reserved FOR length
  1912. CMP EAX, 0 ;
  1913. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  1914. MOV EBX, [EBP+ladr] ; EBX reserved FOR ladr
  1915. MOV ECX, [EBP+radr] ; ECX reserved FOR radr
  1916. MOV EDX, [EBP+dadr] ; EDX reserved FOR dadr
  1917. XORPS XMM0, XMM0 ;
  1918. MOVSS XMM0, [EDX] ; destination- > low bytes OF xmm0
1919. CMP [EBP+linc], 4 ; check left FOR continuity
1920. JNE single ; not continuous- > simplest method
1921. CMP [EBP+rinc], 4 ; check right FOR continuity
1922. JNE single ; not continuous- > simplest method
1923. ; check FOR alignment
1924. MOV ESI, EBX ;
1925. AND ESI, 3 ; ladr MOD 4
1926. CMP ESI, 0 ; = 0- > 32 Bit alignment
  1927. JNE unaligned ; not 32 bit aligned
  1928. MOV ESI, ECX ;
  1929. AND ESI, 3 ; radr MOD 4
  1930. CMP ESI, 0 ; = 0- > 32 Bit alignment
  1931. JNE unaligned ; not 32 bit aligned
  1932. MOV ESI, EBX ;
  1933. AND ESI, 8+4 ; 16 byte alignment
  1934. MOV EDI, ECX ;
  1935. AND EDI, 8+4 ; 16 byte alignment
  1936. CMP ESI, EDI ;
1937. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and radr
1938. CMP ESI, 0 ;
1939. JE aligned ; already aligned
1940. align:
1941. ; one single element processing UNTIL 128 bit alignment achieved
  1942. MOVSS XMM1, [EBX] ;
  1943. MOVSS XMM2, [ECX] ;
  1944. MULSS XMM1, XMM2 ;
  1945. ADDSS XMM0, XMM1 ;
  1946. ADD EBX, 4 ;
  1947. ADD ECX, 4 ;
  1948. DEC EAX ; one element has been processed ;
  1949. CMP EAX, 0 ; all elements already processed?
  1950. JLE single ;
  1951. MOV ESI, EBX ;
  1952. AND ESI, 8+4 ;
  1953. CMP ESI, 0 ;
  1954. JNE align ;
  1955. aligned:
  1956. aligned12:
  1957. CMP EAX, 12 ;
1958. JL aligned4 ; len < 12- > EXIT TO aligned4
  1959. MOVAPS XMM1, [EBX] ;
  1960. MOVAPS XMM2, [EBX+16] ;
  1961. MOVAPS XMM3, [EBX+32] ;
  1962. MOVAPS XMM4, [ECX] ;
  1963. MOVAPS XMM5, [ECX+16] ;
  1964. MOVAPS XMM6, [ECX+32] ;
  1965. MULPS XMM1, XMM4 ;
  1966. ADDPS XMM0, XMM1 ;
  1967. MULPS XMM2, XMM5 ;
  1968. ADDPS XMM0, XMM2 ;
  1969. MULPS XMM3, XMM6 ;
  1970. ADDPS XMM0, XMM3 ;
  1971. ADD EBX, 48 ;
  1972. ADD ECX, 48 ;
  1973. SUB EAX, 12 ;
  1974. JMP aligned12 ;
1975. ; LOOP FOR 4 pieces aligned
1976. aligned4:
1977. CMP EAX, 4 ;
1978. JL horizontaladd ; len < 4- > EXIT TO horizontaladd
  1979. MOVAPS XMM1, [EBX] ;
  1980. MOVAPS XMM2, [ECX] ;
  1981. MULPS XMM1, XMM2 ;
  1982. ADDPS XMM0, XMM1 ;
  1983. ADD EBX, 16 ;
  1984. ADD ECX, 16 ;
  1985. SUB EAX, 4 ;
  1986. JMP aligned4 ;
  1987. unaligned:
  1988. unaligned12:
  1989. CMP EAX, 12 ;
1990. JL unaligned4 ; len < 12- > EXIT TO unaligned4
  1991. MOVUPS XMM1, [EBX] ;
  1992. MOVUPS XMM2, [EBX+16] ;
  1993. MOVUPS XMM3, [EBX+32] ;
  1994. MOVUPS XMM4, [ECX] ;
  1995. MOVUPS XMM5, [ECX+16] ;
  1996. MOVUPS XMM6, [ECX+32] ;
  1997. MULPS XMM1, XMM4 ;
  1998. ADDPS XMM0, XMM1 ;
  1999. MULPS XMM2, XMM5 ;
  2000. ADDPS XMM0, XMM2 ;
  2001. MULPS XMM3, XMM6 ;
  2002. ADDPS XMM0, XMM3 ;
  2003. ADD EBX, 48 ;
  2004. ADD ECX, 48 ;
  2005. SUB EAX, 12 ;
  2006. JMP unaligned12 ;
2007. ; LOOP FOR 4 pieces unaligned
2008. unaligned4:
2009. CMP EAX, 4 ;
2010. JL horizontaladd ; len < 4- > EXIT TO horizontaladd
  2011. MOVUPS XMM1, [EBX] ;
  2012. MOVUPS XMM2, [ECX] ;
  2013. MULPS XMM1, XMM2 ;
  2014. ADDPS XMM0, XMM1 ;
  2015. ADD EBX, 16 ;
  2016. ADD ECX, 16 ;
  2017. SUB EAX, 4 ;
  2018. JMP unaligned4 ;
  2019. horizontaladd: ;
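; XMM0 holds four partial sums (s0, s1, s2, s3); after the first SHUFPS/ADDPS pair lane 2 = s0+s2 and lane 3 = s1+s3,
; after the second pair lane 2 = s0+s1+s2+s3, and the final SHUFPS moves that total INTO lane 0, which "store" writes with MOVSS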
  2020. MOVAPS XMM1, XMM0 ;
  2021. ; 1*0 (* dest 0 -> dest 0 *) +4*1 (* dest 1 -> dest 1 *) +16*0 (* src 0 -> dest 2 *) +64*1 (* src 1 -> dest 3 *)
  2022. SHUFPS XMM1, XMM1, 1*0 +4*1 +16*0 +64*1
  2023. ADDPS XMM1, XMM0 ;
  2024. MOVAPS XMM0, XMM1
  2025. SHUFPS XMM0, XMM0, 16*3 ; src 3- > dest 2
  2026. ADDPS XMM0, XMM1 ;
  2027. SHUFPS XMM0, XMM0, 1*2 ; dest 2- > dest 0
  2028. JMP singlepieces ;
  2029. single:
  2030. singlepieces: ;
  2031. CMP EAX, 0 ;
  2032. JLE store ; len <= 0- > EXIT
  2033. MOVSS XMM1, [EBX]
  2034. MOVSS XMM2, [ECX]
  2035. MULSS XMM1, XMM2
  2036. ADDSS XMM0, XMM1
  2037. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  2038. ADD ECX, [EBP+rinc] ; INC(radr, incr)
  2039. DEC EAX ; DEC(len)
  2040. JMP singlepieces ;
  2041. store:
  2042. MOVSS [EDX], XMM0 ;
  2043. endL:
  2044. END SPARARLoopSSE;
  2045. PROCEDURE MulAXSXLoopA( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2046. CODE {SYSTEM.i386, SYSTEM.FPU}
  2047. MOV EAX, [EBP+len] ; eax := len
  2048. MOV EBX, [EBP+ladr] ; ebx := ladr
  2049. MOV ECX, [EBP+radr] ; ecx := radr
  2050. MOV EDX, [EBP+dadr] ; edx := dadr
  2051. start:
  2052. CMP EAX, 0 ; WHILE len > 0 DO
  2053. JLE endL
  2054. FLD QWORD [EBX] ; S.GET(ladr, x)
  2055. ADD EBX, [EBP+linc] ; INC(ladr, incl)
2056. FLD QWORD [ECX] ; S.GET(radr, y)
2057. FMULP ; x := x*y
2058. FSTP QWORD [EDX]
2059. ADD EDX, [EBP+dinc] ; INC(dadr, incd)
  2060. DEC EAX ; DEC(len)
  2061. JMP start ;
  2062. endL:
  2063. FWAIT ;
  2064. END MulAXSXLoopA;
  2065. PROCEDURE MulARSRLoopA( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2066. CODE {SYSTEM.i386, SYSTEM.FPU}
  2067. MOV EAX, [EBP+len] ; eax := len
  2068. MOV EBX, [EBP+ladr] ; ebx := ladr
  2069. MOV ECX, [EBP+radr] ; ecx := radr
  2070. MOV EDX, [EBP+dadr] ; edx := dadr
  2071. start:
  2072. CMP EAX, 0 ; WHILE len > 0 DO
  2073. JLE endL
  2074. FLD DWORD [EBX] ; S.GET(ladr, x)
  2075. ADD EBX, [EBP+linc] ; INC(ladr, incl)
2076. FLD DWORD [ECX] ; S.GET(radr, y)
2077. FMULP ; x := x*y
2078. FSTP DWORD [EDX]
2079. ADD EDX, [EBP+dinc] ; INC(dadr, incd)
  2080. DEC EAX ; DEC(len)
  2081. JMP start ;
  2082. endL:
  2083. FWAIT ;
  2084. END MulARSRLoopA;
  2085. PROCEDURE IncMulAXSXLoopA( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2086. CODE {SYSTEM.i386, SYSTEM.FPU}
  2087. MOV EAX, [EBP+len] ; eax := len
  2088. MOV EBX, [EBP+ladr] ; ebx := ladr
  2089. MOV ECX, [EBP+radr] ; ecx := radr
  2090. MOV EDX, [EBP+dadr] ; edx := dadr
  2091. start:
  2092. CMP EAX, 0 ; WHILE len > 0 DO
  2093. JLE endL
  2094. FLD QWORD [EBX] ; S.GET(ladr, x)
  2095. ADD EBX, [EBP+linc] ; INC(ladr, incl)
2096. FLD QWORD [ECX] ; S.GET(radr, y)
2097. FMULP ; x := x*y
2098. FLD QWORD [EDX] ; load current destination value (accumulate)
2099. FADDP ;
2100. FSTP QWORD [EDX]
2101. ADD EDX, [EBP+dinc] ; INC(dadr, incd)
  2102. DEC EAX ; DEC(len)
  2103. JMP start ;
  2104. endL:
  2105. FWAIT ;
  2106. END IncMulAXSXLoopA;
  2107. PROCEDURE IncMulARSRLoopA( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2108. CODE {SYSTEM.i386, SYSTEM.FPU}
  2109. MOV EAX, [EBP+len] ; eax := len
  2110. MOV EBX, [EBP+ladr] ; ebx := ladr
  2111. MOV ECX, [EBP+radr] ; ecx := radr
  2112. MOV EDX, [EBP+dadr] ; edx := dadr
  2113. start:
  2114. CMP EAX, 0 ; WHILE len > 0 DO
  2115. JLE endL
  2116. FLD DWORD [EBX] ; S.GET(ladr, x)
  2117. ADD EBX, [EBP+linc] ; INC(ladr, incl)
2118. FLD DWORD [ECX] ; S.GET(radr, y)
2119. FMULP ; x := x*y
2120. FLD DWORD [EDX] ; load current destination value (accumulate)
2121. FADDP ;
2122. FSTP DWORD [EDX]
2123. ADD EDX, [EBP+dinc] ; INC(dadr, incd)
  2124. DEC EAX ; DEC(len)
  2125. JMP start ;
  2126. endL:
  2127. FWAIT ;
  2128. END IncMulARSRLoopA;
  2129. PROCEDURE MulAXSXLoopSSE( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2130. (* simple version, does not yet check for alignment, no parallel executions yet: use full 8 registers! *)
  2131. (*
  2132. 1.) check for same alignment of ladr and dadr (ladr MOD 128 = dadr MOD 128)
  2133. 2.) process starting unaligned data ( using single instructions)
  2134. 3.) process aligned data
  2135. 4.) process remaining unaligned data (using single instructions)
  2136. *)
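(* In effect the loop below computes dadr[i] := ladr[i] * radr^ for i = 0..len-1, with the scalar at radr broadcast into both halves of XMM0; the alignment checks and the single-element peel implement steps 1) and 2) above, "aligned8"/"aligned2" and "unaligned8"/"unaligned2" implement step 3), and "singlepieces" implements step 4). *)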
  2137. CODE {SYSTEM.i386, SYSTEM.SSE2}
  2138. ; register initialization
2139. MOV EAX, [EBP+len] ; EAX reserved FOR length
  2140. CMP EAX, 0 ;
  2141. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  2142. MOV EBX, [EBP+ladr] ; EBX reserved FOR ladr
  2143. MOV EDX, [EBP+dadr] ; EDX reserved FOR dadr
  2144. MOV ECX, [EBP+radr] ;
  2145. MOVSD XMM0, [ECX] ;
  2146. SHUFPD XMM0, XMM0, 0 ; high bits := low bits
  2147. ; check IF data are contiguous IN memory
2148. CMP [EBP+linc], 8 ; check left FOR continuity
  2149. JNE single ; not continuous- > simplest method
  2150. CMP [EBP+dinc], 8 ; check dest FOR continuity
  2151. JNE single ; not continuous- > simplest method
  2152. ; check FOR alignment
  2153. MOV ECX, EBX ;
  2154. AND ECX, 7 ; ladr MOD 8
  2155. CMP ECX, 0 ; ECX = 0- > 64 Bit alignment
  2156. JNE unaligned ; not 64 bit aligned
  2157. MOV ECX, EDX ;
  2158. AND ECX, 7 ; dadr MOD 8
  2159. CMP ECX, 0 ; ECX = 0- > 64 Bit alignment
  2160. JNE unaligned ; not 64 bit aligned
  2161. MOV ESI, EBX ;
  2162. AND ESI, 8 ; 16 byte alignment
  2163. MOV EDI, EDX ;
  2164. AND EDI, 8 ; 16 byte alignment
  2165. CMP ESI, EDI ;
  2166. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  2167. CMP ESI, 8 ;
  2168. JNE aligned ; ladr and dadr already 128 bit aligned
2169. ; one single element processing TO achieve 128 bit alignment
  2170. MOVSD XMM1, [EBX] ;
  2171. MULSD XMM1, XMM0 ;
  2172. MOVSD [EDX], XMM1 ;
  2173. ADD EBX, 8 ; now EBX IS 16 byte aligned
  2174. ADD EDX, 8 ; now EDX IS 16 byte aligned ;
  2175. DEC EAX ; one element has been processed
  2176. ; LOOP FOR 8 pieces aligned(no better performance with 14 pieces!)
  2177. aligned:
  2178. aligned8:
  2179. CMP EAX, 8 ;
2180. JL aligned2 ; len < 8- > EXIT TO aligned2
  2181. MOVAPD XMM1, [EBX] ;
  2182. MOVAPD XMM2, [EBX+16] ;
  2183. MOVAPD XMM3, [EBX+32] ;
  2184. MOVAPD XMM4, [EBX+48] ;
  2185. ADD EBX, 64 ;
  2186. MULPD XMM1, XMM0 ;
  2187. MULPD XMM2, XMM0 ;
  2188. MULPD XMM3, XMM0 ;
  2189. MULPD XMM4, XMM0 ;
  2190. MOVAPD [EDX], XMM1 ;
  2191. MOVAPD [EDX+16], XMM2 ;
  2192. MOVAPD [EDX+32], XMM3 ;
  2193. MOVAPD [EDX+48], XMM4 ;
  2194. ADD EDX, 64 ;
  2195. SUB EAX, 8 ;
  2196. JMP aligned8 ;
  2197. ; LOOP FOR 2 pieces aligned
  2198. aligned2: ;
  2199. CMP EAX, 2 ;
  2200. JL singlepieces ; len < 2- > EXIT TO singlepieces
  2201. MOVAPD XMM1, [EBX] ;
  2202. ADD EBX, 16 ;
  2203. MULPD XMM1, XMM0 ;
  2204. MOVAPD [EDX], XMM1 ;
  2205. ADD EDX, 16 ;
  2206. SUB EAX, 2 ;
  2207. JMP aligned2 ;
  2208. ; LOOP FOR 8 unaligned pieces(14 pieces not better!)
  2209. unaligned: ;
  2210. unaligned8: ;
  2211. CMP EAX, 8 ;
2212. JL unaligned2 ; len < 8- > EXIT TO unaligned2
  2213. MOVUPD XMM1, [EBX] ;
  2214. MOVUPD XMM2, [EBX+16] ;
  2215. MOVUPD XMM3, [EBX+32] ;
  2216. MOVUPD XMM4, [EBX+48] ;
  2217. ADD EBX, 64
  2218. MULPD XMM1, XMM0 ;
  2219. MULPD XMM2, XMM0 ;
  2220. MULPD XMM3, XMM0 ;
  2221. MULPD XMM4, XMM0 ;
  2222. MOVUPD [EDX], XMM1 ;
  2223. MOVUPD [EDX+16], XMM2 ;
  2224. MOVUPD [EDX+32], XMM3 ;
  2225. MOVUPD [EDX+48], XMM4 ;
  2226. ADD EDX, 64 ;
  2227. SUB EAX, 8 ;
  2228. JMP unaligned8 ;
  2229. ; LOOP FOR 2 pieces unaligned
  2230. unaligned2: ;
  2231. CMP EAX, 2 ;
  2232. JL singlepieces ; len < 2- > EXIT
  2233. MOVUPD XMM1, [EBX] ;
  2234. ADD EBX, 16 ;
  2235. MULPD XMM1, XMM0 ;
  2236. MOVUPD [EDX], XMM1 ;
  2237. ADD EDX, 16 ;
  2238. SUB EAX, 2 ;
  2239. JMP unaligned2 ;
  2240. ; one piece left OR non-contiguous data
  2241. single:
  2242. singlepieces: ;
  2243. CMP EAX, 0 ;
  2244. JLE endL ; len <= 0- > EXIT
  2245. MOVSD XMM1, [EBX]
  2246. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  2247. MULSD XMM1, XMM0
  2248. MOVSD [EDX], XMM1
2249. ADD EDX, [EBP+dinc] ; INC(dadr, incd)
  2250. DEC EAX ; DEC(len)
  2251. JMP singlepieces ;
  2252. endL:
  2253. END MulAXSXLoopSSE;
  2254. PROCEDURE MulARSRLoopSSE( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2255. (* simple version, does not yet check for alignment, no parallel executions yet: use full 8 registers! *)
  2256. (*
  2257. 1.) check for same alignment of ladr and dadr (ladr MOD 128 = dadr MOD 128)
  2258. 2.) process starting unaligned data ( using single instructions)
  2259. 3.) process aligned data
  2260. 4.) process remaining unaligned data (using single instructions)
  2261. *)
  2262. CODE {SYSTEM.i386, SYSTEM.SSE}
  2263. ; register initialization
2264. MOV EAX, [EBP+len] ; EAX reserved FOR length
  2265. CMP EAX, 0 ;
  2266. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  2267. MOV EBX, [EBP+ladr] ; EBX reserved FOR ladr
  2268. MOV EDX, [EBP+dadr] ; EDX reserved FOR dadr
  2269. MOV ECX, [EBP+radr] ;
  2270. MOVSS XMM0, [ECX] ;
2271. SHUFPS XMM0, XMM0, 0 ; all positions now carry the same value
2272. ; check IF data are contiguous IN memory
2273. CMP [EBP+linc], 4 ; check left FOR continuity
  2274. JNE single ; not continuous- > simplest method
  2275. CMP [EBP+dinc], 4 ; check dest FOR continuity
  2276. JNE single ; not continuous- > simplest method
  2277. ; check FOR alignment
  2278. MOV ECX, EBX ;
  2279. AND ECX, 3 ; ladr MOD 4
  2280. CMP ECX, 0 ; ECX = 0- > 32 Bit alignment
  2281. JNE unaligned ; not 32 bit aligned
  2282. MOV ECX, EDX ;
  2283. AND ECX, 3 ; dadr MOD 4
  2284. CMP ECX, 0 ; ECX = 0- > 32 Bit alignment
2285. JNE unaligned ; not 32 bit aligned
  2286. MOV ESI, EBX ;
  2287. AND ESI, 8+4 ; 16 byte alignment
  2288. MOV EDI, EDX ;
  2289. AND EDI, 8+4 ; 16 byte alignment
  2290. CMP ESI, EDI ;
  2291. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  2292. CMP ESI, 0 ;
  2293. JE aligned ; already aligned
  2294. align:
2295. ; one single element processing UNTIL 128 bit alignment achieved
  2296. MOVSS XMM1, [EBX] ;
  2297. MULSS XMM1, XMM0 ;
  2298. MOVSS [EDX], XMM1 ;
  2299. ADD EBX, 4 ;
  2300. ADD EDX, 4 ;
  2301. DEC EAX ; one element has been processed ;
  2302. CMP EAX, 0 ; all elements already processed?
  2303. JLE single
  2304. MOV ESI, EBX ;
  2305. AND ESI, 8+4 ;
  2306. CMP ESI, 0 ;
  2307. JNE align ;
  2308. aligned:
  2309. aligned16:
  2310. CMP EAX, 16 ;
2311. JL aligned4 ; len < 16- > EXIT TO aligned4
  2312. MOVAPS XMM1, [EBX] ;
  2313. MOVAPS XMM2, [EBX+16] ;
  2314. MOVAPS XMM3, [EBX+32] ;
  2315. MOVAPS XMM4, [EBX+48] ;
  2316. ADD EBX, 64 ;
  2317. MULPS XMM1, XMM0 ;
  2318. MULPS XMM2, XMM0 ;
  2319. MULPS XMM3, XMM0 ;
  2320. MULPS XMM4, XMM0 ;
  2321. MOVAPS [EDX], XMM1 ;
  2322. MOVAPS [EDX+16], XMM2 ;
  2323. MOVAPS [EDX+32], XMM3 ;
  2324. MOVAPS [EDX+48], XMM4 ;
  2325. ADD EDX, 64 ;
  2326. SUB EAX, 16 ;
  2327. JMP aligned16 ;
2328. ; LOOP FOR 4 pieces aligned
2329. aligned4: ;
2330. CMP EAX, 4 ;
2331. JL singlepieces ; len < 4- > EXIT TO singlepieces
  2332. MOVAPS XMM1, [EBX] ;
  2333. ADD EBX, 16 ;
  2334. MULPS XMM1, XMM0 ;
  2335. MOVAPS [EDX], XMM1 ;
  2336. ADD EDX, 16 ;
  2337. SUB EAX, 4 ;
  2338. JMP aligned4 ;
  2339. ; LOOP FOR 16 unaligned pieces(20 pieces not better!)
  2340. unaligned: ;
  2341. unaligned16: ;
  2342. CMP EAX, 16 ;
2343. JL unaligned4 ; len < 16- > EXIT TO unaligned4
  2344. MOVUPS XMM1, [EBX] ;
  2345. MOVUPS XMM2, [EBX+16] ;
  2346. MOVUPS XMM3, [EBX+32] ;
  2347. MOVUPS XMM4, [EBX+48] ;
  2348. ADD EBX, 64
  2349. MULPS XMM1, XMM0 ;
  2350. MULPS XMM2, XMM0 ;
  2351. MULPS XMM3, XMM0 ;
  2352. MULPS XMM4, XMM0 ;
  2353. MOVUPS [EDX], XMM1 ;
  2354. MOVUPS [EDX+16], XMM2 ;
  2355. MOVUPS [EDX+32], XMM3 ;
  2356. MOVUPS [EDX+48], XMM4 ;
  2357. ADD EDX, 64 ;
  2358. SUB EAX, 16 ;
  2359. JMP unaligned16 ;
2360. ; LOOP FOR 4 pieces unaligned
2361. unaligned4: ;
2362. CMP EAX, 4 ;
2363. JL singlepieces ; len < 4- > EXIT
  2364. MOVUPS XMM1, [EBX] ;
  2365. ADD EBX, 16 ;
  2366. MULPS XMM1, XMM0 ;
  2367. MOVUPS [EDX], XMM1 ;
  2368. ADD EDX, 16 ;
  2369. SUB EAX, 4 ;
  2370. JMP unaligned4 ;
  2371. ; one piece left OR non-contiguous data
  2372. single:
  2373. singlepieces: ;
  2374. CMP EAX, 0 ;
  2375. JLE endL ; len <= 0- > EXIT
  2376. MOVSS XMM1, [EBX]
  2377. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  2378. MULSS XMM1, XMM0
  2379. MOVSS [EDX], XMM1
2380. ADD EDX, [EBP+dinc] ; INC(dadr, incd)
  2381. DEC EAX ; DEC(len)
  2382. JMP singlepieces ;
  2383. endL:
  2384. END MulARSRLoopSSE;
  2385. PROCEDURE IncMulAXSXLoopSSE( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2386. (* simple version, does not yet check for alignment, no parallel executions yet: use full 8 registers! *)
  2387. (*
  2388. 1.) check for same alignment of ladr and dadr (ladr MOD 128 = dadr MOD 128)
  2389. 2.) process starting unaligned data ( using single instructions)
  2390. 3.) process aligned data
  2391. 4.) process remaining unaligned data (using single instructions)
  2392. *)
  2393. CODE {SYSTEM.i386, SYSTEM.SSE2}
  2394. ; register initialization
2395. MOV EAX, [EBP+len] ; EAX reserved FOR length
  2396. CMP EAX, 0 ;
  2397. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  2398. MOV EBX, [EBP+ladr] ; EBX reserved FOR ladr
  2399. MOV EDX, [EBP+dadr] ; EDX reserved FOR dadr
  2400. MOV ECX, [EBP+radr] ;
  2401. MOVSD XMM0, [ECX] ;
  2402. SHUFPD XMM0, XMM0, 0 ; high bits := low bits
  2403. ; check IF data are contiguous IN memory
2404. CMP [EBP+linc], 8 ; check left FOR continuity
  2405. JNE single ; not continuous- > simplest method
  2406. CMP [EBP+dinc], 8 ; check dest FOR continuity
  2407. JNE single ; not continuous- > simplest method
  2408. ; check FOR alignment
  2409. MOV ECX, EBX ;
  2410. AND ECX, 7 ; ladr MOD 8
  2411. CMP ECX, 0 ; ECX = 0- > 64 Bit alignment
  2412. JNE unaligned ; not 64 bit aligned
  2413. MOV ECX, EDX ;
  2414. AND ECX, 7 ; dadr MOD 8
  2415. CMP ECX, 0 ; ECX = 0- > 64 Bit alignment
  2416. JNE unaligned ; not 64 bit aligned
  2417. MOV ESI, EBX ;
  2418. AND ESI, 8 ; 16 byte alignment
  2419. MOV EDI, EDX ;
  2420. AND EDI, 8 ; 16 byte alignment
  2421. CMP ESI, EDI ;
  2422. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  2423. CMP ESI, 8 ;
  2424. JNE aligned ; ladr and dadr already 128 bit aligned
2425. ; one single element processing TO achieve 128 bit alignment
  2426. MOVSD XMM1, [EBX] ;
  2427. MULSD XMM1, XMM0 ;
  2428. MOVSD XMM2, [EDX] ;
  2429. ADDSD XMM1, XMM2 ;
  2430. MOVSD [EDX], XMM1 ;
  2431. ADD EBX, 8 ; now EBX IS 16 byte aligned
  2432. ADD EDX, 8 ; now EDX IS 16 byte aligned ;
  2433. DEC EAX ; one element has been processed
  2434. ; LOOP FOR 8 pieces aligned(no better performance with 14 pieces!)
  2435. aligned:
  2436. aligned8:
  2437. CMP EAX, 8 ;
2438. JL aligned2 ; len < 8- > EXIT TO aligned2
  2439. MOVAPD XMM1, [EBX] ;
  2440. MOVAPD XMM2, [EBX+16] ;
  2441. MOVAPD XMM3, [EBX+32] ;
  2442. MOVAPD XMM4, [EBX+48] ;
  2443. ADD EBX, 64 ;
  2444. MULPD XMM1, XMM0 ;
  2445. MULPD XMM2, XMM0 ;
  2446. MULPD XMM3, XMM0 ;
  2447. MULPD XMM4, XMM0 ;
  2448. MOVAPD XMM5, [EDX] ;
  2449. ADDPD XMM1, XMM5
  2450. MOVAPD [EDX], XMM1 ;
  2451. MOVAPD XMM6, [EDX+16] ;
  2452. ADDPD XMM2, XMM6
  2453. MOVAPD [EDX+16], XMM2 ;
  2454. MOVAPD XMM7, [EDX+32] ;
  2455. ADDPD XMM3, XMM7
  2456. MOVAPD [EDX+32], XMM3 ;
  2457. MOVAPD XMM5, [EDX+48] ;
  2458. ADDPD XMM4, XMM5
  2459. MOVAPD [EDX+48], XMM4 ;
  2460. ADD EDX, 64 ;
  2461. SUB EAX, 8 ;
  2462. JMP aligned8 ;
  2463. ; LOOP FOR 2 pieces aligned
  2464. aligned2: ;
  2465. CMP EAX, 2 ;
  2466. JL singlepieces ; len < 2- > EXIT TO singlepieces
  2467. MOVAPD XMM1, [EBX] ;
  2468. ADD EBX, 16 ;
  2469. MULPD XMM1, XMM0 ;
  2470. MOVAPD XMM2, [EDX] ;
  2471. ADDPD XMM1, XMM2
  2472. MOVAPD [EDX], XMM1 ;
  2473. ADD EDX, 16 ;
  2474. SUB EAX, 2 ;
  2475. JMP aligned2 ;
  2476. ; LOOP FOR 8 unaligned pieces(14 pieces not better!)
  2477. unaligned: ;
  2478. unaligned8: ;
  2479. CMP EAX, 8 ;
2480. JL unaligned2 ; len < 8- > EXIT TO unaligned2
  2481. MOVUPD XMM1, [EBX] ;
  2482. MOVUPD XMM2, [EBX+16] ;
  2483. MOVUPD XMM3, [EBX+32] ;
  2484. MOVUPD XMM4, [EBX+48] ;
  2485. ADD EBX, 64
  2486. MULPD XMM1, XMM0 ;
  2487. MULPD XMM2, XMM0 ;
  2488. MULPD XMM3, XMM0 ;
  2489. MULPD XMM4, XMM0 ;
  2490. MOVUPD XMM5, [EDX] ;
  2491. ADDPD XMM1, XMM5
  2492. MOVUPD [EDX], XMM1 ;
  2493. MOVUPD XMM6, [EDX+16] ;
  2494. ADDPD XMM2, XMM6
  2495. MOVUPD [EDX+16], XMM2 ;
  2496. MOVUPD XMM7, [EDX+32] ;
  2497. ADDPD XMM3, XMM7
  2498. MOVUPD [EDX+32], XMM3 ;
  2499. MOVUPD XMM5, [EDX+48] ;
  2500. ADDPD XMM4, XMM5
  2501. MOVUPD [EDX+48], XMM4 ;
  2502. ADD EDX, 64 ;
  2503. SUB EAX, 8 ;
  2504. JMP unaligned8 ;
  2505. ; LOOP FOR 2 pieces unaligned
  2506. unaligned2: ;
  2507. CMP EAX, 2 ;
  2508. JL singlepieces ; len < 2- > EXIT
  2509. MOVUPD XMM1, [EBX] ;
  2510. ADD EBX, 16 ;
  2511. MULPD XMM1, XMM0 ;
  2512. MOVUPD XMM2, [EDX] ;
  2513. ADDPD XMM1, XMM2
  2514. MOVUPD [EDX], XMM1 ;
  2515. ADD EDX, 16 ;
  2516. SUB EAX, 2 ;
  2517. JMP unaligned2 ;
  2518. ; one piece left OR non-contiguous data
  2519. single:
  2520. singlepieces: ;
  2521. CMP EAX, 0 ;
  2522. JLE endL ; len <= 0- > EXIT
  2523. MOVSD XMM1, [EBX]
  2524. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  2525. MULSD XMM1, XMM0
  2526. MOVSD XMM2, [EDX] ;
  2527. ADDSD XMM1, XMM2
  2528. MOVSD [EDX], XMM1
2529. ADD EDX, [EBP+dinc] ; INC(dadr, incd)
  2530. DEC EAX ; DEC(len)
  2531. JMP singlepieces ;
  2532. endL:
  2533. END IncMulAXSXLoopSSE;
  2534. PROCEDURE IncMulARSRLoopSSE( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2535. (* simple version, does not yet check for alignment, no parallel executions yet: use full 8 registers! *)
  2536. (*
  2537. 1.) check for same alignment of ladr and dadr (ladr MOD 128 = dadr MOD 128)
  2538. 2.) process starting unaligned data ( using single instructions)
  2539. 3.) process aligned data
  2540. 4.) process remaining unaligned data (using single instructions)
  2541. *)
  2542. CODE {SYSTEM.i386, SYSTEM.SSE}
  2543. ; register initialization
2544. MOV EAX, [EBP+len] ; EAX reserved FOR length
  2545. CMP EAX, 0 ;
  2546. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  2547. MOV EBX, [EBP+ladr] ; EBX reserved FOR ladr
  2548. MOV EDX, [EBP+dadr] ; EDX reserved FOR dadr
  2549. MOV ECX, [EBP+radr] ;
  2550. MOVSS XMM0, [ECX] ;
2551. SHUFPS XMM0, XMM0, 0 ; all positions now carry the same value
2552. ; check IF data are contiguous IN memory
2553. CMP [EBP+linc], 4 ; check left FOR continuity
  2554. JNE single ; not continuous- > simplest method
  2555. CMP [EBP+dinc], 4 ; check dest FOR continuity
  2556. JNE single ; not continuous- > simplest method
  2557. ; check FOR alignment
  2558. MOV ECX, EBX ;
  2559. AND ECX, 3 ; ladr MOD 4
  2560. CMP ECX, 0 ; ECX = 0- > 32 Bit alignment
  2561. JNE unaligned ; not 32 bit aligned
  2562. MOV ECX, EDX ;
  2563. AND ECX, 3 ; dadr MOD 4
  2564. CMP ECX, 0 ; ECX = 0- > 32 Bit alignment
2565. JNE unaligned ; not 32 bit aligned
  2566. MOV ESI, EBX ;
  2567. AND ESI, 8+4 ; 16 byte alignment
  2568. MOV EDI, EDX ;
  2569. AND EDI, 8+4 ; 16 byte alignment
  2570. CMP ESI, EDI ;
  2571. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  2572. CMP ESI, 0 ;
  2573. JE aligned ; already aligned
  2574. align:
2575. ; one single element processing UNTIL 128 bit alignment achieved
  2576. MOVSS XMM1, [EBX] ;
  2577. MULSS XMM1, XMM0 ;
  2578. MOVSS XMM2, [EDX] ;
  2579. ADDSS XMM1, XMM2 ;
  2580. MOVSS [EDX], XMM1 ;
  2581. ADD EBX, 4 ;
  2582. ADD EDX, 4 ;
  2583. DEC EAX ; one element has been processed ;
  2584. CMP EAX, 0 ; all elements already processed?
  2585. JLE single
  2586. MOV ESI, EBX ;
  2587. AND ESI, 8+4 ;
  2588. CMP ESI, 0 ;
  2589. JNE align ;
  2590. aligned:
  2591. aligned16:
  2592. CMP EAX, 16 ;
2593. JL aligned4 ; len < 16- > EXIT TO aligned4
  2594. MOVAPS XMM1, [EBX] ;
  2595. MOVAPS XMM2, [EBX+16] ;
  2596. MOVAPS XMM3, [EBX+32] ;
  2597. MOVAPS XMM4, [EBX+48] ;
  2598. ADD EBX, 64 ;
  2599. MULPS XMM1, XMM0 ;
  2600. MULPS XMM2, XMM0 ;
  2601. MULPS XMM3, XMM0 ;
  2602. MULPS XMM4, XMM0 ;
  2603. MOVAPS XMM5, [EDX] ;
  2604. ADDPS XMM1, XMM5 ;
  2605. MOVAPS [EDX], XMM1 ;
  2606. MOVAPS XMM6, [EDX+16] ;
  2607. ADDPS XMM2, XMM6 ;
  2608. MOVAPS [EDX+16], XMM2 ;
  2609. MOVAPS XMM7, [EDX+32] ;
  2610. ADDPS XMM3, XMM7 ;
  2611. MOVAPS [EDX+32], XMM3 ;
  2612. MOVAPS XMM5, [EDX+48] ;
  2613. ADDPS XMM4, XMM5 ;
  2614. MOVAPS [EDX+48], XMM4 ;
  2615. ADD EDX, 64 ;
  2616. SUB EAX, 16 ;
  2617. JMP aligned16 ;
2618. ; LOOP FOR 4 pieces aligned
2619. aligned4: ;
2620. CMP EAX, 4 ;
2621. JL singlepieces ; len < 4- > EXIT TO singlepieces
  2622. MOVAPS XMM1, [EBX] ;
  2623. ADD EBX, 16 ;
  2624. MULPS XMM1, XMM0 ;
  2625. MOVAPS XMM2, [EDX] ;
  2626. ADDPS XMM1, XMM2 ;
  2627. MOVAPS [EDX], XMM1 ;
  2628. ADD EDX, 16 ;
  2629. SUB EAX, 4 ;
  2630. JMP aligned4 ;
  2631. ; LOOP FOR 16 unaligned pieces(20 pieces not better!)
  2632. unaligned: ;
  2633. unaligned16: ;
  2634. CMP EAX, 16 ;
2635. JL unaligned4 ; len < 16- > EXIT TO unaligned4
  2636. MOVUPS XMM1, [EBX] ;
  2637. MOVUPS XMM2, [EBX+16] ;
  2638. MOVUPS XMM3, [EBX+32] ;
  2639. MOVUPS XMM4, [EBX+48] ;
  2640. ADD EBX, 64
  2641. MULPS XMM1, XMM0 ;
  2642. MULPS XMM2, XMM0 ;
  2643. MULPS XMM3, XMM0 ;
  2644. MULPS XMM4, XMM0 ;
  2645. MOVUPS XMM5, [EDX] ;
  2646. ADDPS XMM1, XMM5 ;
  2647. MOVUPS [EDX], XMM1 ;
  2648. MOVUPS XMM6, [EDX+16] ;
  2649. ADDPS XMM2, XMM6 ;
  2650. MOVUPS [EDX+16], XMM2 ;
  2651. MOVUPS XMM7, [EDX+32] ;
  2652. ADDPS XMM3, XMM7 ;
  2653. MOVUPS [EDX+32], XMM3 ;
  2654. MOVUPS XMM5, [EDX+48] ;
  2655. ADDPS XMM4, XMM5 ;
  2656. MOVUPS [EDX+48], XMM4 ;
  2657. ADD EDX, 64 ;
  2658. SUB EAX, 16 ;
  2659. JMP unaligned16 ;
2660. ; LOOP FOR 4 pieces unaligned
2661. unaligned4: ;
2662. CMP EAX, 4 ;
2663. JL singlepieces ; len < 4- > EXIT
  2664. MOVUPS XMM1, [EBX] ;
  2665. ADD EBX, 16 ;
  2666. MULPS XMM1, XMM0 ;
  2667. MOVUPS XMM2, [EDX] ;
  2668. ADDPS XMM1, XMM2 ;
  2669. MOVUPS [EDX], XMM1 ;
  2670. ADD EDX, 16 ;
  2671. SUB EAX, 4 ;
  2672. JMP unaligned4 ;
  2673. ; one piece left OR non-contiguous data
  2674. single:
  2675. singlepieces: ;
  2676. CMP EAX, 0 ;
  2677. JLE endL ; len <= 0- > EXIT
  2678. MOVSS XMM1, [EBX]
  2679. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  2680. MULSS XMM1, XMM0
  2681. MOVSS XMM2, [EDX] ;
  2682. ADDSS XMM1, XMM2 ;
  2683. MOVSS [EDX], XMM1
2684. ADD EDX, [EBP+dinc] ; INC(dadr, incd)
  2685. DEC EAX ; DEC(len)
  2686. JMP singlepieces ;
  2687. endL:
  2688. END IncMulARSRLoopSSE;
  2689. (*
  2690. PROCEDURE AlignedSPXSSE5( ladr, radr, dadr, stride, incd, len: LONGINT );
  2691. CODE {SYSTEM.i386, SYSTEM.SSE2}
  2692. ; ; register initialization
  2693. MOV EDX, [EBP+dadr] ; EDX reserved for dadr
  2694. MOV EBX, [EBP+ladr] ; EBX reserved for ladr
  2695. MOV ESI, [EBP+radr] ; ESI reserved for radr
2696. MOV EAX, [EBP+len] ; EAX reserved for length
  2697. MOV ECX, [EBP+stride] ; ECX reserved for stride
  2698. XORPD XMM2, XMM2 ;
  2699. XORPD XMM3, XMM3 ;
  2700. XORPD XMM4, XMM4 ;
  2701. XORPD XMM5, XMM5 ;
  2702. XORPD XMM6, XMM6 ;
  2703. XOR EDI, EDI ;
  2704. aligned4:
  2705. CMP EAX, 4 ;
  2706. JL aligned2 ; ; len < 4- > exit to singlepieces
  2707. MOV ESI, [EBP+radr] ;
  2708. ADD ESI, EDI ;
  2709. MOVAPD XMM7, [EBX] ;
  2710. MOVAPD XMM0, [ESI] ;
  2711. ADD ESI, ECX ;
  2712. MOVAPD XMM1, [ESI] ;
  2713. MULPD XMM0, XMM7 ;
  2714. ADDPD XMM2, XMM0 ;
  2715. ADD ESI, ECX ;
  2716. MOVAPD XMM0, [ESI] ;
  2717. MULPD XMM1, XMM7 ;
  2718. ADDPD XMM3, XMM1 ;
  2719. ADD ESI, ECX ;
  2720. MOVAPD XMM1, [ESI] ;
  2721. MULPD XMM0, XMM7 ;
  2722. ADDPD XMM4, XMM0 ;
  2723. ADD ESI, ECX ;
  2724. MOVAPD XMM0, [ESI] ;
  2725. MULPD XMM1, XMM7 ;
  2726. ADDPD XMM5, XMM1 ;
  2727. MULPD XMM0, XMM7 ;
  2728. ADDPD XMM6, XMM0 ;
  2729. ADD EBX, 16 ;
  2730. ADD EDI, 16 ;
  2731. MOV ESI, [EBP+radr] ;
  2732. ADD ESI, EDI ;
  2733. MOVAPD XMM7, [EBX] ;
  2734. MOVAPD XMM0, [ESI] ;
  2735. ADD ESI, ECX ;
  2736. MOVAPD XMM1, [ESI] ;
  2737. MULPD XMM0, XMM7 ;
  2738. ADDPD XMM2, XMM0 ;
  2739. ADD ESI, ECX ;
  2740. MOVAPD XMM0, [ESI] ;
  2741. MULPD XMM1, XMM7 ;
  2742. ADDPD XMM3, XMM1 ;
  2743. ADD ESI, ECX ;
  2744. MOVAPD XMM1, [ESI] ;
  2745. MULPD XMM0, XMM7 ;
  2746. ADDPD XMM4, XMM0 ;
  2747. ADD ESI, ECX ;
  2748. MOVAPD XMM0, [ESI] ;
  2749. MULPD XMM1, XMM7 ;
  2750. ADDPD XMM5, XMM1 ;
  2751. MULPD XMM0, XMM7 ;
  2752. ADDPD XMM6, XMM0 ;
  2753. ADD EBX, 16 ;
  2754. ADD EDI, 16 ;
  2755. SUB EAX, 4 ;
  2756. JMP aligned4 ;
  2757. aligned2:
  2758. CMP EAX, 2 ;
  2759. JL horizontaladd ; ; len < 4- > exit to singlepieces
  2760. MOV ESI, [EBP+radr] ;
  2761. ADD ESI, EDI ;
  2762. MOVAPD XMM7, [EBX] ;
  2763. MOVAPD XMM0, [ESI] ;
  2764. ADD ESI, ECX ;
  2765. MOVAPD XMM1, [ESI] ;
  2766. MULPD XMM0, XMM7 ;
  2767. ADDPD XMM2, XMM0 ;
  2768. ADD ESI, ECX ;
  2769. MOVAPD XMM0, [ESI] ;
  2770. MULPD XMM1, XMM7 ;
  2771. ADDPD XMM3, XMM1 ;
  2772. ADD ESI, ECX ;
  2773. MOVAPD XMM1, [ESI] ;
  2774. MULPD XMM0, XMM7 ;
  2775. ADDPD XMM4, XMM0 ;
  2776. ADD ESI, ECX ;
  2777. MOVAPD XMM0, [ESI] ;
  2778. MULPD XMM1, XMM7 ;
  2779. ADDPD XMM5, XMM1 ;
  2780. MULPD XMM0, XMM7 ;
  2781. ADDPD XMM6, XMM0 ;
  2782. ADD EBX, 16 ;
  2783. ADD EDI, 16 ;
  2784. SUB EAX, 2 ;
  2785. JMP aligned2 ;
  2786. horizontaladd: ;
  2787. MOVAPD XMM1, XMM2 ;
  2788. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  2789. ADDPD XMM2, XMM1 ;
  2790. MOVAPD XMM1, XMM3 ;
  2791. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  2792. ADDPD XMM3, XMM1 ;
  2793. MOVAPD XMM1, XMM4 ;
  2794. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  2795. ADDPD XMM4, XMM1 ;
  2796. MOVAPD XMM1, XMM5 ;
  2797. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  2798. ADDPD XMM5, XMM1 ;
  2799. MOVAPD XMM1, XMM6 ;
  2800. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  2801. ADDPD XMM6, XMM1 ;
  2802. singlepieces: ;
  2803. CMP EAX, 0 ;
  2804. JLE store ; len <= 0- > exit
  2805. MOV ESI, [EBP+radr] ;
  2806. MOVSD XMM7, [EBX] ;
  2807. MOVSD XMM0, [ESI+EDI] ;
  2808. ADD ESI, ECX ;
  2809. MOVSD XMM1, [ESI+EDI] ;
  2810. MULSD XMM0, XMM7 ;
  2811. ADDSD XMM2, XMM0 ;
  2812. ADD ESI, ECX ;
  2813. MOVSD XMM0, [ESI+EDI] ;
  2814. MULSD XMM1, XMM7 ;
  2815. ADDSD XMM3, XMM1 ;
  2816. ADD ESI, ECX ;
  2817. MOVSD XMM1, [ESI+EDI] ;
  2818. MULSD XMM0, XMM7 ;
  2819. ADDSD XMM4, XMM0 ;
  2820. ADD ESI, ECX ;
  2825. MOVSD XMM0, [ESI+EDI] ;
  2826. MULSD XMM1, XMM7 ;
  2827. ADDSD XMM5, XMM1 ;
  2828. MULSD XMM0, XMM7 ;
  2829. ADDSD XMM6, XMM0 ;
  2830. ADD EBX, 8 (* INC(ladr, 8); LONGREAL elements *)
  2831. ADD EDI, 8 (* INC(radr, 8) *)
  2832. DEC EAX ; DEC(len)
  2833. JMP singlepieces ;
  2834. store:
  2835. MOVSD [EDX], XMM2 ;
  2836. ADD EDX, [EBP+incd] ;
  2837. MOVSD [EDX], XMM3 ;
  2838. ADD EDX, [EBP+incd] ;
  2839. MOVSD [EDX], XMM4 ;
  2840. ADD EDX, [EBP+incd] ;
  2841. MOVSD [EDX], XMM5 ;
  2842. ADD EDX, [EBP+incd] ;
  2843. MOVSD [EDX], XMM6 ;
  2844. end:
  2845. END AlignedSPXSSE5;
  2846. *)
  2847. (* sse version of scalar product *)
  2848. PROCEDURE AlignedSPXSSE( ladr, radr, dadr: ADDRESS; len: SIZE;
  2849. add: BOOLEAN );
  2850. CODE {SYSTEM.i386, SYSTEM.SSE2}
  2851. ; register initialization
  2852. MOV EAX, [EBP+len] ; EAX reserved FOR length
  2853. MOV EBX, [EBP+ladr] ; EBX reserved FOR ladr
  2854. MOV ECX, [EBP+radr] ; ECX reserved FOR radr
  2855. MOV EDX, [EBP+dadr] ; EDX reserved FOR dadr
  2856. XORPD XMM0, XMM0 ;
  2857. CMP [EBP+add], 0 ; add?
  2858. JE aligned8 ; no add
  2859. MOVSD XMM0, [EDX] ;
  2860. aligned8:
  2861. CMP EAX, 8 ;
  2862. JL aligned2 ; len < 8 -> aligned2
  2863. MOVAPD XMM1, [EBX] ;
  2864. MOVAPD XMM2, [EBX+16] ;
  2865. MOVAPD XMM3, [EBX+32] ;
  2866. MOVAPD XMM4, [ECX] ;
  2867. MOVAPD XMM5, [ECX+16] ;
  2868. MOVAPD XMM6, [ECX+32] ;
  2869. MULPD XMM1, XMM4 ;
  2870. ADDPD XMM0, XMM1 ;
  2871. MULPD XMM2, XMM5 ;
  2872. ADDPD XMM0, XMM2 ;
  2873. MULPD XMM3, XMM6 ;
  2874. ADDPD XMM0, XMM3 ;
  2875. MOVAPD XMM7, [EBX+48] ;
  2876. MOVAPD XMM1, [ECX+48] ;
  2877. MULPD XMM1, XMM7 ;
  2878. ADDPD XMM0, XMM1 ;
  2879. ADD EBX, 64 ;
  2880. ADD ECX, 64 ;
  2881. SUB EAX, 8 ;
  2882. JMP aligned8 ;
  2883. ; LOOP FOR 4 pieces aligned
  2884. aligned4:
  2885. CMP EAX, 4 ;
  2886. JL aligned2 ; len < 4 -> aligned2
  2887. MOVAPD XMM1, [EBX] ;
  2888. MOVAPD XMM2, [ECX] ;
  2889. MOVAPD XMM3, [EBX+16] ;
  2890. MOVAPD XMM4, [ECX+16] ;
  2891. MULPD XMM1, XMM2 ;
  2892. ADDPD XMM0, XMM1 ;
  2893. MULPD XMM3, XMM4 ;
  2894. ADDPD XMM0, XMM3 ;
  2895. ADD EBX, 32 ;
  2896. ADD ECX, 32 ;
  2897. SUB EAX, 4 ;
  2898. JMP aligned4 ;
  2899. aligned2:
  2900. CMP EAX, 2 ;
  2901. JL horizontaladd ; len < 2 -> horizontal add
  2902. MOVAPD XMM1, [EBX] ;
  2903. MOVAPD XMM2, [ECX] ;
  2904. MULPD XMM1, XMM2 ;
  2905. ADDPD XMM0, XMM1 ;
  2906. ADD EBX, 16 ;
  2907. ADD ECX, 16 ;
  2908. SUB EAX, 2 ;
  2909. JMP aligned2 ;
  2910. horizontaladd: ;
  2911. MOVAPD XMM1, XMM0 ;
  2912. SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
  2913. ADDPD XMM0, XMM1 ;
  2914. singlepieces: ;
  2915. CMP EAX, 0 ;
  2916. JLE store ; len <= 0- > EXIT
  2917. MOVSD XMM1, [EBX]
  2918. MOVSD XMM2, [ECX]
  2919. MULSD XMM1, XMM2
  2920. ADDSD XMM0, XMM1
  2921. ADD EBX, 8 ; INC(ladr, incl)
  2922. ADD ECX, 8 ; INC(radr, incr)
  2923. DEC EAX ; DEC(len)
  2924. JMP singlepieces ;
  2925. store:
  2926. MOVSD [EDX], XMM0 ;
  2927. endL:
  2928. END AlignedSPXSSE;
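  (* Reference sketch (illustration only, kept as a comment): the SSE2 code above computes the
     scalar product of two contiguous LONGREAL vectors (16-byte aligned, as the MOVAPD loads
     require), optionally accumulating onto the previous contents of the LONGREAL at dadr when
     add is TRUE, unrolled by 8 and 2 elements plus a scalar tail. *)
  (*
  PROCEDURE AlignedSPXRef( ladr, radr, dadr: ADDRESS; len: SIZE; add: BOOLEAN );
  VAR sum, x, y: LONGREAL; i: SIZE;
  BEGIN
  	IF add THEN SYSTEM.GET( dadr, sum ) ELSE sum := 0 END;
  	FOR i := 0 TO len - 1 DO
  		SYSTEM.GET( ladr + i * 8, x ); SYSTEM.GET( radr + i * 8, y );
  		sum := sum + x * y
  	END;
  	SYSTEM.PUT( dadr, sum )
  END AlignedSPXRef;
  *)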
  2929. (*
  2930. PROCEDURE AlignedSPRSSE5( ladr, radr, dadr, stride, incd, len: LONGINT );
  2931. CODE {SYSTEM.i386, SYSTEM.SSE}
  2932. ; register initialization
  2933. MOV EDX, [EBP+dadr] ; EDX reserved for dadr
  2934. MOV EBX, [EBP+ladr] ; EBX reserved for ladr
  2935. MOV ESI, [EBP+radr] ; ESI reserved for radr
  2936. MOV EAX, [EBP+len] ; EAX reserved for length
  2937. MOV ECX, [EBP+stride] ;
  2938. XORPS XMM2, XMM2 ;
  2939. XORPS XMM3, XMM3 ;
  2940. XORPS XMM4, XMM4 ;
  2941. XORPS XMM5, XMM5 ;
  2942. XORPS XMM6, XMM6 ;
  2943. XOR EDI, EDI ;
  2944. aligned8:
  2945. CMP EAX, 8 ;
  2946. JL aligned4 ; len < 8 -> aligned4
  2947. PREFETCH0 24[EBX] ;
  2948. ; PREFETCH0[ESI] ;
  2949. MOV ESI, [EBP+radr] ;
  2950. ADD ESI, EDI ;
  2951. MOVAPS XMM7, [EBX] ;
  2952. MOVAPS XMM0, [ESI] ;
  2953. ADD ESI, ECX ;
  2954. MOVAPS XMM1, [ESI] ;
  2955. MULPS XMM0, XMM7 ;
  2956. ADDPS XMM2, XMM0 ;
  2957. ADD ESI, ECX ;
  2958. MOVAPS XMM0, [ESI] ;
  2959. MULPS XMM1, XMM7 ;
  2960. ADDPS XMM3, XMM1 ;
  2961. ADD ESI, ECX ;
  2962. MOVAPS XMM1, [ESI] ;
  2963. MULPS XMM0, XMM7 ;
  2964. ADDPS XMM4, XMM0 ;
  2965. ADD ESI, ECX ;
  2966. MOVAPS XMM0, [ESI] ;
  2967. MULPS XMM1, XMM7 ;
  2968. ADDPS XMM5, XMM1 ;
  2969. MULPS XMM0, XMM7 ;
  2970. ADDPS XMM6, XMM0 ;
  2971. ADD EBX, 16 ;
  2972. ADD EDI, 16 ;
  2973. MOV ESI, [EBP+radr] ;
  2974. ADD ESI, EDI ;
  2975. MOVAPS XMM7, [EBX] ;
  2976. MOVAPS XMM0, [ESI] ;
  2977. ADD ESI, ECX ;
  2978. MOVAPS XMM1, [ESI] ;
  2979. MULPS XMM0, XMM7 ;
  2980. ADDPS XMM2, XMM0 ;
  2981. ADD ESI, ECX ;
  2982. MOVAPS XMM0, [ESI] ;
  2983. MULPS XMM1, XMM7 ;
  2984. ADDPS XMM3, XMM1 ;
  2985. ADD ESI, ECX ;
  2986. MOVAPS XMM1, [ESI] ;
  2987. MULPS XMM0, XMM7 ;
  2988. ADDPS XMM4, XMM0 ;
  2989. ADD ESI, ECX ;
  2990. MOVAPS XMM0, [ESI] ;
  2991. MULPS XMM1, XMM7 ;
  2992. ADDPS XMM5, XMM1 ;
  2993. MULPS XMM0, XMM7 ;
  2994. ADDPS XMM6, XMM0 ;
  2995. ADD EBX, 16 ;
  2996. ADD EDI, 16 ;
  2997. SUB EAX, 8 ;
  2998. JMP aligned8 ;
  2999. aligned4:
  3000. CMP EAX, 4 ;
  3001. JL horizontaladd ; len < 4 -> horizontal add
  3002. MOV ESI, [EBP+radr] ;
  3003. ADD ESI, EDI ;
  3004. MOVAPS XMM7, [EBX] ;
  3005. MOVAPS XMM0, [ESI] ;
  3006. ADD ESI, ECX ;
  3007. MOVAPS XMM1, [ESI] ;
  3008. MULPS XMM0, XMM7 ;
  3009. ADDPS XMM2, XMM0 ;
  3010. ADD ESI, ECX ;
  3011. MOVAPS XMM0, [ESI] ;
  3012. MULPS XMM1, XMM7 ;
  3013. ADDPS XMM3, XMM1 ;
  3014. ADD ESI, ECX ;
  3015. MOVAPS XMM1, [ESI] ;
  3016. MULPS XMM0, XMM7 ;
  3017. ADDPS XMM4, XMM0 ;
  3018. ADD ESI, ECX ;
  3019. MOVAPS XMM0, [ESI] ;
  3020. MULPS XMM1, XMM7 ;
  3021. ADDPS XMM5, XMM1 ;
  3022. MULPS XMM0, XMM7 ;
  3023. ADDPS XMM6, XMM0 ;
  3024. ADD EBX, 16 ;
  3025. ADD EDI, 16 ;
  3026. SUB EAX, 4 ;
  3027. JMP aligned4 ;
  3028. horizontaladd: ;
  3029. MOVLHPS XMM1, XMM2 ;
  3030. ADDPS XMM1, XMM2 ;
  3031. SHUFPS XMM2, XMM1, 48 ;
  3032. ADDPS XMM2, XMM1 ;
  3033. MOVHLPS XMM2, XMM2 ;
  3034. MOVLHPS XMM1, XMM3 ;
  3035. ADDPS XMM1, XMM3 ;
  3036. SHUFPS XMM3, XMM1, 48 ;
  3037. ADDPS XMM3, XMM1 ;
  3038. MOVHLPS XMM3, XMM3 ;
  3039. MOVLHPS XMM1, XMM4 ;
  3040. ADDPS XMM1, XMM4 ;
  3041. SHUFPS XMM4, XMM1, 48 ;
  3042. ADDPS XMM4, XMM1 ;
  3043. MOVHLPS XMM4, XMM4 ;
  3044. MOVLHPS XMM1, XMM5 ;
  3045. ADDPS XMM1, XMM5 ;
  3046. SHUFPS XMM5, XMM1, 48 ;
  3047. ADDPS XMM5, XMM1 ;
  3048. MOVHLPS XMM5, XMM5 ;
  3049. MOVLHPS XMM1, XMM6 ;
  3050. ADDPS XMM1, XMM6 ;
  3051. SHUFPS XMM6, XMM1, 48 ;
  3052. ADDPS XMM6, XMM1 ;
  3053. MOVHLPS XMM6, XMM6 ;
  3054. singlepieces: ;
  3055. CMP EAX, 0 ;
  3056. JLE store ; len <= 0- > exit
  3057. MOV ESI, [EBP+radr] ;
  3058. MOVSS XMM7, [EBX] ;
  3059. MOVSS XMM0, [ESI+EDI] ;
  3060. ADD ESI, ECX ;
  3061. MOVSS XMM1, [ESI+EDI] ;
  3062. MULSS XMM0, XMM7 ;
  3063. ADDSS XMM2, XMM0 ;
  3064. ADD ESI, ECX ;
  3065. MOVSS XMM0, [ESI+EDI] ;
  3066. MULSS XMM1, XMM7 ;
  3067. ADDSS XMM3, XMM1 ;
  3068. ADD ESI, ECX ;
  3069. MOVSS XMM1, [ESI+EDI] ;
  3070. MULSS XMM0, XMM7 ;
  3071. ADDSS XMM4, XMM0 ;
  3072. ADD ESI, ECX ;
  3073. MOVSS XMM0, [ESI+EDI] ;
  3074. MULSS XMM1, XMM7 ;
  3075. ADDSS XMM5, XMM1 ;
  3076. MULSS XMM0, XMM7 ;
  3077. ADDSS XMM6, XMM0 ;
  3078. ADD EBX, 4 (* INC(ladr,incl) *)
  3079. ADD EDI, 4 (* INC(radr,incr) *)
  3080. DEC EAX ; DEC(len)
  3081. JMP singlepieces ;
  3082. store:
  3083. MOVSS [EDX], XMM2 ;
  3084. ADD EDX, [EBP+incd] ;
  3085. MOVSS [EDX], XMM3 ;
  3086. ADD EDX, [EBP+incd] ;
  3087. MOVSS [EDX], XMM4 ;
  3088. ADD EDX, [EBP+incd] ;
  3089. MOVSS [EDX], XMM5 ;
  3090. ADD EDX, [EBP+incd] ;
  3091. MOVSS [EDX], XMM6 ;
  3092. end:
  3093. END AlignedSPRSSE5;
  3094. *)
  3095. PROCEDURE AlignedSPRSSE( ladr, radr, dadr: ADDRESS; len: SIZE;
  3096. add: BOOLEAN );
  3097. CODE {SYSTEM.i386, SYSTEM.SSE}
  3098. ; register initialization
  3099. MOV EDX, [EBP+dadr] ; EDX reserved FOR dadr
  3100. MOV EBX, [EBP+ladr] ; EBX reserved FOR ladr
  3101. MOV ECX, [EBP+radr] ; ECX reserved FOR radr
  3102. MOV EAX, [EBP+len] ; EAX reserved FOR length
  3103. XORPS XMM0, XMM0 ;
  3104. CMP [EBP+add], 0 ; add?
  3105. JE aligned16 ; no add
  3106. MOVSS XMM0, [EDX] ;
  3107. aligned16:
  3108. CMP EAX, 16 ;
  3109. JL aligned8 ; len < 16 -> aligned8
  3110. MOVAPS XMM1, [EBX] ;
  3111. MOVAPS XMM4, [ECX] ;
  3112. MOVAPS XMM2, [EBX+16] ;
  3113. MOVAPS XMM5, [ECX+16] ;
  3114. MULPS XMM1, XMM4 ;
  3115. ADDPS XMM0, XMM1 ;
  3116. MOVAPS XMM3, [EBX+32] ;
  3117. MOVAPS XMM6, [ECX+32] ;
  3118. MULPS XMM2, XMM5 ;
  3119. ADDPS XMM0, XMM2 ;
  3120. MOVAPS XMM7, [EBX+48] ;
  3121. MOVAPS XMM1, [ECX+48] ;
  3122. MULPS XMM3, XMM6 ;
  3123. ADDPS XMM0, XMM3 ;
  3124. MULPS XMM1, XMM7 ;
  3125. ADDPS XMM0, XMM1 ;
  3126. ADD EBX, 64 ;
  3127. ADD ECX, 64 ;
  3128. SUB EAX, 16 ;
  3129. JMP aligned16 ;
  3130. ; LOOP FOR 8 pieces aligned
  3131. aligned8:
  3132. CMP EAX, 8 ;
  3133. JL aligned4 ; len < 8 -> aligned4
  3134. MOVAPS XMM1, [EBX] ;
  3135. MOVAPS XMM4, [ECX] ;
  3136. MOVAPS XMM2, [EBX+16] ;
  3137. MOVAPS XMM5, [ECX+16] ;
  3138. MULPS XMM1, XMM4 ;
  3139. ADDPS XMM0, XMM1 ;
  3140. MULPS XMM2, XMM5 ;
  3141. ADDPS XMM0, XMM2 ;
  3142. ADD EBX, 32 ;
  3143. ADD ECX, 32 ;
  3144. SUB EAX, 8 ;
  3145. JMP aligned8 ;
  3146. aligned4:
  3147. CMP EAX, 4 ;
  3148. JL horizontaladd ; len < 4 -> horizontal add
  3149. MOVAPS XMM1, [EBX] ;
  3150. MOVAPS XMM2, [ECX] ;
  3151. MULPS XMM1, XMM2 ;
  3152. ADDPS XMM0, XMM1 ;
  3153. ADD EBX, 16 ;
  3154. ADD ECX, 16 ;
  3155. SUB EAX, 4 ;
  3156. JMP aligned4 ;
  3157. horizontaladd: ;
  3158. MOVAPS XMM1, XMM0 ;
  3159. ; 1*0 (* dest 0 -> dest 0 *) + 4*1 (* dest 1 -> dest 1 *) +16*0 (* src 0 -> dest 2 *) +64*1 (* src 1 -> dest 3 *) ;
  3160. SHUFPS XMM1, XMM1, 1*0 +4*1 +16*0 +64*1
  3161. ADDPS XMM1, XMM0 ;
  3162. MOVAPS XMM0, XMM1
  3163. SHUFPS XMM0, XMM0, 16*3 ; src 3 -> dest 2
  3164. ADDPS XMM0, XMM1 ;
  3165. SHUFPS XMM0, XMM0, 1*2 ; dest 2 -> dest 0
  3166. singlepieces: ;
  3167. CMP EAX, 0 ;
  3168. JLE store ; len <= 0- > EXIT
  3169. MOVSS XMM1, [EBX]
  3170. MOVSS XMM2, [ECX]
  3171. MULSS XMM1, XMM2
  3172. ADDSS XMM0, XMM1
  3173. ADD EBX, 4 ; INC(ladr, incl)
  3174. ADD ECX, 4 ; INC(radr, incr)
  3175. DEC EAX ; DEC(len)
  3176. JMP singlepieces ;
  3177. store:
  3178. MOVSS [EDX], XMM0 ;
  3179. endL:
  3180. END AlignedSPRSSE;
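  (* Note on the horizontaladd sequence above (explanatory comment): a SHUFPS immediate encodes
     four 2-bit lane selectors, mask = d0 + 4*d1 + 16*s2 + 64*s3, with lane 0 being the lowest
     32 bits.  With XMM0 = (x0, x1, x2, x3), the first shuffle uses 1*0 + 4*1 + 16*0 + 64*1 = 44H
     and yields (x0, x1, x0, x1); after the ADDPS the register holds (2*x0, 2*x1, x0+x2, x1+x3).
     The second shuffle, 16*3 = 30H, copies lane 3 into lane 2, so the following ADDPS leaves
     x0+x1+x2+x3 in lane 2, and the final shuffle 1*2 = 02H moves that total into lane 0, from
     where MOVSS stores it. *)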
  3181. (*
  3182. (* sse version of scalar product *)
  3183. PROCEDURE AlignedSPRSSE( ladr, radr, dadr, rows, stride, dinc, len: LONGINT );
  3184. CODE {SYSTEM.i386, SYSTEM.SSE2}
  3185. ; register initialization
  3186. MOV EDI, [EBP+radr] ; radr start
  3187. MOV EDX, [EBP+dadr] ; EDX reserved for dadr
  3188. MOV ESI, [EBP+rows] ; outer loop counter
  3189. outerloop:
  3190. CMP ESI, 0 ;
  3191. JLE end ;
  3192. MOV EBX, [EBP+ladr] ; EBX reserved for ladr
  3193. MOV ECX, EDI ; ECX reserved for radr
  3194. MOV EAX, [EBP+len] ; EAX reserved for length
  3195. XORPS XMM0, XMM0 ;
  3196. aligned16:
  3197. CMP EAX, 16 ;
  3198. JL aligned8 ; len < 16 -> aligned8
  3199. MOVAPS XMM1, [EBX] ;
  3200. MOVAPS XMM2, [EBX+16] ;
  3201. MOVAPS XMM3, [EBX+32] ;
  3202. MOVAPS XMM4, [ECX] ;
  3203. MOVAPS XMM5, [ECX+16] ;
  3204. MOVAPS XMM6, [ECX+32] ;
  3205. MULPS XMM1, XMM4 ;
  3206. ADDPS XMM0, XMM1 ;
  3207. MULPS XMM2, XMM5 ;
  3208. ADDPS XMM0, XMM2 ;
  3209. MULPS XMM3, XMM6 ;
  3210. ADDPS XMM0, XMM3 ;
  3211. MOVAPS XMM7, [EBX+48] ;
  3212. MOVAPS XMM1, [ECX+48] ;
  3213. MULPS XMM1, XMM7 ;
  3214. ADDPS XMM0, XMM1 ;
  3215. ADD EBX, 64 ;
  3216. ADD ECX, 64 ;
  3217. SUB EAX, 16 ;
  3218. JMP aligned16 ;
  3219. ; loop for 8 pieces aligned
  3220. aligned8:
  3221. CMP EAX, 8 ;
  3222. JL aligned4 ; len < 8 -> aligned4
  3223. MOVAPS XMM1, [EBX] ;
  3224. MOVAPS XMM2, [EBX+16] ;
  3225. MOVAPS XMM4, [ECX] ;
  3226. MOVAPS XMM5, [ECX+16] ;
  3227. MULPS XMM1, XMM4 ;
  3228. ADDPS XMM0, XMM1 ;
  3229. MULPS XMM2, XMM5 ;
  3230. ADDPS XMM0, XMM2 ;
  3231. ADD EBX, 32 ;
  3232. ADD ECX, 32 ;
  3233. SUB EAX, 8 ;
  3234. JMP aligned8 ;
  3235. aligned4:
  3236. CMP EAX, 4 ;
  3237. JL horizontaladd ; len < 4 -> horizontal add
  3238. MOVAPS XMM1, [EBX] ;
  3239. MOVAPS XMM2, [ECX] ;
  3240. MULPS XMM1, XMM2 ;
  3241. ADDPS XMM0, XMM1 ;
  3242. ADD EBX, 16 ;
  3243. ADD ECX, 16 ;
  3244. SUB EAX, 4 ;
  3245. JMP aligned4 ;
  3246. horizontaladd: ;
  3247. MOVAPS XMM1, XMM0 ;
  3248. SHUFPS XMM1, XMM1, 1*0 (* dest 0 -> dest 0 *) +4*1 (* dest 1 -> dest 1 *) +16*0 (* src 0 -> dest 2 *) +64*1 (* src 1 -> dest 3 *) ;
  3249. ADDPS XMM1, XMM0 ;
  3250. MOVAPS XMM0, XMM1
  3251. SHUFPS XMM0, XMM0, 16*3 ; (* src 3-> dest 2 *)
  3252. ADDPS XMM0, XMM1 ;
  3253. SHUFPS XMM0, XMM0, 1*2 ; (* dest 2 -> dest 0 *)
  3254. singlepieces: ;
  3255. CMP EAX, 0 ;
  3256. JLE store ; len <= 0- > exit
  3257. MOVSS XMM1, [EBX]
  3258. MOVSS XMM2, [ECX]
  3259. MULSS XMM1, XMM2
  3260. ADDSS XMM0, XMM1
  3261. ADD EBX, 4 (* INC(ladr,incl) *)
  3262. ADD ECX, 4 (* INC(radr,incr) *)
  3263. DEC EAX ; DEC(len)
  3264. JMP singlepieces ;
  3265. store:
  3266. MOVSS [EDX], XMM0 ;
  3267. ADD EDX, [EBP+dinc] ;
  3268. ADD EDI, [EBP+stride] ;
  3269. DEC ESI ;
  3270. JMP outerloop ;
  3271. end:
  3272. END AlignedSPRSSE;
  3273. *)
  3274. PROCEDURE Copy4( ladr, dadr: ADDRESS; linc, dinc, len: SIZE);
  3275. CODE {SYSTEM.i386}
  3276. MOV ESI, [EBP+ladr] ; ESI := ladr
  3277. MOV EDI, [EBP+dadr] ; EDI := dadr
  3278. MOV ECX, [EBP+len] ; ECX := len
  3279. MOV EAX, [EBP+linc] ;
  3280. CMP EAX, 4 ;
  3281. JNE loopL ;
  3282. MOV EAX, [EBP+dinc] ;
  3283. CMP EAX, 4 ;
  3284. JNE loopL ;
  3285. fastmove:
  3286. CLD ; incremental
  3287. REP ;
  3288. MOVSD ; copy len doublewords at once
  3289. JMP endL ;
  3290. loopL:
  3291. CMP ECX, 0 ;
  3292. JLE endL ; WHILE ECX > 0 DO
  3293. MOV EAX, [ESI] ; EAX := SYSTEM.GET32(ESI)
  3294. MOV [EDI], EAX ; SYSTEM.PUT32(EDI, EAX)
  3295. ADD ESI, [EBP+linc] ; INC(ESI, linc)
  3296. ADD EDI, [EBP+dinc] ; INC(EDI, dinc)
  3297. DEC ECX ; DEC(ECX)
  3298. JMP loopL
  3299. endL:
  3300. END Copy4;
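  (* Reference sketch (illustration only, kept as a comment): Copy4 copies len 32-bit elements
     from ladr to dadr, advancing by the byte increments linc and dinc; when both increments are 4
     the data is contiguous and the REP MOVSD fast path above copies all len doublewords at once.
     The helper name below is assumed for illustration. *)
  (*
  PROCEDURE Copy4Ref( ladr, dadr: ADDRESS; linc, dinc, len: SIZE );
  VAR v: REAL; (* any 32-bit element *)
  BEGIN
  	WHILE len > 0 DO
  		SYSTEM.GET( ladr, v ); SYSTEM.PUT( dadr, v );
  		INC( ladr, linc ); INC( dadr, dinc ); DEC( len )
  	END
  END Copy4Ref;
  *)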
  3301. PROCEDURE Copy8( ladr, dadr: ADDRESS; linc, dinc, len: SIZE );
  3302. CODE {SYSTEM.i386}
  3303. MOV ESI, [EBP+ladr] ; ESI := ladr
  3304. MOV EDI, [EBP+dadr] ; EDI := dadr
  3305. MOV ECX, [EBP+len] ; ECX := len
  3306. MOV EAX, [EBP+linc] ;
  3307. CMP EAX, 8 ;
  3308. JNE loopL ;
  3309. MOV EAX, [EBP+dinc] ;
  3310. CMP EAX, 8 ;
  3311. JNE loopL ;
  3312. fastmove:
  3313. SHL ECX, 1 ;
  3314. CLD ; incremental
  3315. REP ;
  3316. MOVSD ; copy 2*len doublewords at once
  3317. JMP endL ;
  3318. loopL:
  3319. CMP ECX, 0 ;
  3320. JLE endL ; WHILE ECX > 0 DO
  3321. MOV EAX, [ESI] ; EAX := SYSTEM.GET32(ESI)
  3322. MOV [EDI], EAX ; SYSTEM.PUT32(EDI, EAX)
  3323. MOV EAX, [ESI+4] ; EAX := SYSTEM.GET32(ESI+4)
  3324. MOV [EDI+4], EAX ; SYSTEM.PUT32(EDI+4, EAX)
  3325. ADD ESI, [EBP+linc] ; INC(ESI, linc)
  3326. ADD EDI, [EBP+dinc] ; INC(EDI, dinc)
  3327. DEC ECX ; DEC(ECX)
  3328. JMP loopL
  3329. endL:
  3330. END Copy8;
  3331. PROCEDURE Transpose4A( ladr, dadr: ADDRESS; lstride, linc, dstride, dinc, rows, cols: SIZE );
  3332. CODE {SYSTEM.i386}
  3333. startrows:
  3334. MOV EAX, [EBP+rows] ;
  3335. startouter:
  3336. CMP EAX, 0 ;
  3337. JLE endL ;
  3338. MOV ESI, [EBP+ladr] ;
  3339. MOV EDI, [EBP+dadr] ;
  3340. MOV EBX, [EBP+linc] ;
  3341. MOV ECX, [EBP+dstride] ;
  3342. MOV EAX, [EBP+cols] ;
  3343. startinner:
  3344. CMP EAX, 0 ;
  3345. JLE endinner ;
  3346. MOV EDX, [ESI] ;
  3347. MOV [EDI], EDX ;
  3348. ADD ESI, EBX ;
  3349. ADD EDI, ECX ;
  3350. DEC EAX ;
  3351. JMP startinner ;
  3352. endinner:
  3353. MOV ESI, [EBP+ladr] ;
  3354. ADD ESI, [EBP+lstride] ;
  3355. MOV [EBP+ladr], ESI
  3356. MOV EDI, [EBP+dadr] ;
  3357. ADD EDI, [EBP+dinc] ;
  3358. MOV [EBP+dadr], EDI ;
  3359. MOV EAX, [EBP+rows] ;
  3360. DEC EAX ;
  3361. MOV [EBP+rows], EAX ;
  3362. JMP startouter ;
  3363. endL:
  3364. END Transpose4A;
  3365. PROCEDURE Transpose4( ladr, dadr: ADDRESS; lstride, linc, dstride, dinc, rows, cols: SIZE );
  3366. VAR l, d, c: SIZE; BlockSize: SIZE;
  3367. BEGIN
  3368. BlockSize :=
  3369. MIN( L2BlockSize DIV lstride, L2BlockSize DIV dstride );
  3370. BlockSize := MIN( BlockSize, L2BlockSize DIV linc );
  3371. BlockSize := MIN( BlockSize, L2BlockSize DIV dinc );
  3372. BlockSize := MAX( 8, BlockSize );
  3373. (* KernelLog.String("Blocksize = "); KernelLog.Int(BlockSize,10); KernelLog.Ln; *)
  3374. WHILE (rows >= BlockSize) DO
  3375. c := cols; l := ladr; d := dadr;
  3376. WHILE (c >= BlockSize) DO
  3377. Transpose4A( l, d, lstride, linc, dstride, dinc, BlockSize,
  3378. BlockSize );
  3379. DEC( c, BlockSize ); INC( l, BlockSize * linc );
  3380. INC( d, BlockSize * dstride );
  3381. END;
  3382. IF c > 0 THEN
  3383. Transpose4A( l, d, lstride, linc, dstride, dinc, BlockSize, c );
  3384. END;
  3385. DEC( rows, BlockSize ); INC( ladr, BlockSize * lstride );
  3386. INC( dadr, BlockSize * dinc );
  3387. END;
  3388. IF (rows > 0) THEN
  3389. c := cols; l := ladr; d := dadr;
  3390. WHILE (c >= BlockSize) DO
  3391. Transpose4A( l, d, lstride, linc, dstride, dinc, rows,
  3392. BlockSize );
  3393. DEC( c, BlockSize ); INC( l, BlockSize * linc );
  3394. INC( d, BlockSize * dstride );
  3395. END;
  3396. IF c > 0 THEN
  3397. Transpose4A( l, d, lstride, linc, dstride, dinc, rows, c );
  3398. END;
  3399. END;
  3400. END Transpose4;
  3401. PROCEDURE Transpose8( ladr, dadr: ADDRESS; lstride, linc, dstride, dinc, rows, cols: SIZE );
  3402. VAR l, d, c: SIZE; BlockSize: SIZE;
  3403. BEGIN
  3404. BlockSize :=
  3405. MIN( L2BlockSize DIV lstride, L2BlockSize DIV dstride );
  3406. BlockSize := MIN( BlockSize, L2BlockSize DIV linc );
  3407. BlockSize := MIN( BlockSize, L2BlockSize DIV dinc );
  3408. BlockSize := MAX( 8, BlockSize );
  3409. (* KernelLog.String("Blocksize = "); KernelLog.Int(BlockSize,10); KernelLog.Ln; *)
  3410. WHILE (rows >= BlockSize) DO
  3411. c := cols; l := ladr; d := dadr;
  3412. WHILE (c >= BlockSize) DO
  3413. Transpose8A( l, d, lstride, linc, dstride, dinc, BlockSize,
  3414. BlockSize );
  3415. DEC( c, BlockSize ); INC( l, BlockSize * linc );
  3416. INC( d, BlockSize * dstride );
  3417. END;
  3418. IF c > 0 THEN
  3419. Transpose8A( l, d, lstride, linc, dstride, dinc, BlockSize, c );
  3420. END;
  3421. DEC( rows, BlockSize ); INC( ladr, lstride * BlockSize );
  3422. INC( dadr, dinc * BlockSize );
  3423. END;
  3424. IF (rows > 0) THEN
  3425. c := cols; l := ladr; d := dadr;
  3426. WHILE (c >= BlockSize) DO
  3427. Transpose8A( l, d, lstride, linc, dstride, dinc, rows,
  3428. BlockSize );
  3429. DEC( c, BlockSize ); INC( l, BlockSize * linc );
  3430. INC( d, BlockSize * dstride );
  3431. END;
  3432. IF c > 0 THEN
  3433. Transpose8A( l, d, lstride, linc, dstride, dinc, rows, c );
  3434. END;
  3435. END;
  3436. END Transpose8;
  3437. PROCEDURE Transpose8A( ladr, dadr: ADDRESS; lstride, linc, dstride, dinc, rows, cols: SIZE );
  3438. CODE {SYSTEM.i386}
  3439. startrows:
  3440. MOV EAX, [EBP+rows] ;
  3441. startouter:
  3442. CMP EAX, 0 ;
  3443. JLE endL ;
  3444. MOV ESI, [EBP+ladr] ;
  3445. MOV EDI, [EBP+dadr] ;
  3446. MOV EBX, [EBP+linc] ;
  3447. MOV ECX, [EBP+dstride] ;
  3448. MOV EAX, [EBP+cols] ;
  3449. startinner:
  3450. CMP EAX, 0 ;
  3451. JLE endinner ;
  3452. MOV EDX, [ESI] ;
  3453. MOV [EDI], EDX ;
  3454. MOV EDX, [ESI+4] ;
  3455. MOV [EDI+4], EDX ;
  3456. ADD ESI, EBX ;
  3457. ADD EDI, ECX ;
  3458. DEC EAX ;
  3459. JMP startinner ;
  3460. endinner:
  3461. MOV ESI, [EBP+ladr] ;
  3462. ADD ESI, [EBP+lstride] ;
  3463. MOV [EBP+ladr], ESI
  3464. MOV EDI, [EBP+dadr] ;
  3465. ADD EDI, [EBP+dinc] ;
  3466. MOV [EBP+dadr], EDI ;
  3467. MOV EAX, [EBP+rows] ;
  3468. DEC EAX ;
  3469. MOV [EBP+rows], EAX ;
  3470. JMP startouter ;
  3471. endL:
  3472. END Transpose8A;
  3473. PROCEDURE SSEMul24BlockR( VAR CbFirst: SIZE;
  3474. StrideA, StrideB, StrideC, Ca, Ra, Cb, Rb: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3475. add: BOOLEAN );
  3476. CODE {SYSTEM.i386, SYSTEM.SSE}
  3477. MatrixOfResultsSetup:
  3478. MOV ECX, 0 ; counter FOR rows IN A-Ra
  3479. RowOfResultsLoop:
  3480. MOV EBX, 0 ; counter FOR columns IN B-Cb
  3481. DotProductSetup:
  3482. MOV ESI, [EBP+matrixA] ; matrixA
  3483. MOV EDI, [EBP+matrixB] ; matrixB
  3484. LEA EDI, [EDI+EBX*4] ; current position IN matrixB
  3485. XORPS XMM2, XMM2
  3486. XORPS XMM3, XMM3
  3487. XORPS XMM4, XMM4
  3488. XORPS XMM5, XMM5
  3489. XORPS XMM6, XMM6
  3490. XORPS XMM7, XMM7
  3491. MOV EAX, 0 ;
  3492. MOV AL, [EBP+add] ;
  3493. CMP AL, 0 ; add?
  3494. JE DotProductLoop ;
  3495. MOV EAX, [EBP+matrixC] ; matrixC
  3496. LEA EAX, [EAX+EBX*4] ; adjust POINTER horizontally TO correct batch OF 24
  3497. MOVUPS XMM2, [EAX]
  3498. MOVUPS XMM3, [EAX+16]
  3499. MOVUPS XMM4, [EAX+32]
  3500. MOVUPS XMM5, [EAX+48]
  3501. MOVUPS XMM6, [EAX+64]
  3502. MOVUPS XMM7, [EAX+80]
  3503. MOV EAX, 0
  3504. DotProductLoop:
  3505. MOV EDX, [ESI+EAX*4]
  3506. SHL EDX, 1
  3507. CMP EDX, 0
  3508. JE SparseEntryEscape
  3509. MOVSS XMM0, [ESI+EAX*4]
  3510. SHUFPS XMM0, XMM0, 0H
  3511. MOVUPS XMM1, [EDI]
  3512. MULPS XMM1, XMM0
  3513. ADDPS XMM2, XMM1
  3514. MOVUPS XMM1, [EDI+16]
  3515. MULPS XMM1, XMM0
  3516. ADDPS XMM3, XMM1
  3517. MOVUPS XMM1, [EDI+32]
  3518. MULPS XMM1, XMM0
  3519. ADDPS XMM4, XMM1
  3520. MOVUPS XMM1, [EDI+48]
  3521. MULPS XMM1, XMM0
  3522. ADDPS XMM5, XMM1
  3523. MOVUPS XMM1, [EDI+64]
  3524. MULPS XMM1, XMM0
  3525. ADDPS XMM6, XMM1
  3526. MOVUPS XMM1, [EDI+80]
  3527. MULPS XMM1, XMM0
  3528. ADDPS XMM7, XMM1
  3529. SparseEntryEscape:
  3530. ADD EDI, [EBP+StrideB] ; StrideB
  3531. INC EAX
  3532. CMP EAX, [EBP+Ca] ; Ca, could also compare TO Rb since they must be equal
  3533. JL DotProductLoop
  3534. ; end DotProductLoop
  3535. MOV EAX, [EBP+matrixC] ; matrixC
  3536. LEA EAX, [EAX+EBX*4] ; adjust POINTER horizontally TO correct batch OF 24
  3537. MOVUPS [EAX], XMM2
  3538. MOVUPS [EAX+16], XMM3
  3539. MOVUPS [EAX+32], XMM4
  3540. MOVUPS [EAX+48], XMM5
  3541. MOVUPS [EAX+64], XMM6
  3542. MOVUPS [EAX+80], XMM7
  3543. ADD EBX, 24 ; move over TO next batch OF 24
  3544. MOV EDX, EBX
  3545. ADD EDX, 24
  3546. CMP EDX, [EBP+Cb] ; Cb, check TO see IF row IS complete
  3547. JLE DotProductSetup
  3548. ; endL RowOfResultsLoop
  3549. MOV EAX, [EBP+matrixA] ; matrixA
  3550. ADD EAX, [EBP+StrideA] ; StrideA
  3551. MOV [EBP+matrixA], EAX ; matrixA
  3552. MOV EAX, [EBP+matrixC] ; matrixC
  3553. ADD EAX, [EBP+StrideC] ; StrideC
  3554. MOV [EBP+matrixC], EAX ; matrixC
  3555. INC ECX
  3556. CMP ECX, [EBP+Ra] ; Ra
  3557. JL RowOfResultsLoop
  3558. Done:
  3559. MOV EAX, [EBP+CbFirst] ; CbFirst
  3560. MOV [EAX], EBX ;
  3561. END SSEMul24BlockR;
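  (* Note on the SparseEntryEscape test above (explanatory comment): the raw 32-bit pattern of the
     next element of A is loaded into EDX and shifted left by one, which discards the sign bit; if
     the remainder is zero the element is +0.0 or -0.0, so the whole 24-column multiply-add for
     that element can be skipped.  The LONGREAL variants below leave the corresponding test
     commented out, presumably because a single 32-bit load cannot decide whether an 8-byte
     LONGREAL is zero. *)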
  3562. (*! might be better to make a 10Block operation and utilize 2 registers for temporary calculations, see article about Emmerald *)
  3563. PROCEDURE SSEMul12BlockX( VAR CbFirst: SIZE;
  3564. StrideA, StrideB, StrideC, Ca, Ra, Cb, Rb: SIZE; matrixA, matrixB, matrixC :ADDRESS;
  3565. add: BOOLEAN );
  3566. CODE {SYSTEM.i386, SYSTEM.SSE2}
  3567. MatrixOfResultsSetup:
  3568. MOV ECX, 0 ; counter FOR rows IN A-Ra
  3569. RowOfResultsLoop:
  3570. MOV EBX, 0 ; counter FOR columns IN B-Cb
  3571. DotProductSetup:
  3572. MOV ESI, [EBP+matrixA] ; matrixA
  3573. MOV EDI, [EBP+matrixB] ; matrixB
  3574. LEA EDI, [EDI+EBX*8]
  3575. XORPD XMM2, XMM2
  3576. XORPD XMM3, XMM3
  3577. XORPD XMM4, XMM4
  3578. XORPD XMM5, XMM5
  3579. XORPD XMM6, XMM6
  3580. XORPD XMM7, XMM7
  3581. MOV EAX, 0 ;
  3582. MOV AL, [EBP+add] ;
  3583. CMP AL, 0 ; add?
  3584. JE DotProductLoop ;
  3585. MOV EAX, [EBP+matrixC] ; matrixC
  3586. LEA EAX, [EAX+EBX*8] ; adjust POINTER horizontally TO correct batch OF 12
  3587. MOVUPD XMM2, [EAX]
  3588. MOVUPD XMM3, [EAX+16]
  3589. MOVUPD XMM4, [EAX+32]
  3590. MOVUPD XMM5, [EAX+48]
  3591. MOVUPD XMM6, [EAX+64]
  3592. MOVUPD XMM7, [EAX+80]
  3593. MOV EAX, 0
  3594. DotProductLoop:
  3595. ; MOV EDX, [ESI+EAX*8]
  3596. ; SHL EDX, 1
  3597. ; CMP EDX, 0
  3598. ; JE SparseEntryEscape
  3599. MOVSD XMM0, [ESI+EAX*8]
  3600. SHUFPD XMM0, XMM0, 0H
  3601. MOVUPD XMM1, [EDI]
  3602. MULPD XMM1, XMM0
  3603. ADDPD XMM2, XMM1
  3604. MOVUPD XMM1, [EDI+16]
  3605. MULPD XMM1, XMM0
  3606. ADDPD XMM3, XMM1
  3607. MOVUPD XMM1, [EDI+32]
  3608. MULPD XMM1, XMM0
  3609. ADDPD XMM4, XMM1
  3610. MOVUPD XMM1, [EDI+48]
  3611. MULPD XMM1, XMM0
  3612. ADDPD XMM5, XMM1
  3613. MOVUPD XMM1, [EDI+64]
  3614. MULPD XMM1, XMM0
  3615. ADDPD XMM6, XMM1
  3616. MOVUPD XMM1, [EDI+80]
  3617. MULPD XMM1, XMM0
  3618. ADDPD XMM7, XMM1
  3619. SparseEntryEscape:
  3620. ADD EDI, [EBP+StrideB] ; StrideB
  3621. INC EAX
  3622. CMP EAX, [EBP+Ca] ; Ca, could also compare TO Rb since they must be equal
  3623. JL DotProductLoop ; end DotProductLoop
  3624. MOV EAX , [EBP+matrixC] ; matrixC
  3625. LEA EAX, [EAX+EBX*8] ; adjust POINTER horizontally TO correct batch OF 12
  3626. MOVUPD [EAX], XMM2
  3627. MOVUPD [EAX+16], XMM3
  3628. MOVUPD [EAX+32], XMM4
  3629. MOVUPD [EAX+48], XMM5
  3630. MOVUPD [EAX+64], XMM6
  3631. MOVUPD [EAX+80], XMM7
  3632. ADD EBX, 12 ; move over TO next batch OF 12
  3633. MOV EDX, EBX
  3634. ADD EDX, 12
  3635. CMP EDX, [EBP+Cb] ; Cb, check TO see IF row IS complete
  3636. JLE DotProductSetup ; end RowOfResultsLoop
  3637. MOV EAX , [EBP+matrixA] ; matrixA
  3638. ADD EAX, [EBP+StrideA] ; StrideA
  3639. MOV [EBP+matrixA], EAX ; matrixA
  3640. MOV EAX, [EBP+matrixC] ; matrixC
  3641. ADD EAX, [EBP+StrideC] ; StrideC
  3642. MOV [EBP+matrixC], EAX ; matrixC
  3643. INC ECX
  3644. CMP ECX, [EBP+Ra] ; Ra
  3645. JL RowOfResultsLoop
  3646. Done:
  3647. MOV EAX, [EBP+CbFirst] ; CbFirst
  3648. MOV [EAX], EBX ;
  3649. END SSEMul12BlockX;
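  (* The batch widths of these kernels follow from the register budget: six XMM accumulators
     (XMM2..XMM7) times 4 REALs per register give the 24-column REAL kernel above, and times 2
     LONGREALs per register give the 12-column LONGREAL kernel; the 16/8/4/2-column variants
     below simply use fewer accumulators for the remaining columns of C. *)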
  3650. PROCEDURE SSEMul16BlockR( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3651. add: BOOLEAN );
  3652. CODE {SYSTEM.i386, SYSTEM.SSE}
  3653. MOV ECX, 0 ; counter FOR rows IN A-Ra
  3654. DotProductSetup:
  3655. MOV ESI, [EBP+matrixA] ; matrixA
  3656. MOV EDI, [EBP+matrixB] ; matrixB
  3657. MOV EDX, [EBP+CbFrom] ; CbFrom
  3658. LEA EDI, [EDI+EDX*4]
  3659. XORPS XMM2, XMM2
  3660. XORPS XMM3, XMM3
  3661. XORPS XMM4, XMM4
  3662. XORPS XMM5, XMM5
  3663. MOV EAX, 0 ;
  3664. MOV AL, [EBP+add] ;
  3665. CMP AL, 0 ; add?
  3666. JE DotProductLoop ;
  3667. MOV EAX, [EBP+matrixC] ; matrixC
  3668. LEA EAX, [EAX+EDX*4] ; adjust POINTER horizontally
  3669. MOVUPS XMM2, [EAX]
  3670. MOVUPS XMM3, [EAX+16]
  3671. MOVUPS XMM4, [EAX+32]
  3672. MOVUPS XMM5, [EAX+48]
  3673. MOV EAX, 0
  3674. DotProductLoop:
  3675. MOV EDX, [ESI+EAX*4]
  3676. SHL EDX, 1
  3677. CMP EDX, 0
  3678. JE SparseEntryEscape
  3679. MOVSS XMM0, [ESI+EAX*4]
  3680. SHUFPS XMM0, XMM0, 0H
  3681. MOVUPS XMM1, [EDI]
  3682. MULPS XMM1, XMM0
  3683. ADDPS XMM2, XMM1
  3684. MOVUPS XMM1, [EDI+16]
  3685. MULPS XMM1, XMM0
  3686. ADDPS XMM3, XMM1
  3687. MOVUPS XMM1, [EDI+32]
  3688. MULPS XMM1, XMM0
  3689. ADDPS XMM4, XMM1
  3690. MOVUPS XMM1, [EDI+48]
  3691. MULPS XMM1, XMM0
  3692. ADDPS XMM5, XMM1
  3693. SparseEntryEscape:
  3694. ADD EDI, [EBP+StrideB] ; StrideB
  3695. INC EAX
  3696. CMP EAX, [EBP+Ca] ; Ca
  3697. JL DotProductLoop ; end DotProductLoop
  3698. MOV EAX , [EBP+matrixC] ; matrixC
  3699. MOV EDX, [EBP+CbFrom] ; CbFrom
  3700. LEA EAX, [EAX+EDX*4] ; adjust POINTER horizontally TO correct batch OF 16
  3701. MOVUPS [EAX], XMM2
  3702. MOVUPS [EAX+16], XMM3
  3703. MOVUPS [EAX+32], XMM4
  3704. MOVUPS [EAX+48], XMM5
  3705. MOV EAX, [EBP+matrixA] ; matrixA
  3706. ADD EAX, [EBP+StrideA] ; StrideA
  3707. MOV [EBP+matrixA], EAX ; matrixA
  3708. MOV EAX, [EBP+matrixC] ; matrixC
  3709. ADD EAX, [EBP+StrideC] ; StrideC
  3710. MOV [EBP+matrixC], EAX ; matrixC
  3711. INC ECX
  3712. CMP ECX, [EBP+Ra] ; Ra
  3713. JL DotProductSetup ;
  3714. END SSEMul16BlockR;
  3715. PROCEDURE SSEMul8BlockX( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3716. add: BOOLEAN );
  3717. CODE {SYSTEM.i386, SYSTEM.SSE2}
  3718. MOV ECX, 0 ; counter FOR rows IN A-Ra
  3719. DotProductSetup:
  3720. MOV ESI, [EBP+matrixA] ; matrixA
  3721. MOV EDI, [EBP+matrixB] ; matrixB
  3722. MOV EDX, [EBP+CbFrom] ; CbFrom
  3723. LEA EDI, [EDI+EDX*8]
  3724. XORPD XMM2, XMM2
  3725. XORPD XMM3, XMM3
  3726. XORPD XMM4, XMM4
  3727. XORPD XMM5, XMM5
  3728. MOV EAX, 0 ;
  3729. MOV AL, [EBP+add] ;
  3730. CMP AL, 0 ; add?
  3731. JE DotProductLoop ;
  3732. MOV EAX, [EBP+matrixC] ; matrixC
  3733. LEA EAX, [EAX+EDX*8] ; adjust POINTER horizontally TO correct batch OF 8
  3734. MOVUPD XMM2, [EAX]
  3735. MOVUPD XMM3, [EAX+16]
  3736. MOVUPD XMM4, [EAX+32]
  3737. MOVUPD XMM5, [EAX+48]
  3738. MOV EAX, 0
  3739. DotProductLoop:
  3740. ; MOV EDX, [ESI+EAX*8]
  3741. ; SHL EDX, 1
  3742. ; CMP EDX, 0
  3743. ; JE SparseEntryEscape
  3744. MOVSD XMM0, [ESI+EAX*8]
  3745. SHUFPD XMM0, XMM0, 0H
  3746. MOVUPD XMM1, [EDI]
  3747. MULPD XMM1, XMM0
  3748. ADDPD XMM2, XMM1
  3749. MOVUPD XMM1, [EDI+16]
  3750. MULPD XMM1, XMM0
  3751. ADDPD XMM3, XMM1
  3752. MOVUPD XMM1, [EDI+32]
  3753. MULPD XMM1, XMM0
  3754. ADDPD XMM4, XMM1
  3755. MOVUPD XMM1, [EDI+48]
  3756. MULPD XMM1, XMM0
  3757. ADDPD XMM5, XMM1
  3758. SparseEntryEscape:
  3759. ADD EDI, [EBP+StrideB] ; StrideB
  3760. INC EAX
  3761. CMP EAX, [EBP+Ca] ; Ca
  3762. JL DotProductLoop ; end DotProductLoop
  3763. MOV EAX , [EBP+matrixC] ; matrixC
  3764. MOV EDX, [EBP+CbFrom] ; CbFrom
  3765. LEA EAX, [EAX+EDX*8] ; adjust POINTER horizontally TO correct batch OF 8
  3766. MOVUPD [EAX], XMM2
  3767. MOVUPD [EAX+16], XMM3
  3768. MOVUPD [EAX+32], XMM4
  3769. MOVUPD [EAX+48], XMM5
  3770. MOV EAX, [EBP+matrixA] ; matrixA
  3771. ADD EAX, [EBP+StrideA] ; StrideA
  3772. MOV [EBP+matrixA], EAX ; matrixA
  3773. MOV EAX, [EBP+matrixC] ; matrixC
  3774. ADD EAX, [EBP+StrideC] ; StrideC
  3775. MOV [EBP+matrixC], EAX ; matrixC
  3776. INC ECX
  3777. CMP ECX, [EBP+Ra] ; Ra
  3778. JL DotProductSetup ;
  3779. END SSEMul8BlockX;
  3780. PROCEDURE SSEMul8BlockR( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3781. add: BOOLEAN );
  3782. CODE {SYSTEM.i386, SYSTEM.SSE}
  3783. MOV ECX, 0 ; counter FOR rows IN A-Ra
  3784. DotProductSetup:
  3785. MOV ESI, [EBP+matrixA] ; matrixA
  3786. MOV EDI, [EBP+matrixB] ; matrixB
  3787. MOV EDX, [EBP+CbFrom] ; CbFrom
  3788. LEA EDI, [EDI+EDX*4]
  3789. XORPS XMM2, XMM2
  3790. XORPS XMM3, XMM3
  3791. MOV EAX, 0 ;
  3792. MOV AL, [EBP+add] ;
  3793. CMP AL, 0 ; add?
  3794. JE DotProductLoop ;
  3795. MOV EAX, [EBP+matrixC] ; matrixC
  3796. LEA EAX, [EAX+EDX*4] ; adjust POINTER horizontally TO correct batch OF 8
  3797. MOVUPS XMM2, [EAX]
  3798. MOVUPS XMM3, [EAX+16]
  3799. MOV EAX, 0
  3800. DotProductLoop:
  3801. MOV EDX, [ESI+EAX*4]
  3802. SHL EDX, 1
  3803. CMP EDX, 0
  3804. JE SparseEntryEscape
  3805. MOVSS XMM0, [ESI+EAX*4]
  3806. SHUFPS XMM0, XMM0, 0H
  3807. MOVUPS XMM1, [EDI]
  3808. MULPS XMM1, XMM0
  3809. ADDPS XMM2, XMM1
  3810. MOVUPS XMM1, [EDI+16]
  3811. MULPS XMM1, XMM0
  3812. ADDPS XMM3, XMM1
  3813. SparseEntryEscape:
  3814. ADD EDI, [EBP+StrideB] ; StrideB
  3815. INC EAX
  3816. CMP EAX, [EBP+Ca] ; Ca
  3817. JL DotProductLoop ; end DotProductLoop
  3818. MOV EAX , [EBP+matrixC] ; matrixC
  3819. MOV EDX, [EBP+CbFrom] ; CbFrom
  3820. LEA EAX, [EAX+EDX*4] ; adjust POINTER horizontally TO correct batch OF 8
  3821. MOVUPS [EAX], XMM2
  3822. MOVUPS [EAX+16], XMM3
  3823. MOV EAX, [EBP+matrixA] ; matrixA
  3824. ADD EAX, [EBP+StrideA] ; StrideA
  3825. MOV [EBP+matrixA], EAX ; matrixA
  3826. MOV EAX, [EBP+matrixC] ; matrixC
  3827. ADD EAX, [EBP+StrideC] ; StrideC
  3828. MOV [EBP+matrixC], EAX ; matrixC
  3829. INC ECX
  3830. CMP ECX, [EBP+Ra] ; Ra
  3831. JL DotProductSetup ;
  3832. END SSEMul8BlockR;
  3833. PROCEDURE SSEMul4BlockX( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3834. add: BOOLEAN );
  3835. CODE {SYSTEM.i386, SYSTEM.SSE2}
  3836. MOV ECX, 0 ; counter FOR rows IN A-Ra
  3837. DotProductSetup:
  3838. MOV EAX, 0 ; cols IN A
  3839. MOV ESI, [EBP+matrixA] ; matrixA
  3840. MOV EDI, [EBP+matrixB] ; matrixB
  3841. MOV EDX, [EBP+CbFrom] ; CbFrom
  3842. LEA EDI, [EDI+EDX*8]
  3843. XORPS XMM2, XMM2
  3844. XORPS XMM3, XMM3
  3845. MOV EAX, 0 ;
  3846. MOV AL, [EBP+add] ;
  3847. CMP AL, 0 ; add?
  3848. JE DotProductLoop ;
  3849. MOV EAX, [EBP+matrixC] ; matrixC
  3850. LEA EAX, [EAX+EDX*8] ; adjust POINTER horizontally TO correct batch OF 4
  3851. MOVUPD XMM2, [EAX]
  3852. MOVUPD XMM3, [EAX+16]
  3853. MOV EAX, 0
  3854. DotProductLoop:
  3855. ; MOV EDX, [ESI+EAX*8]
  3856. ; SHL EDX, 1
  3857. ; CMP EDX, 0
  3858. ; JE SparseEntryEscape
  3859. MOVSD XMM0, [ESI+EAX*8]
  3860. SHUFPD XMM0, XMM0, 0H
  3861. MOVUPD XMM1, [EDI]
  3862. MULPD XMM1, XMM0
  3863. ADDPD XMM2, XMM1
  3864. MOVUPD XMM1, [EDI+16]
  3865. MULPD XMM1, XMM0
  3866. ADDPD XMM3, XMM1
  3867. SparseEntryEscape:
  3868. ADD EDI, [EBP+StrideB] ; StrideB
  3869. INC EAX
  3870. CMP EAX, [EBP+Ca] ; Ca
  3871. JL DotProductLoop ; end DotProductLoop
  3872. MOV EAX , [EBP+matrixC] ; matrixC
  3873. MOV EDX, [EBP+CbFrom] ; CbFrom
  3874. LEA EAX, [EAX+EDX*8] ; adjust POINTER horizontally TO correct batch OF 4
  3875. MOVUPD [EAX], XMM2
  3876. MOVUPD [EAX+16], XMM3
  3877. MOV EAX, [EBP+matrixA] ; matrixA
  3878. ADD EAX, [EBP+StrideA] ; StrideA
  3879. MOV [EBP+matrixA], EAX ; matrixA
  3880. MOV EAX, [EBP+matrixC] ; matrixC
  3881. ADD EAX, [EBP+StrideC] ; StrideC
  3882. MOV [EBP+matrixC], EAX ; matrixC
  3883. INC ECX
  3884. CMP ECX, [EBP+Ra] ; Ra
  3885. JL DotProductSetup ;
  3886. END SSEMul4BlockX;
  3887. PROCEDURE SSEMul4BlockR( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3888. add: BOOLEAN );
  3889. CODE {SYSTEM.i386, SYSTEM.SSE}
  3890. MOV ECX, 0 ; counter FOR rows IN A-Ra
  3891. DotProductSetup:
  3892. MOV EAX, 0 ; cols IN A
  3893. MOV ESI, [EBP+matrixA] ; matrixA
  3894. MOV EDI, [EBP+matrixB] ; matrixB
  3895. MOV EDX, [EBP+CbFrom] ; CbFrom
  3896. LEA EDI, [EDI+EDX*4]
  3897. XORPS XMM2, XMM2
  3898. MOV EAX, 0 ;
  3899. MOV AL, [EBP+add] ;
  3900. CMP AL, 0 ; add?
  3901. JE DotProductLoop ;
  3902. MOV EAX, [EBP+matrixC] ; matrixC
  3903. LEA EAX, [EAX+EDX*4] ; adjust POINTER horizontally TO correct batch OF 4
  3904. MOVUPS XMM2, [EAX]
  3905. MOV EAX, 0
  3906. DotProductLoop:
  3907. MOV EDX, [ESI+EAX*4]
  3908. SHL EDX, 1
  3909. CMP EDX, 0
  3910. JE SparseEntryEscape
  3911. MOVSS XMM0, [ESI+EAX*4]
  3912. SHUFPS XMM0, XMM0, 0H
  3913. MOVUPS XMM1, [EDI]
  3914. MULPS XMM1, XMM0
  3915. ADDPS XMM2, XMM1
  3916. SparseEntryEscape:
  3917. ADD EDI, [EBP+StrideB] ; StrideB
  3918. INC EAX
  3919. CMP EAX, [EBP+Ca] ; Ca
  3920. JL DotProductLoop ; end DotProductLoop
  3921. MOV EAX, [EBP+matrixC] ; matrixC
  3922. MOV EDX, [EBP+CbFrom] ; CbFrom
  3923. LEA EAX, [EAX+EDX*4] ; adjust POINTER horizontally TO correct batch OF 4
  3924. MOVUPS [EAX], XMM2
  3925. MOV EAX, [EBP+matrixA] ; matrixA
  3926. ADD EAX, [EBP+StrideA] ; StrideA
  3927. MOV [EBP+matrixA], EAX ; matrixA
  3928. MOV EAX, [EBP+matrixC] ; matrixC
  3929. ADD EAX, [EBP+StrideC] ; StrideC
  3930. MOV [EBP+matrixC], EAX ; matrixC
  3931. INC ECX
  3932. CMP ECX, [EBP+Ra] ; Ra
  3933. JL DotProductSetup ;
  3934. END SSEMul4BlockR;
  3935. PROCEDURE SSEMul2BlockX( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3936. add: BOOLEAN );
  3937. CODE {SYSTEM.i386, SYSTEM.SSE2}
  3938. MOV ECX, 0 ; counter FOR rows IN A-Ra
  3939. DotProductSetup:
  3940. MOV EAX, 0 ; cols IN A
  3941. MOV ESI, [EBP+matrixA] ; matrixA
  3942. MOV EDI, [EBP+matrixB] ; matrixB
  3943. MOV EDX, [EBP+CbFrom] ; CbFrom
  3944. LEA EDI, [EDI+EDX*8]
  3945. XORPD XMM2, XMM2
  3946. MOV EAX, 0 ;
  3947. MOV AL, [EBP+add] ;
  3948. CMP AL, 0 ; add?
  3949. JE DotProductLoop ;
  3950. MOV EAX, [EBP+matrixC] ; matrixC
  3951. LEA EAX, [EAX+EDX*8] ; adjust POINTER horizontally TO correct batch OF 2
  3952. MOVUPD XMM2, [EAX]
  3953. MOV EAX, 0
  3954. DotProductLoop:
  3955. ; MOV EDX, [ESI+EAX*4] ;
  3956. ; SHL EDX, 1 ;
  3957. ; CMP EDX, 0
  3958. ; JE SparseEntryEscape
  3959. MOVSD XMM0, [ESI+EAX*8]
  3960. SHUFPD XMM0, XMM0, 0H
  3961. MOVUPD XMM1, [EDI]
  3962. MULPD XMM1, XMM0
  3963. ADDPD XMM2, XMM1
  3964. SparseEntryEscape:
  3965. ADD EDI, [EBP+StrideB] ; StrideB
  3966. INC EAX
  3967. CMP EAX, [EBP+Ca] ; Ca
  3968. JL DotProductLoop ; end DotProductLoop
  3969. MOV EAX , [EBP+matrixC] ; matrixC
  3970. MOV EDX, [EBP+CbFrom] ; CbFrom
  3971. LEA EAX, [EAX+EDX*8] ; adjust POINTER horizontally TO correct batch OF 2
  3972. MOVUPD [EAX], XMM2
  3973. MOV EAX, [EBP+matrixA] ; matrixA
  3974. ADD EAX, [EBP+StrideA] ; StrideA
  3975. MOV [EBP+matrixA], EAX ; matrixA
  3976. MOV EAX, [EBP+matrixC] ; matrixC
  3977. ADD EAX, [EBP+StrideC] ; StrideC
  3978. MOV [EBP+matrixC], EAX ; matrixC
  3979. INC ECX
  3980. CMP ECX, [EBP+Ra] ; Ra
  3981. JL DotProductSetup ;
  3982. END SSEMul2BlockX;
  3983. (****** blocking matrix multiplication with copy of data ******)
  3984. PROCEDURE MagicBlockR( M, N, K: SIZE;
  3985. VAR L2BlockM, L2BlockN, L2BlockK: SIZE );
  3986. BEGIN
  3987. K := (K DIV L0BlockKR) * L0BlockKR;
  3988. N := (N DIV L1BlockN) * L1BlockN;
  3989. IF M = 0 THEN M := 1 END;
  3990. IF N = 0 THEN N := 1 END;
  3991. IF K = 0 THEN K := 1 END;
  3992. L2BlockK :=
  3993. K DIV ((K + L1MaxBlockKR - 1) DIV L1MaxBlockKR);
  3994. (* Round up to next multiple of 16 *)
  3995. L2BlockK := L2BlockK + (-L2BlockK) MOD 16;
  3996. L2BlockN :=
  3997. L2BlockSize DIV SIZEOF( REAL ) DIV
  3998. (L2BlockK * (L2BARatio + 1));
  3999. IF L2BlockN > N THEN L2BlockN := N
  4000. ELSIF L2BlockN < 1 THEN L2BlockN := 1;
  4001. END;
  4002. L2BlockM :=
  4003. (L2BlockSize DIV SIZEOF( REAL ) - L2BlockN * L2BlockK) DIV
  4004. L2BlockK;
  4005. (* clamp to the range 1..M *)
  4006. IF L2BlockM > M THEN L2BlockM := M
  4007. ELSIF L2BlockM < 1 THEN L2BlockM := 1
  4008. END;
  4009. L2BlockN := L2BlockN + (-L2BlockN) MOD L1BlockN; (* round up to next multiple of L1BlockN *)
  4010. END MagicBlockR;
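  (* Worked example of the rounding idiom used above and in MagicBlockX below: for positive m,
     x + (-x) MOD m rounds x up to the next multiple of m, because Oberon's MOD yields a
     non-negative result for a positive divisor.  For instance, with K = 1000 and an L1 limit of
     256 (the value 256 is only an illustrative assumption for L1MaxBlockKR),
     L2BlockK := 1000 DIV ((1000 + 255) DIV 256) = 1000 DIV 4 = 250, and the rounding step gives
     250 + (-250) MOD 16 = 250 + 6 = 256. *)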
  4011. PROCEDURE MagicBlockX( M, N, K: SIZE;
  4012. VAR L2BlockM, L2BlockN, L2BlockK:SIZE );
  4013. BEGIN
  4014. K := (K DIV L0BlockKX) * L0BlockKX;
  4015. N := (N DIV L1BlockN) * L1BlockN;
  4016. IF M = 0 THEN M := 1 END;
  4017. IF N = 0 THEN N := 1 END;
  4018. IF K = 0 THEN K := 1 END;
  4019. L2BlockK :=
  4020. K DIV ((K + L1MaxBlockKX - 1) DIV L1MaxBlockKX);
  4021. (* Round up to next multiple of 16 *)
  4022. L2BlockK := L2BlockK + (-L2BlockK) MOD 16;
  4023. L2BlockN :=
  4024. L2BlockSize DIV SIZEOF( LONGREAL ) DIV
  4025. (L2BlockK * (L2BARatio + 1));
  4026. IF L2BlockN > N THEN L2BlockN := N END;
  4027. L2BlockM :=
  4028. (L2BlockSize DIV SIZEOF( LONGREAL ) - L2BlockN * L2BlockK) DIV
  4029. L2BlockK;
  4030. (* clamp to the range 1..M *)
  4031. IF L2BlockM > M THEN L2BlockM := M
  4032. ELSIF L2BlockM < 1 THEN L2BlockM := 1
  4033. END;
  4034. L2BlockN := L2BlockN + (-L2BlockN) MOD L1BlockN; (* round up to next multiple of L1BlockN *)
  4035. END MagicBlockX;
  4036. (*
  4037. PROCEDURE L1Block1X( adrA, adrB, adrC, K: LONGINT ); (* condition: K MOD 4 = 0 *)
  4038. VAR reg: ARRAY 8 OF ARRAY 2 OF LONGREAL;
  4039. PROCEDURE null( i: LONGINT );
  4040. BEGIN
  4041. reg[i, 0] := 0; reg[i, 1] := 0;
  4042. END null;
  4043. PROCEDURE get1( adr, i: LONGINT );
  4044. BEGIN
  4045. SYSTEM.GET( adr, reg[i, 0] );
  4046. IF debug THEN KernelLog.String( "get 1: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 ); KernelLog.Ln; END;
  4047. END get1;
  4048. PROCEDURE get2( adr, i: LONGINT );
  4049. BEGIN
  4050. SYSTEM.GET( adr, reg[i, 0] ); SYSTEM.GET( adr + 8, reg[i, 1] );
  4051. IF debug THEN
  4052. KernelLog.String( "get 2: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 );
  4053. KernelLog.Int( ENTIER( reg[i, 1] + 0.5 ), 5 ); KernelLog.Ln;
  4054. END;
  4055. END get2;
  4056. PROCEDURE mul2( i, j: LONGINT );
  4057. BEGIN
  4058. reg[i, 0] := reg[i, 0] * reg[j, 0]; reg[i, 1] := reg[i, 1] * reg[j, 1];
  4059. END mul2;
  4060. PROCEDURE add2( i, j: LONGINT );
  4061. BEGIN
  4062. reg[i, 0] := reg[i, 0] + reg[j, 0]; reg[i, 1] := reg[i, 1] + reg[j, 1];
  4063. END add2;
  4064. PROCEDURE put1( adr, i: LONGINT );
  4065. BEGIN
  4066. SYSTEM.PUT( adr, reg[i, 0] );
  4067. END put1;
  4068. PROCEDURE horadd( i: LONGINT );
  4069. BEGIN
  4070. reg[i, 0] := reg[i, 0] + reg[i, 1];
  4071. END horadd;
  4072. BEGIN
  4073. IF debug THEN KernelLog.String( "L1Block1" ); KernelLog.Ln; END;
  4074. null( 2 ); get1( adrC, 2 );
  4075. WHILE (K > 0) DO (* padding guaranteed *)
  4076. get2( adrA, 7 ); get2( adrB, 0 ); mul2( 0, 7 ); add2( 2, 0 ); INC( adrB, 16 );
  4077. INC( adrA, 16 ); DEC( K, 2 );
  4078. END;
  4079. horadd( 2 ); put1( adrC, 2 );
  4080. END L1Block1X;
  4081. PROCEDURE L1Block5X( adrA, adrB, adrC, IncC, K: LONGINT ); (* condition: K MOD 4 = 0 *)
  4082. VAR reg: ARRAY 8 OF ARRAY 2 OF LONGREAL;
  4083. PROCEDURE null( i: LONGINT );
  4084. BEGIN
  4085. reg[i, 0] := 0; reg[i, 1] := 0;
  4086. END null;
  4087. PROCEDURE get1( adr, i: LONGINT );
  4088. BEGIN
  4089. SYSTEM.GET( adr, reg[i, 0] );
  4090. IF debug THEN KernelLog.String( "get 1: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 ); KernelLog.Ln; END;
  4091. END get1;
  4092. PROCEDURE get2( adr, i: LONGINT );
  4093. BEGIN
  4094. SYSTEM.GET( adr, reg[i, 0] ); SYSTEM.GET( adr + 8, reg[i, 1] );
  4095. IF debug THEN
  4096. KernelLog.String( "get 2: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 );
  4097. KernelLog.Int( ENTIER( reg[i, 1] + 0.5 ), 5 ); KernelLog.Ln;
  4098. END;
  4099. END get2;
  4100. PROCEDURE mul2( i, j: LONGINT );
  4101. BEGIN
  4102. reg[i, 0] := reg[i, 0] * reg[j, 0]; reg[i, 1] := reg[i, 1] * reg[j, 1];
  4103. END mul2;
  4104. PROCEDURE add2( i, j: LONGINT );
  4105. BEGIN
  4106. reg[i, 0] := reg[i, 0] + reg[j, 0]; reg[i, 1] := reg[i, 1] + reg[j, 1];
  4107. END add2;
  4108. PROCEDURE put1( adr, i: LONGINT );
  4109. BEGIN
  4110. SYSTEM.PUT( adr, reg[i, 0] );
  4111. END put1;
  4112. PROCEDURE horadd( i: LONGINT );
  4113. BEGIN
  4114. reg[i, 0] := reg[i, 0] + reg[i, 1];
  4115. END horadd;
  4116. BEGIN
  4117. IF debug THEN KernelLog.String( "L1Block5" ); KernelLog.Ln; END;
  4118. null( 2 ); null( 3 ); null( 4 ); null( 5 ); null( 6 ); get1( adrC, 2 );
  4119. get1( adrC + IncC, 3 ); get1( adrC + 2 * IncC, 4 ); get1( adrC + 3 * IncC, 5 );
  4120. get1( adrC + 4 * IncC, 6 );
  4121. WHILE (K > 0) DO (* padding guaranteed *)
  4122. get2( adrA, 7 ); get2( adrB, 0 ); mul2( 0, 7 ); add2( 2, 0 );
  4123. get2( adrB + 16, 0 ); mul2( 0, 7 ); add2( 3, 0 ); get2( adrB + 32, 0 );
  4124. mul2( 0, 7 ); add2( 4, 0 ); get2( adrB + 48, 0 ); mul2( 0, 7 );
  4125. add2( 5, 0 ); get2( adrB + 64, 0 ); mul2( 0, 7 ); add2( 6, 0 ); INC( adrB, 80 );
  4126. INC( adrA, 16 ); DEC( K, 2 );
  4127. END;
  4128. horadd( 2 ); horadd( 3 ); horadd( 4 ); horadd( 5 ); horadd( 6 );
  4129. put1( adrC, 2 ); put1( adrC + IncC, 3 ); put1( adrC + 2 * IncC, 4 );
  4130. put1( adrC + 3 * IncC, 5 ); put1( adrC + 4 * IncC, 6 );
  4131. END L1Block5X;
  4132. PROCEDURE L1Block1R( adrA, adrB, adrC, K: LONGINT ); (* condition: K MOD 4 = 0 *)
  4133. VAR reg: ARRAY 8 OF ARRAY 4 OF REAL;
  4134. PROCEDURE null( i: LONGINT );
  4135. BEGIN
  4136. reg[i, 0] := 0; reg[i, 1] := 0; reg[i, 2] := 0; reg[i, 3] := 0;
  4137. END null;
  4138. PROCEDURE get1( adr, i: LONGINT );
  4139. BEGIN
  4140. SYSTEM.GET( adr, reg[i, 0] );
  4141. IF debug THEN KernelLog.String( "get 1: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 ); KernelLog.Ln; END;
  4142. END get1;
  4143. PROCEDURE get4( adr, i: LONGINT );
  4144. BEGIN
  4145. SYSTEM.GET( adr, reg[i, 0] ); SYSTEM.GET( adr + 4, reg[i, 1] );
  4146. SYSTEM.GET( adr + 8, reg[i, 2] ); SYSTEM.GET( adr + 12, reg[i, 3] );
  4147. IF debug THEN
  4148. KernelLog.String( "get 4: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 );
  4149. KernelLog.Int( ENTIER( reg[i, 1] + 0.5 ), 5 ); KernelLog.Int( ENTIER( reg[i, 2] + 0.5 ), 5 );
  4150. KernelLog.Int( ENTIER( reg[i, 3] + 0.5 ), 5 ); KernelLog.Ln;
  4151. END;
  4152. END get4;
  4153. PROCEDURE mul4( i, j: LONGINT );
  4154. BEGIN
  4155. reg[i, 0] := reg[i, 0] * reg[j, 0]; reg[i, 1] := reg[i, 1] * reg[j, 1];
  4156. reg[i, 2] := reg[i, 2] * reg[j, 2]; reg[i, 3] := reg[i, 3] * reg[j, 3];
  4157. END mul4;
  4158. PROCEDURE add4( i, j: LONGINT );
  4159. BEGIN
  4160. reg[i, 0] := reg[i, 0] + reg[j, 0]; reg[i, 1] := reg[i, 1] + reg[j, 1];
  4161. reg[i, 2] := reg[i, 2] + reg[j, 2]; reg[i, 3] := reg[i, 3] + reg[j, 3];
  4162. END add4;
  4163. PROCEDURE put1( adr, i: LONGINT );
  4164. BEGIN
  4165. SYSTEM.PUT( adr, reg[i, 0] );
  4166. END put1;
  4167. PROCEDURE horadd( i: LONGINT );
  4168. BEGIN
  4169. reg[i, 0] := reg[i, 0] + reg[i, 1] + reg[i, 2] + reg[i, 3];
  4170. END horadd;
  4171. BEGIN
  4172. IF debug THEN KernelLog.String( "L1Block1" ); KernelLog.Ln; END;
  4173. null( 2 ); get1( adrC, 2 );
  4174. WHILE (K > 0) DO (* padding guaranteed *)
  4175. get4( adrA, 7 ); get4( adrB, 0 ); mul4( 0, 7 ); add4( 2, 0 ); INC( adrB, 16 );
  4176. INC( adrA, 16 ); DEC( K, 4 );
  4177. END;
  4178. horadd( 2 ); put1( adrC, 2 );
  4179. END L1Block1R;
  4180. PROCEDURE L1Block5R( adrA, adrB, adrC, IncC, K: LONGINT ); (* condition: K MOD 4 = 0 *)
  4181. VAR reg: ARRAY 8 OF ARRAY 4 OF REAL;
  4182. PROCEDURE null( i: LONGINT );
  4183. BEGIN
  4184. reg[i, 0] := 0; reg[i, 1] := 0; reg[i, 2] := 0; reg[i, 3] := 0;
  4185. END null;
  4186. PROCEDURE get1( adr, i: LONGINT );
  4187. BEGIN
  4188. SYSTEM.GET( adr, reg[i, 0] );
  4189. IF debug THEN KernelLog.String( "get 1: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 ); KernelLog.Ln; END;
  4190. END get1;
  4191. PROCEDURE get4( adr, i: LONGINT );
  4192. BEGIN
  4193. SYSTEM.GET( adr, reg[i, 0] ); SYSTEM.GET( adr + 4, reg[i, 1] );
  4194. SYSTEM.GET( adr + 8, reg[i, 2] ); SYSTEM.GET( adr + 12, reg[i, 3] );
  4195. IF debug THEN
  4196. KernelLog.String( "get 4: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 );
  4197. KernelLog.Int( ENTIER( reg[i, 1] + 0.5 ), 5 ); KernelLog.Int( ENTIER( reg[i, 2] + 0.5 ), 5 );
  4198. KernelLog.Int( ENTIER( reg[i, 3] + 0.5 ), 5 ); KernelLog.Ln;
  4199. END;
  4200. END get4;
  4201. PROCEDURE mul4( i, j: LONGINT );
  4202. BEGIN
  4203. reg[i, 0] := reg[i, 0] * reg[j, 0]; reg[i, 1] := reg[i, 1] * reg[j, 1];
  4204. reg[i, 2] := reg[i, 2] * reg[j, 2]; reg[i, 3] := reg[i, 3] * reg[j, 3];
  4205. END mul4;
  4206. PROCEDURE add4( i, j: LONGINT );
  4207. BEGIN
  4208. reg[i, 0] := reg[i, 0] + reg[j, 0]; reg[i, 1] := reg[i, 1] + reg[j, 1];
  4209. reg[i, 2] := reg[i, 2] + reg[j, 2]; reg[i, 3] := reg[i, 3] + reg[j, 3];
  4210. END add4;
  4211. PROCEDURE put1( adr, i: LONGINT );
  4212. BEGIN
  4213. SYSTEM.PUT( adr, reg[i, 0] );
  4214. END put1;
  4215. PROCEDURE horadd( i: LONGINT );
  4216. BEGIN
  4217. reg[i, 0] := reg[i, 0] + reg[i, 1] + reg[i, 2] + reg[i, 3];
  4218. END horadd;
  4219. BEGIN
  4220. IF debug THEN KernelLog.String( "L1Block5" ); KernelLog.Ln; END;
  4221. null( 2 ); null( 3 ); null( 4 ); null( 5 ); null( 6 ); get1( adrC, 2 );
  4222. get1( adrC + IncC, 3 ); get1( adrC + 2 * IncC, 4 ); get1( adrC + 3 * IncC, 5 );
  4223. get1( adrC + 4 * IncC, 6 );
  4224. WHILE (K > 0) DO (* padding guaranteed *)
  4225. get4( adrA, 7 ); get4( adrB, 0 ); mul4( 0, 7 ); add4( 2, 0 );
  4226. get4( adrB + 16, 0 ); mul4( 0, 7 ); add4( 3, 0 ); get4( adrB + 32, 0 );
  4227. mul4( 0, 7 ); add4( 4, 0 ); get4( adrB + 48, 0 ); mul4( 0, 7 );
  4228. add4( 5, 0 ); get4( adrB + 64, 0 ); mul4( 0, 7 ); add4( 6, 0 ); INC( adrB, 80 );
  4229. INC( adrA, 16 ); DEC( K, 4 );
  4230. END;
  4231. horadd( 2 ); horadd( 3 ); horadd( 4 ); horadd( 5 ); horadd( 6 );
  4232. put1( adrC, 2 ); put1( adrC + IncC, 3 ); put1( adrC + 2 * IncC, 4 );
  4233. put1( adrC + 3 * IncC, 5 ); put1( adrC + 4 * IncC, 6 );
  4234. END L1Block5R;
  4235. *)
  4236. PROCEDURE DispCR( adrM: ADDRESS;
  4237. inc, stride, M, N: SIZE );
  4238. VAR i, j: SIZE; adr: ADDRESS; val: REAL;
  4239. BEGIN
  4240. FOR i := 0 TO M - 1 DO
  4241. adr := adrM + i * stride;
  4242. FOR j := 0 TO N - 1 DO
  4243. SYSTEM.GET( adr, val );
  4244. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); INC( adr, inc );
  4245. END;
  4246. KernelLog.Ln;
  4247. END;
  4248. END DispCR;
  4249. PROCEDURE DispCX( adrM: ADDRESS;
  4250. inc, stride, M, N: SIZE );
  4251. VAR i, j: SIZE; adr: ADDRESS; val: LONGREAL;
  4252. BEGIN
  4253. FOR i := 0 TO M - 1 DO
  4254. adr := adrM + i * stride;
  4255. FOR j := 0 TO N - 1 DO
  4256. SYSTEM.GET( adr, val );
  4257. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); INC( adr, inc );
  4258. END;
  4259. KernelLog.Ln;
  4260. END;
  4261. END DispCX;
  4262. PROCEDURE L3BlockX( matrixA, matrixB, matrixC: ADDRESS;
  4263. M, N, K, incC, strideC, L2BlockM, L2BlockN, L2BlockK: SIZE );
  4264. (*
  4265. Dimensions: A is M x K, B is K x N, C is M x N.
  4266. The product A * B is accumulated into C, which is addressed with
  4267. element increment incC and row stride strideC.
  4268. L2BlockM, L2BlockN and L2BlockK tile the computation into blocks
  4269. that fit into the L2 cache; L2Block below handles one such block,
  4270. delegating the innermost work to the L1Block kernels.
  4271. *)
  4272. VAR m, n, k, a1, b1: SIZE; adrA, adrB, adrC: ADDRESS;
  4273. KAligned: SIZE;
  4274. CONST Size = SIZEOF( LONGREAL );
  4275. PROCEDURE L2Block( matrixA, matrixB, matrixC: ADDRESS; M, N, K: SIZE );
  4276. (* M,N and K arbitrary ! *)
  4277. VAR adrA, adrB, adrC: ADDRESS; aadrA, aadrB: ADDRESS;
  4278. m, k, KAligned: SIZE;
  4279. BEGIN
  4280. KAligned := Align2( K ) * 8;
  4281. IF debug THEN ASSERT( M > 0 ); ASSERT( K > 0 );
  4282. END;
  4283. adrB := matrixB;
  4284. WHILE (N >= L1BlockN) DO
  4285. IF debug THEN KernelLog.String( "LoopL2N" ); KernelLog.Ln END;
  4286. adrC := matrixC; adrA := matrixA; m := M;
  4287. WHILE (m > 0) DO
  4288. IF debug THEN KernelLog.String( "LoopL2M" ); KernelLog.Ln END;
  4289. IF SSE THEN
  4290. L1Block5XSSE( adrA, adrB, adrC, incC, K ); (* L1 Block *)
  4291. ELSE
  4292. aadrA := adrA; aadrB := adrB; k := K;
  4293. WHILE (k > 0) DO
  4294. L1Block1XA( aadrA, aadrB, adrC, 2 );
  4295. L1Block1XA( aadrA, aadrB + 16, adrC + incC, 2 );
  4296. L1Block1XA( aadrA, aadrB + 32, adrC + 2 * incC,
  4297. 2 );
  4298. L1Block1XA( aadrA, aadrB + 48, adrC + 3 * incC,
  4299. 2 );
  4300. L1Block1XA( aadrA, aadrB + 64, adrC + 4 * incC,
  4301. 2 );
  4302. DEC( k, 2 ); INC( aadrA, 16 );
  4303. INC( aadrB, 16 * L1BlockN );
  4304. END;
  4305. END;
  4306. IF debug THEN
  4307. DispCX( matrixC, incC, strideC, M, N );
  4308. END;
  4309. INC( adrA, KAligned ); INC( adrC, strideC );
  4310. DEC( m );
  4311. END;
  4312. INC( matrixC, L1BlockN * incC );
  4313. INC( adrB, L1BlockN * KAligned ); DEC( N, L1BlockN );
  4314. END;
  4315. WHILE (N > 0) DO
  4316. IF debug THEN KernelLog.String( "LoopL2N rest" ); KernelLog.Ln END;
  4317. adrC := matrixC; adrA := matrixA; m := M;
  4318. WHILE (m > 0) DO
  4319. IF debug THEN KernelLog.String( "LoopL2M" ); KernelLog.Ln END;
  4320. IF SSE THEN
  4321. L1Block1XSSE( adrA, adrB, adrC, K ); (* L1 Block *)
  4322. ELSE L1Block1XA( adrA, adrB, adrC, K );
  4323. END;
  4324. IF debug THEN
  4325. DispCX( matrixC, incC, strideC, M, N );
  4326. END;
  4327. INC( adrA, KAligned ); INC( adrC, strideC );
  4328. DEC( m );
  4329. END;
  4330. INC( matrixC, incC ); INC( adrB, KAligned ); DEC( N );
  4331. END;
  4332. END L2Block;
  4333. BEGIN
  4334. KAligned := Align2( K ) * 8;
  4335. ASSERT( L2BlockK MOD 2 = 0 );
  4336. IF SSE THEN ASSERT( L2BlockN MOD L1BlockN = 0 ); END;
  4337. m := M; n := N; k := K; a1 := matrixA; adrA := matrixA;
  4338. b1 := matrixB; adrB := matrixB; adrC := matrixC;
  4339. WHILE (n >= L2BlockN) DO
  4340. IF debug THEN KernelLog.String( "LoopL3N" ); KernelLog.Ln END;
  4341. a1 := matrixA; adrC := matrixC; m := M;
  4342. WHILE (m >= L2BlockM) DO
  4343. IF debug THEN KernelLog.String( "LoopL3M" ); KernelLog.Ln END;
  4344. adrA := a1; adrB := b1; k := K;
  4345. (* core: do matching level 2 cache Blocks *)
  4346. WHILE (k >= L2BlockK) DO
  4347. IF debug THEN KernelLog.String( "LoopL3K" ); KernelLog.Ln END;
  4348. L2Block( adrA, adrB, adrC, L2BlockM, L2BlockN,
  4349. L2BlockK );
  4350. INC( adrA, L2BlockK * L2BlockM * Size );
  4351. INC( adrB, L2BlockK * L2BlockN * Size ); (* no padding required *)
  4352. DEC( k, L2BlockK );
  4353. END;
  4354. (* core: do rest of k *)
  4355. IF k > 0 THEN
  4356. L2Block( adrA, adrB, adrC, L2BlockM, L2BlockN, k );
  4357. END;
  4358. INC( a1, KAligned * L2BlockM );
  4359. INC( adrC, L2BlockM * strideC ); DEC( m, L2BlockM );
  4360. END;
  4361. IF m > 0 THEN
  4362. (* clean up M *)
  4363. adrA := a1; adrB := b1; k := K;
  4364. WHILE (k >= L2BlockK) DO
  4365. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4366. L2Block( adrA, adrB, adrC, m, L2BlockN, L2BlockK );
  4367. INC( adrA, L2BlockK * Size * m );
  4368. INC( adrB, L2BlockK * L2BlockN * Size );
  4369. DEC( k, L2BlockK );
  4370. END;
  4371. IF debug THEN KernelLog.String( "LoopL3K rest k" ); KernelLog.Ln END;
  4372. IF k > 0 THEN
  4373. L2Block( adrA, adrB, adrC, m, L2BlockN, k );
  4374. END;
  4375. END;
  4376. INC( b1, L2BlockN * KAligned );
  4377. INC( matrixC, L2BlockN * incC ); DEC( n, L2BlockN );
  4378. END;
  4379. IF (n = 0) THEN RETURN
  4380. END;
  4381. a1 := matrixA; adrC := matrixC; m := M;
  4382. WHILE (m >= L2BlockM) DO
  4383. IF debug THEN KernelLog.String( "LoopL3M rest" ); KernelLog.Ln END;
  4384. adrA := a1; adrB := b1; k := K;
  4385. WHILE (k >= L2BlockK) DO
  4386. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4387. L2Block( adrA, adrB, adrC, L2BlockM, n, L2BlockK );
  4388. INC( adrA, L2BlockM * L2BlockK * Size );
  4389. INC( adrB, L2BlockK * n * Size ); DEC( k, L2BlockK );
  4390. END;
  4391. IF k > 0 THEN
  4392. L2Block( adrA, adrB, adrC, L2BlockM, n, k );
  4393. END;
  4394. INC( a1, L2BlockM * KAligned );
  4395. INC( adrC, L2BlockM * strideC ); DEC( m, L2BlockM );
  4396. END;
  4397. IF (m = 0) THEN RETURN
  4398. END;
  4399. adrA := a1; adrB := b1; k := K;
  4400. WHILE (k >= L2BlockK) DO
  4401. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4402. L2Block( adrA, adrB, adrC, m, n, L2BlockK );
  4403. INC( adrA, L2BlockK * m * Size );
  4404. INC( adrB, L2BlockK * n * Size ); DEC( k, L2BlockK );
  4405. END;
  4406. IF k > 0 THEN L2Block( adrA, adrB, adrC, m, n, k );
  4407. END;
  4408. END L3BlockX;
  4409. PROCEDURE L3BlockR( matrixA, matrixB, matrixC: ADDRESS;
  4410. M, N, K, incC, strideC, L2BlockM, L2BlockN, L2BlockK: SIZE );
  4411. (*
  4412. Dimensions: A is M x K, B is K x N, C is M x N.
  4413. The product A * B is accumulated into C, which is addressed with
  4414. element increment incC and row stride strideC.
  4415. L2BlockM, L2BlockN and L2BlockK tile the computation into blocks
  4416. that fit into the L2 cache; L2Block below handles one such block,
  4417. delegating the innermost work to the L1Block kernels.
  4418. *)
  4419. VAR m, n, k, a1, b1: SIZE; adrA, adrB, adrC: ADDRESS;
  4420. KAligned: SIZE;
  4421. CONST Size = SIZEOF( REAL );
  4422. PROCEDURE L2Block( matrixA, matrixB, matrixC: ADDRESS; M, N, K: SIZE );
  4423. (* M,N and K arbitrary ! *)
  4424. VAR adrA, adrB, adrC: ADDRESS; aadrA, aadrB: ADDRESS;
  4425. m, KAligned, k: SIZE;
  4426. BEGIN
  4427. KAligned := Align4( K ) * 4;
  4428. IF debug THEN ASSERT( M > 0 ); ASSERT( K > 0 );
  4429. END;
  4430. adrB := matrixB;
  4431. WHILE (N >= L1BlockN) DO
  4432. IF debug THEN KernelLog.String( "LoopL2N" ); KernelLog.Ln END;
  4433. adrC := matrixC; adrA := matrixA; m := M;
  4434. WHILE (m > 0) DO
  4435. IF debug THEN KernelLog.String( "LoopL2M" ); KernelLog.Ln END;
  4436. IF SSE THEN
  4437. L1Block5RSSE( adrA, adrB, adrC, incC, K ); (* L1 Block *)
  4438. ELSE
  4439. aadrA := adrA; aadrB := adrB; k := K;
  4440. WHILE (k > 0) DO
  4441. L1Block1RA( aadrA, aadrB, adrC, 4 );
  4442. L1Block1RA( aadrA, aadrB + 16, adrC + incC, 4 );
  4443. L1Block1RA( aadrA, aadrB + 32, adrC + 2 * incC,
  4444. 4 );
  4445. L1Block1RA( aadrA, aadrB + 48, adrC + 3 * incC,
  4446. 4 );
  4447. L1Block1RA( aadrA, aadrB + 64, adrC + 4 * incC,
  4448. 4 );
  4449. DEC( k, 4 ); INC( aadrA, 16 );
  4450. INC( aadrB, 16 * L1BlockN );
  4451. END;
  4452. END;
  4453. IF debug THEN
  4454. DispCR( matrixC, incC, strideC, M, N );
  4455. END;
  4456. INC( adrA, KAligned ); INC( adrC, strideC );
  4457. DEC( m );
  4458. END;
  4459. INC( matrixC, L1BlockN * incC );
  4460. INC( adrB, L1BlockN * KAligned ); DEC( N, L1BlockN );
  4461. END;
  4462. WHILE (N > 0) DO
  4463. IF debug THEN KernelLog.String( "LoopL2N rest" ); KernelLog.Ln END;
  4464. adrC := matrixC; adrA := matrixA; m := M;
  4465. WHILE (m > 0) DO
  4466. IF debug THEN KernelLog.String( "LoopL2M" ); KernelLog.Ln END;
  4467. IF SSE THEN
  4468. L1Block1RSSE( adrA, adrB, adrC, K ); (* L1 Block *)
  4469. ELSE L1Block1RA( adrA, adrB, adrC, K );
  4470. END;
  4471. IF debug THEN
  4472. DispCR( matrixC, incC, strideC, M, N );
  4473. END;
  4474. INC( adrA, KAligned ); INC( adrC, strideC );
  4475. DEC( m );
  4476. END;
  4477. INC( matrixC, incC ); INC( adrB, KAligned ); DEC( N );
  4478. END;
  4479. END L2Block;
  4480. BEGIN
  4481. KAligned := Align4( K ) * 4;
  4482. ASSERT( L2BlockK MOD 4 = 0 );
  4483. IF SSE THEN ASSERT( L2BlockN MOD L1BlockN = 0 ); END;
  4484. m := M; n := N; k := K; a1 := matrixA; adrA := matrixA;
  4485. b1 := matrixB; adrB := matrixB; adrC := matrixC;
  4486. WHILE (n >= L2BlockN) DO
  4487. IF debug THEN KernelLog.String( "LoopL3N" ); KernelLog.Ln END;
  4488. a1 := matrixA; adrC := matrixC; m := M;
  4489. WHILE (m >= L2BlockM) DO
  4490. IF debug THEN KernelLog.String( "LoopL3M" ); KernelLog.Ln END;
  4491. adrA := a1; adrB := b1; k := K;
  4492. (* core: do matching level 2 cache Blocks *)
  4493. WHILE (k >= L2BlockK) DO
  4494. IF debug THEN KernelLog.String( "LoopL3K" ); KernelLog.Ln END;
  4495. L2Block( adrA, adrB, adrC, L2BlockM, L2BlockN,
  4496. L2BlockK );
  4497. INC( adrA, L2BlockK * L2BlockM * Size );
  4498. INC( adrB, L2BlockK * L2BlockN * Size ); (* no padding required *)
  4499. DEC( k, L2BlockK );
  4500. END;
  4501. (* core: do rest of k *)
  4502. IF k > 0 THEN
  4503. L2Block( adrA, adrB, adrC, L2BlockM, L2BlockN, k );
  4504. END;
  4505. INC( a1, KAligned * L2BlockM );
  4506. INC( adrC, L2BlockM * strideC ); DEC( m, L2BlockM );
  4507. END;
  4508. IF m > 0 THEN
  4509. (* clean up M *)
  4510. adrA := a1; adrB := b1; k := K;
  4511. WHILE (k >= L2BlockK) DO
  4512. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4513. L2Block( adrA, adrB, adrC, m, L2BlockN, L2BlockK );
  4514. INC( adrA, L2BlockK * Size * m );
  4515. INC( adrB, L2BlockK * L2BlockN * Size );
  4516. DEC( k, L2BlockK );
  4517. END;
  4518. IF debug THEN KernelLog.String( "LoopL3K rest k" ); KernelLog.Ln END;
  4519. IF k > 0 THEN
  4520. L2Block( adrA, adrB, adrC, m, L2BlockN, k );
  4521. END;
  4522. END;
  4523. INC( b1, L2BlockN * KAligned );
  4524. INC( matrixC, L2BlockN * incC ); DEC( n, L2BlockN );
  4525. END;
  4526. IF (n = 0) THEN RETURN
  4527. END;
  4528. a1 := matrixA; adrC := matrixC; m := M;
  4529. WHILE (m >= L2BlockM) DO
  4530. IF debug THEN KernelLog.String( "LoopL3M rest" ); KernelLog.Ln END;
  4531. adrA := a1; adrB := b1; k := K;
  4532. WHILE (k >= L2BlockK) DO
  4533. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4534. L2Block( adrA, adrB, adrC, L2BlockM, n, L2BlockK );
  4535. INC( adrA, L2BlockM * L2BlockK * Size );
  4536. INC( adrB, L2BlockK * n * Size ); DEC( k, L2BlockK );
  4537. END;
  4538. IF k > 0 THEN
  4539. L2Block( adrA, adrB, adrC, L2BlockM, n, k );
  4540. END;
  4541. INC( a1, L2BlockM * KAligned );
  4542. INC( adrC, L2BlockM * strideC ); DEC( m, L2BlockM );
  4543. END;
  4544. IF (m = 0) THEN RETURN
  4545. END;
  4546. adrA := a1; adrB := b1; k := K;
  4547. WHILE (k >= L2BlockK) DO
  4548. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4549. L2Block( adrA, adrB, adrC, m, n, L2BlockK );
  4550. INC( adrA, L2BlockK * m * Size );
  4551. INC( adrB, L2BlockK * n * Size ); DEC( k, L2BlockK );
  4552. END;
  4553. IF k > 0 THEN L2Block( adrA, adrB, adrC, m, n, k );
  4554. END;
  4555. END L3BlockR;
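(* Overview of the blocked kernels above: L3BlockX and L3BlockR walk the packed
   operands in three levels.  The outer WHILE loops cut C into L2BlockM x L2BlockN
   tiles and the K dimension into L2BlockK slices sized for the L2 cache; the
   nested L2Block then feeds L1BlockN (5) packed columns of B and one packed row
   of A per step to the L1 kernels (the SSE variants L1Block5RSSE / L1Block1RSSE
   in the REAL case, or the scalar L1Block1RA fallback).  Both procedures assume
   that A and B were packed and zero-padded beforehand by CopyAX/CopyBX resp.
   CopyAR/CopyBR, as done in MultiplyX and MultiplyR below. *)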
  4556. PROCEDURE Align( adr: ADDRESS; align: SIZE ): ADDRESS;
  4557. BEGIN
4558. RETURN adr + (-adr) MOD align; (* round adr up to the next multiple of align, e.g. align = 16 bytes = 128 bits *)
  4559. END Align;
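(* Illustrative arithmetic for Align, e.g. with align = 16:
   Align( 13, 16 ) = 13 + (-13) MOD 16 = 13 + 3 = 16;
   Align( 32, 16 ) = 32 + (-32) MOD 16 = 32 + 0 = 32.
   MOD yields a non-negative result for a positive divisor, so (-adr) MOD align
   is exactly the number of bytes missing up to the next multiple of align. *)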
  4560. PROCEDURE CopyAX( matrixA, dest: ADDRESS;
  4561. IncA, StrideA: SIZE;
  4562. K, M, L2BlockK, L2BlockM: SIZE );
  4563. VAR m, k: SIZE; adrA: ADDRESS; t: HUGEINT;
  4564. PROCEDURE CopyMK( matrixA: ADDRESS; M, K: SIZE );
  4565. VAR rest: SIZE;
  4566. BEGIN
  4567. IF debug THEN
  4568. KernelLog.String( "CopyMK:" ); KernelLog.Int( M, 10 ); KernelLog.Int( K, 10 );
  4569. KernelLog.Ln;
  4570. END;
  4571. rest := (-K) MOD 2;
  4572. WHILE (M > 0) DO
  4573. MovX( matrixA, dest, IncA, K ); INC( dest, K * 8 );
  4574. IF rest # 0 THEN
  4575. ZeroX( dest, rest ); INC( dest, 8 * rest );
  4576. END;
  4577. INC( matrixA, StrideA ); DEC( M );
  4578. END;
  4579. END CopyMK;
  4580. BEGIN
  4581. Tic( t ); m := M;
  4582. WHILE (m >= L2BlockM) DO
  4583. k := K; adrA := matrixA;
  4584. WHILE (k >= L2BlockK) DO
  4585. CopyMK( adrA, L2BlockM, L2BlockK );
  4586. INC( adrA, L2BlockK * IncA ); DEC( k, L2BlockK );
  4587. END;
  4588. IF k > 0 THEN CopyMK( adrA, L2BlockM, k ); END;
  4589. INC( matrixA, L2BlockM * StrideA ); DEC( m, L2BlockM );
  4590. END;
  4591. adrA := matrixA; k := K;
  4592. WHILE (k >= L2BlockK) DO
  4593. CopyMK( adrA, m, L2BlockK );
  4594. INC( adrA, L2BlockK * IncA ); DEC( k, L2BlockK );
  4595. END;
  4596. IF k > 0 THEN CopyMK( adrA, m, k ); END;
  4597. Toc( t, copyT );
  4598. END CopyAX;
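(* CopyAX packs A (LONGREAL) row by row into the destination buffer, visiting it
   in L2BlockM x L2BlockK tiles so the copy order matches the consumption order
   of L3BlockX.  Each row is zero-padded to an even element count
   (rest := (-K) MOD 2) so the 16-byte SSE loads never read past K.
   CopyAR below is the REAL counterpart and pads rows to a multiple of 4. *)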
  4599. PROCEDURE CopyAR( matrixA, dest: ADDRESS;
  4600. IncA, StrideA: SIZE;
  4601. K, M, L2BlockK, L2BlockM: SIZE );
  4602. VAR m, k: SIZE; adrA: ADDRESS; t: HUGEINT;
  4603. PROCEDURE CopyMK( matrixA: ADDRESS; M, K: SIZE );
  4604. VAR rest: SIZE;
  4605. BEGIN
  4606. rest := (-K) MOD 4;
  4607. WHILE (M > 0) DO
  4608. MovR( matrixA, dest, IncA, K ); INC( dest, K * 4 );
  4609. IF rest # 0 THEN
  4610. ZeroR( dest, rest ); INC( dest, 4 * rest );
  4611. END;
  4612. INC( matrixA, StrideA ); DEC( M );
  4613. END;
  4614. END CopyMK;
  4615. BEGIN
  4616. Tic( t ); m := M;
  4617. WHILE (m >= L2BlockM) DO
  4618. k := K; adrA := matrixA;
  4619. WHILE (k >= L2BlockK) DO
  4620. CopyMK( adrA, L2BlockM, L2BlockK );
  4621. INC( adrA, L2BlockK * IncA ); DEC( k, L2BlockK );
  4622. END;
  4623. IF k > 0 THEN CopyMK( adrA, L2BlockM, k ); END;
  4624. INC( matrixA, L2BlockM * StrideA ); DEC( m, L2BlockM );
  4625. END;
  4626. adrA := matrixA; k := K;
  4627. WHILE (k >= L2BlockK) DO
  4628. CopyMK( adrA, m, L2BlockK );
  4629. INC( adrA, L2BlockK * IncA ); DEC( k, L2BlockK );
  4630. END;
  4631. IF k > 0 THEN CopyMK( adrA, m, k ); END;
  4632. Toc( t, copyT );
  4633. END CopyAR;
  4634. PROCEDURE CopyBX( matrixB, dest: ADDRESS;
  4635. IncB, StrideB: SIZE;
  4636. N, K, L2BlockN, L2BlockK: SIZE );
  4637. VAR n, k: SIZE; adrB: ADDRESS; t: HUGEINT;
  4638. PROCEDURE Copy5x2k( matrixB: ADDRESS; k: SIZE );
  4639. VAR i: SIZE; adrB: ADDRESS; rest: SIZE;
  4640. BEGIN
  4641. rest := (-k) MOD 2;
4642. WHILE (k >= 2) DO (* store 5x2 block as one contiguous line *)
  4643. adrB := matrixB;
  4644. FOR i := 1 TO L1BlockN DO
  4645. MovX( adrB, dest, StrideB, 2 ); INC( dest, 16 );
  4646. INC( adrB, IncB );
  4647. END;
  4648. INC( matrixB, 2 * StrideB ); DEC( k, 2 );
  4649. END;
  4650. IF k > 0 THEN
  4651. adrB := matrixB;
  4652. FOR i := 1 TO L1BlockN DO
  4653. MovX( adrB, dest, StrideB, k ); INC( dest, 8 * k );
  4654. IF rest # 0 THEN
  4655. ZeroX( dest, rest ); INC( dest, rest * 8 );
  4656. END;
  4657. INC( adrB, IncB );
  4658. END;
  4659. END;
  4660. END Copy5x2k;
  4661. PROCEDURE Copy1( matrixB: ADDRESS; K, N:SIZE );
  4662. VAR n, rest: SIZE;
  4663. BEGIN
  4664. rest := (-K) MOD 2;
  4665. IF debug THEN
  4666. KernelLog.String( ">>Copy1" ); KernelLog.Int( K, 10 ); KernelLog.Int( N, 10 ); KernelLog.Ln;
  4667. END;
  4668. n := N;
  4669. WHILE (n >= L1BlockN) DO (* store Kx5 Blocks as one line *)
  4670. Copy5x2k( matrixB, K );
  4671. IF debug THEN ASSERT( dest MOD 16 = 0 ); END;
  4672. INC( matrixB, L1BlockN * IncB ); DEC( n, L1BlockN );
  4673. END;
  4674. IF debug THEN KernelLog.String( "Copy1, n=" ); KernelLog.Int( n, 10 ); KernelLog.Ln;
  4675. END;
  4676. WHILE (n > 0) DO (* store remaining rectangle in separate lines *)
  4677. MovX( matrixB, dest, StrideB, K ); INC( dest, K * 8 );
4678. ZeroX( dest, rest ); INC( dest, rest * 8 ); (* zero-pad with LONGREALs, as in CopyMK and Copy5x2k *)
  4679. INC( matrixB, IncB ); DEC( n );
  4680. END;
  4681. END Copy1;
  4682. BEGIN
  4683. Tic( t ); ASSERT( L2BlockN MOD L1BlockN = 0 );
  4684. ASSERT( L2BlockK MOD 2 = 0 ); n := N;
  4685. WHILE (n >= L2BlockN) DO
  4686. k := K; adrB := matrixB;
  4687. WHILE (k >= L2BlockK) DO
  4688. Copy1( adrB, L2BlockK, L2BlockN );
  4689. INC( adrB, L2BlockK * StrideB ); DEC( k, L2BlockK );
  4690. END;
  4691. IF k > 0 THEN Copy1( adrB, k, L2BlockN ) END;
  4692. INC( matrixB, L2BlockN * IncB ); DEC( n, L2BlockN );
  4693. END;
  4694. IF (n = 0) THEN RETURN
  4695. END;
  4696. k := K; adrB := matrixB;
  4697. WHILE (k >= L2BlockK) DO
  4698. Copy1( adrB, L2BlockK, n );
  4699. INC( adrB, L2BlockK * StrideB ); DEC( k, L2BlockK );
  4700. END;
  4701. Copy1( adrB, k, n ); Toc( t, copyT );
  4702. END CopyBX;
  4703. PROCEDURE CopyBR( matrixB, dest: ADDRESS;
  4704. IncB, StrideB: SIZE;
  4705. N, K, L2BlockN, L2BlockK: SIZE );
  4706. VAR n, k: SIZE; adrB: ADDRESS; t: HUGEINT;
  4707. PROCEDURE Copy5x4k( matrixB: ADDRESS; k: SIZE );
  4708. VAR i: SIZE; adrB: ADDRESS; rest, k4: SIZE;
  4709. BEGIN
  4710. k4 := k - k MOD 4; rest := (-k) MOD 4;
  4711. IF k4 > 0 THEN
  4712. MovR5( matrixB, IncB, StrideB, dest, k4 );
4713. INC( matrixB, k4 * StrideB ); INC( dest, k4 * 80 DIV 4 ); (* k4 * 5 columns * 4 bytes *)
  4714. DEC( k, k4 );
  4715. END;
  4716. (*
  4717. WHILE (k >= 4) DO (* store 5x4 Block in line *)
  4718. adrB := matrixB;
  4719. FOR i := 1 TO L1BlockN DO
  4720. MovR( adrB, dest, StrideB, 4 ); INC( dest, 16 ); INC( adrB, IncB );
  4721. END;
  4722. INC( matrixB, 4 * StrideB ); DEC( k, 4 );
  4723. END;
  4724. *)
  4725. IF k > 0 THEN
  4726. adrB := matrixB;
  4727. FOR i := 1 TO L1BlockN DO
  4728. MovR( adrB, dest, StrideB, k ); INC( dest, 4 * k );
  4729. IF rest # 0 THEN
  4730. ZeroR( dest, rest ); INC( dest, rest * 4 );
  4731. END;
  4732. INC( adrB, IncB );
  4733. END;
  4734. END;
  4735. END Copy5x4k;
  4736. PROCEDURE Copy1( matrixB: ADDRESS; K, N:SIZE );
  4737. VAR n, rest: SIZE;
  4738. BEGIN
  4739. rest := (-K) MOD 4;
  4740. IF debug THEN
  4741. KernelLog.String( ">>Copy1" ); KernelLog.Int( K, 10 ); KernelLog.Int( N, 10 ); KernelLog.Ln;
  4742. END;
  4743. n := N;
  4744. WHILE (n >= L1BlockN) DO (* store Kx5 Blocks as one line *)
  4745. Copy5x4k( matrixB, K );
  4746. IF debug THEN ASSERT( dest MOD 16 = 0 ); END;
  4747. INC( matrixB, L1BlockN * IncB ); DEC( n, L1BlockN );
  4748. END;
  4749. WHILE (n > 0) DO (* store remaining rectangle in separate lines *)
  4750. MovR( matrixB, dest, StrideB, K ); INC( dest, K * 4 );
  4751. ZeroR( dest, rest ); INC( dest, rest * 4 );
  4752. INC( matrixB, IncB ); DEC( n );
  4753. END;
  4754. END Copy1;
  4755. BEGIN
  4756. Tic( t ); ASSERT( L2BlockN MOD L1BlockN = 0 );
  4757. ASSERT( L2BlockK MOD 4 = 0 ); n := N;
  4758. WHILE (n >= L2BlockN) DO
  4759. k := K; adrB := matrixB;
  4760. WHILE (k >= L2BlockK) DO
  4761. Copy1( adrB, L2BlockK, L2BlockN );
  4762. INC( adrB, L2BlockK * StrideB ); DEC( k, L2BlockK );
  4763. END;
  4764. IF k > 0 THEN Copy1( adrB, k, L2BlockN ) END;
  4765. INC( matrixB, L2BlockN * IncB ); DEC( n, L2BlockN );
  4766. END;
  4767. IF (n = 0) THEN RETURN
  4768. END;
  4769. k := K; adrB := matrixB;
  4770. WHILE (k >= L2BlockK) DO
  4771. Copy1( adrB, L2BlockK, n );
  4772. INC( adrB, L2BlockK * StrideB ); DEC( k, L2BlockK );
  4773. END;
  4774. Copy1( adrB, k, n ); Toc( t, copyT );
  4775. END CopyBR;
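(* CopyBX and CopyBR pack B in column groups of L1BlockN (5): Copy5x2k resp.
   Copy5x4k interleave 2 LONGREALs resp. 4 REALs of each of the 5 columns into
   one contiguous run, then advance to the next k-slice; MovR5 appears to do this
   interleaving for whole multiples of 4 k-values in a single call.  Columns left
   over when N is not a multiple of 5 are stored as plain zero-padded rows, which
   is what the trailing WHILE (n > 0) loops handle. *)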
  4776. (*
  4777. PROCEDURE FillMR( VAR A: ARRAY [ .. , .. ] OF REAL );
  4778. VAR i, j: LONGINT;
  4779. BEGIN
  4780. FOR i := 0 TO LEN( A, 0 ) - 1 DO
  4781. FOR j := 0 TO LEN( A, 1 ) - 1 DO
  4782. A[i, j] := ran.Dice( 10 );
  4783. IF debug THEN A[i, j] := 10 * i + j; END;
  4784. END;
  4785. END;
  4786. END FillMR;
  4787. PROCEDURE DispMR( VAR A: ARRAY [ .. , .. ] OF REAL );
  4788. VAR i, j: LONGINT;
  4789. BEGIN
  4790. FOR i := 0 TO LEN( A, 0 ) - 1 DO
  4791. FOR j := 0 TO LEN( A, 1 ) - 1 DO KernelLog.Int( ENTIER( A[i, j] + 0.5 ), 5 ); END;
  4792. KernelLog.Ln;
  4793. END;
  4794. END DispMR;
  4795. PROCEDURE FillMX( VAR A: ARRAY [ .. , .. ] OF LONGREAL );
  4796. VAR i, j: LONGINT;
  4797. BEGIN
  4798. FOR i := 0 TO LEN( A, 0 ) - 1 DO
  4799. FOR j := 0 TO LEN( A, 1 ) - 1 DO
  4800. A[i, j] := ran.Dice( 10 );
  4801. IF debug THEN A[i, j] := 10 * i + j; END;
  4802. END;
  4803. END;
  4804. END FillMX;
  4805. PROCEDURE DispMX( VAR A: ARRAY [ .. , .. ] OF LONGREAL );
  4806. VAR i, j: LONGINT;
  4807. BEGIN
  4808. FOR i := 0 TO LEN( A, 0 ) - 1 DO
  4809. FOR j := 0 TO LEN( A, 1 ) - 1 DO KernelLog.Int( ENTIER( A[i, j] + 0.5 ), 5 ); END;
  4810. KernelLog.Ln;
  4811. END;
  4812. END DispMX;
  4813. *)
  4814. PROCEDURE Tic( VAR t: HUGEINT );
  4815. BEGIN
  4816. t := Machine.GetTimer();
  4817. END Tic;
  4818. PROCEDURE Toc( VAR t, addto: HUGEINT );
  4819. BEGIN
  4820. INC( addto, Machine.GetTimer() - t ); t := Machine.GetTimer();
  4821. END Toc;
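(* Tic/Toc bracket a phase and accumulate its duration, in Machine.GetTimer
   ticks, into one of the module counters (allocT, copyT, zeroT, compT), e.g.
      Tic( t );  ... copy phase ...  Toc( t, copyT );
   Toc also restarts t, so consecutive phases can be timed back to back. *)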
  4822. PROCEDURE MultiplyX( A, B, C: ADDRESS;
  4823. M, N, K, L2BlockM, L2BlockN, L2BlockK:SIZE;
  4824. IncA, StrideA, IncB, StrideB, IncC, StrideC: SIZE;
  4825. add: BOOLEAN );
  4826. VAR lenA, lenB: SIZE; adrA, adrB, adrC: ADDRESS; m: SIZE;
  4827. M1, M2, i: SIZE; val: LONGREAL; t: HUGEINT;
  4828. inc: SIZE;
  4829. obj: POINTER TO ARRAY OF MultiplyObjectX;
  4830. cache: Cache;
  4831. BEGIN
  4832. NEW(obj,nrProcesses+1);
  4833. lenA := M * Align2( K ) * 8; lenB := N * Align2( K ) * 8;
  4834. cache := cachePool.Acquire( lenA + lenB );
  4835. adrA := cache.adr; adrB := adrA + lenA;
  4836. CopyAX( A, adrA, IncA, StrideA, K, M, L2BlockK, L2BlockM );
  4837. CopyBX( B, adrB, IncB, StrideB, N, K, L2BlockN, L2BlockK );
  4838. Tic( t ); m := M; adrC := C;
  4839. IF ~add THEN
  4840. WHILE (m > 0) DO
  4841. ZeroXI( adrC, IncC, N ); INC( adrC, StrideC ); DEC( m );
  4842. END;
  4843. END;
  4844. Toc( t, zeroT );
  4845. IF debug THEN
  4846. KernelLog.String( "copy of A: " ); KernelLog.Ln;
  4847. FOR i := 0 TO M * Align2( K ) - 1 DO
  4848. SYSTEM.GET( adrA + i * 8, val );
  4849. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); KernelLog.Ln;
  4850. END;
  4851. END;
  4852. IF debug THEN
  4853. KernelLog.String( "copy of B: " ); KernelLog.Ln;
  4854. FOR i := 0 TO N * Align2( K ) - 1 DO
  4855. SYSTEM.GET( adrB + i * 8, val );
  4856. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); KernelLog.Ln;
  4857. END;
  4858. END;
  4859. IF parallel & (M > L2BlockM) THEN
  4860. inc := Align( MAX(M DIV nrProcesses,L2BlockM), L2BlockM ); M1 := 0;
  4861. i := 0;
  4862. WHILE (M1 < M) DO
  4863. M2 := M1 + inc;
  4864. IF M2 > M THEN M2 := M END;
  4865. NEW( obj[i], adrA + M1 * Align2( K ) * 8, adrB,
  4866. C + StrideC * M1, M2 - M1, N, K, IncC, StrideC,
  4867. L2BlockM, L2BlockN, L2BlockK );
  4868. M1 := M2; INC( i );
  4869. END;
  4870. WHILE (i > 0) DO DEC( i ); obj[i].Wait; END;
  4871. ELSE
  4872. L3BlockX( adrA, adrB, C, M, N, K, IncC, StrideC, L2BlockM,
  4873. L2BlockN, L2BlockK );
  4874. END;
  4875. Toc( t, compT ); cachePool.Release( cache );
  4876. END MultiplyX;
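(* MultiplyX (and MultiplyR below) implement the blocked multiplication path:
   a single scratch buffer is taken from cachePool for the packed copies of A
   and B (CopyAX/CopyBX), C is cleared row by row with ZeroXI unless add is set,
   and the work is then either done in one call to L3BlockX or, when parallel
   and M > L2BlockM, split over MultiplyObjectX workers.  Each worker gets
   MAX( M DIV nrProcesses, L2BlockM ) rows of C, rounded up to a multiple of
   L2BlockM, so every worker operates on whole L2 blocks. *)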
  4877. PROCEDURE MultiplyR( A, B, C: ADDRESS;
  4878. M, N, K, L2BlockM, L2BlockN, L2BlockK: SIZE;
  4879. IncA, StrideA, IncB, StrideB, IncC, StrideC: SIZE;
  4880. add: BOOLEAN );
  4881. VAR lenA, lenB: SIZE; adrA, adrB, adrC: ADDRESS; m: SIZE;
  4882. M1, M2, i: SIZE; val: REAL; inc: SIZE;
  4883. obj: POINTER TO ARRAY OF MultiplyObjectR;
  4884. t: HUGEINT; cache: Cache;
  4885. BEGIN
  4886. NEW(obj,nrProcesses+1);
  4887. lenA := M * Align4( K ) * 4; lenB := N * Align4( K ) * 4;
  4888. cache := cachePool.Acquire( lenA + lenB );
  4889. adrA := cache.adr; adrB := adrA + lenA;
  4890. CopyAR( A, adrA, IncA, StrideA, K, M, L2BlockK, L2BlockM );
  4891. CopyBR( B, adrB, IncB, StrideB, N, K, L2BlockN, L2BlockK );
  4892. Tic( t ); m := M; adrC := C;
  4893. IF ~add THEN
  4894. WHILE (m > 0) DO
  4895. ZeroRI( adrC, IncC, N ); INC( adrC, StrideC );
  4896. DEC( m );
  4897. END;
  4898. END;
  4899. Toc( t, zeroT );
  4900. IF debug THEN
  4901. KernelLog.String( "copy of A: " ); KernelLog.Ln;
  4902. FOR i := 0 TO M * Align4( K ) - 1 DO
  4903. SYSTEM.GET( adrA + i * 4, val );
  4904. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); KernelLog.Ln;
  4905. END;
  4906. END;
  4907. IF debug THEN
  4908. KernelLog.String( "copy of B: " ); KernelLog.Ln;
  4909. FOR i := 0 TO N * Align4( K ) - 1 DO
  4910. SYSTEM.GET( adrB + i * 4, val );
  4911. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); KernelLog.Ln;
  4912. END;
  4913. END;
  4914. IF parallel & (M > L2BlockM) THEN
  4915. inc := Align( MAX(M DIV nrProcesses,L2BlockM), L2BlockM ); M1 := 0;
  4916. i := 0;
  4917. WHILE (M1 < M) DO
  4918. M2 := M1 + inc;
  4919. IF M2 > M THEN M2 := M END;
  4920. NEW( obj[i], adrA + M1 * Align4( K ) * 4, adrB,
  4921. C + StrideC * M1, M2 - M1, N, K, IncC, StrideC,
  4922. L2BlockM, L2BlockN, L2BlockK );
  4923. M1 := M2; INC( i );
  4924. END;
  4925. WHILE (i > 0) DO DEC( i ); obj[i].Wait; END;
  4926. ELSE
  4927. L3BlockR( adrA, adrB, C, M, N, K, IncC, StrideC, L2BlockM,
  4928. L2BlockN, L2BlockK );
  4929. END;
  4930. Toc( t, compT ); cachePool.Release( cache );
  4931. END MultiplyR;
  4932. (*
  4933. PROCEDURE DoTestX( M, N, K, L2BlockM, L2BlockN, L2BlockK: LONGINT; check: BOOLEAN; iter: LONGINT );
  4934. VAR (* A, B, C, D: ARRAY [ .. , .. ] OF REAL; *)
  4935. A, B, C, D: ARRAY [ .. , .. ] OF LONGREAL;
  4936. p, q: ANY; l1, l2: LONGINT; adrA, adrB, len, lenA, lenB: LONGINT;
  4937. matrixA, matrixB, matrixC: LONGINT; i: LONGINT; val: LONGREAL; atime, time: LONGINT;
  4938. BEGIN
  4939. KernelLog.String( "LONGREAL" ); KernelLog.Ln; KernelLog.String( "M=" ); KernelLog.Int( M, 0 ); KernelLog.Ln;
  4940. KernelLog.String( "N=" ); KernelLog.Int( N, 0 ); KernelLog.Ln; KernelLog.String( "K=" ); KernelLog.Int( K, 0 );
  4941. KernelLog.Ln; KernelLog.String( "L2BlockM=" ); KernelLog.Int( L2BlockM, 0 ); KernelLog.Ln; KernelLog.String( "L2BlockN=" );
  4942. KernelLog.Int( L2BlockN, 0 ); KernelLog.Ln; KernelLog.String( "L2BlockK=" ); KernelLog.Int( L2BlockK, 0 );
  4943. KernelLog.Ln; NEW( A, M, K ); NEW( B, K, N ); NEW( C, M, N ); FillMX( A ); FillMX( B );
  4944. IF debug THEN DispMX( A ); KernelLog.String( "*" ); KernelLog.Ln; DispMX( B );
  4945. END;
  4946. atime := Input.Time(); (* C := 0; *)
  4947. WHILE (iter > 0) DO
  4948. MultiplyX( ADDRESSOF( A[0, 0] ), ADDRESSOF( B[0, 0] ),
  4949. ADDRESSOF( C[0, 0] ), M, N, K, L2BlockM, L2BlockN, L2BlockK,
  4950. (*
  4951. 8,
  4952. LEN( A, 1 ) * 8, 8, LEN( B, 1 ) * 8, 8, LEN( C, 1 ) * 8
  4953. *)
  4954. SYSTEM.INCR( A, 1 ), SYSTEM.INCR( A, 0 ), SYSTEM.INCR( B, 1 ),
  4955. SYSTEM.INCR( B, 0 ), SYSTEM.INCR( C, 1 ), SYSTEM.INCR( C, 0 )
  4956. );
  4957. DEC( iter );
  4958. END;
  4959. atime := Input.Time() - atime; KernelLog.String( "overall time: " ); KernelLog.Int( atime, 10 ); KernelLog.Ln;
  4960. IF debug THEN
  4961. DispMX( A ); KernelLog.String( " * " ); KernelLog.Ln; DispMX( B ); KernelLog.String( " = " );
  4962. KernelLog.Ln; DispMX( C ); KernelLog.String( " ------------" ); KernelLog.Ln;
  4963. END;
  4964. IF check THEN
  4965. (*
  4966. NEW(D,M,N);
  4967. MatMulAXAXNaiive(ADDRESSOF( A[0, 0] ), ADDRESSOF( B[0, 0] ),
  4968. ADDRESSOF( D[0, 0] ), M, N, K, SYSTEM.INCR( A, 1 ), SYSTEM.INCR( A, 0 ), SYSTEM.INCR( B, 1 ),
  4969. SYSTEM.INCR( B, 0 ), SYSTEM.INCR( D, 1 ), SYSTEM.INCR( D, 0 ));
  4970. *)
  4971. D := A * B;
  4972. ASSERT ( ENTIER( D + 0.5 ) = ENTIER( C + 0.5 ) );
  4973. END;
  4974. END DoTestX;
  4975. PROCEDURE DoTestR( M, N, K, L2BlockM, L2BlockN, L2BlockK: LONGINT; check: BOOLEAN; iter: LONGINT );
  4976. VAR (* A, B, C, D: ARRAY [ .. , .. ] OF REAL; *)
  4977. A, B, C, D: ARRAY [ .. , .. ] OF REAL;
  4978. p, q: ANY; l1, l2: LONGINT; adrA, adrB, len, lenA, lenB: LONGINT;
  4979. matrixA, matrixB, matrixC: LONGINT; i: LONGINT; val: REAL; atime, time: LONGINT;
  4980. BEGIN
  4981. KernelLog.String( "REAL" ); KernelLog.Ln; KernelLog.String( "M=" ); KernelLog.Int( M, 0 ); KernelLog.Ln;
  4982. KernelLog.String( "N=" ); KernelLog.Int( N, 0 ); KernelLog.Ln; KernelLog.String( "K=" ); KernelLog.Int( K, 0 );
  4983. KernelLog.Ln; KernelLog.String( "L2BlockM=" ); KernelLog.Int( L2BlockM, 0 ); KernelLog.Ln; KernelLog.String( "L2BlockN=" );
  4984. KernelLog.Int( L2BlockN, 0 ); KernelLog.Ln; KernelLog.String( "L2BlockK=" ); KernelLog.Int( L2BlockK, 0 );
  4985. KernelLog.Ln; NEW( A, M, K ); NEW( B, K, N ); NEW( C, M, N ); FillMR( A ); FillMR( B );
  4986. IF debug THEN DispMR( A ); KernelLog.String( "*" ); KernelLog.Ln; DispMR( B );
  4987. END;
  4988. atime := Input.Time(); (* C := 0; *)
  4989. FOR i := 1 TO iter DO
  4990. MultiplyR( ADDRESSOF( A[0, 0] ), ADDRESSOF( B[0, 0] ),
  4991. ADDRESSOF( C[0, 0] ), M, N, K, L2BlockM, L2BlockN, L2BlockK,
  4992. (* 4,
  4993. LEN( A, 1 ) * 4, 4, LEN( B, 1 ) * 4, 4, LEN( C, 1 ) * 4 *)
  4994. SYSTEM.INCR( A, 1 ), SYSTEM.INCR( A, 0 ), SYSTEM.INCR( B, 1 ),
  4995. SYSTEM.INCR( B, 0 ), SYSTEM.INCR( C, 1 ), SYSTEM.INCR( C, 0 )
  4996. );
  4997. END;
  4998. atime := Input.Time() - atime; KernelLog.String( "overall time: " ); KernelLog.Int( atime, 10 ); KernelLog.Ln;
  4999. IF debug THEN
  5000. DispMR( A ); KernelLog.String( " * " ); KernelLog.Ln; DispMR( B ); KernelLog.String( " = " );
  5001. KernelLog.Ln; DispMR( C ); KernelLog.String( " ------------" ); KernelLog.Ln;
  5002. END;
  5003. IF check THEN
  5004. (*
  5005. NEW(D,M,N);
  5006. MatMulARARNaiive(ADDRESSOF( A[0, 0] ), ADDRESSOF( B[0, 0] ),
  5007. ADDRESSOF( D[0, 0] ), M, N, K, SYSTEM.INCR( A, 1 ), SYSTEM.INCR( A, 0 ), SYSTEM.INCR( B, 1 ),
  5008. SYSTEM.INCR( B, 0 ), SYSTEM.INCR( D, 1 ), SYSTEM.INCR( D, 0 ));
  5009. *)
  5010. D := A * B;
  5011. ASSERT ( ENTIER( D + 0.5 ) = ENTIER( C + 0.5 ) );
  5012. END;
  5013. END DoTestR;
  5014. PROCEDURE RandTestR*;
  5015. VAR iter, i, time: LONGINT;
  5016. MinM, MaxM, MinN, MaxN, MinK, MaxK, M, N, K, BM, BN, BK: LONGINT;
  5017. PROCEDURE Ran( Min, Max: LONGINT ): LONGINT;
  5018. BEGIN
  5019. IF Min = Max THEN RETURN Min
  5020. ELSE RETURN ran.Dice( Max - Min ) + Min
  5021. END;
  5022. END Ran;
  5023. BEGIN
  5024. In.Open(); In.LongInt( iter );
  5025. In.LongInt( MinM ); In.LongInt( MaxM ); In.LongInt( MinN );
  5026. In.LongInt( MaxN ); In.LongInt( MinK ); In.LongInt( MaxK );
  5027. FOR i := 1 TO iter DO KernelLog.Int( i, 10 ); KernelLog.Ln; DEC( iter );
  5028. M := Ran( MinM, MaxM ); N := Ran( MinN, MaxN );
  5029. K := Ran( MinK, MaxK );
  5030. IF N < 5 THEN N := 5 END;
  5031. IF K < 4 THEN K := 4 END;
  5032. BM := ran.Dice( M ) + 1; BN := ran.Dice( N ) + 1; BK := ran.Dice( K ) + 1;
  5033. BN := Align( BN, 5 );
  5034. IF BN > N THEN DEC( BN, 5 ) END;
  5035. BK := Align( BK, 4 );
  5036. IF BK > K THEN DEC( BK, 4 ) END;
  5037. DoTestR( M, N, K, BM, BN, BK, TRUE , 1 );
  5038. END;
  5039. END RandTestR;
  5040. PROCEDURE RandTestX*;
  5041. VAR iter, i, time: LONGINT;
  5042. MinM, MaxM, MinN, MaxN, MinK, MaxK, M, N, K, BM, BN, BK: LONGINT;
  5043. PROCEDURE Ran( Min, Max: LONGINT ): LONGINT;
  5044. BEGIN
  5045. IF Min = Max THEN RETURN Min
  5046. ELSE RETURN ran.Dice( Max - Min ) + Min
  5047. END;
  5048. END Ran;
  5049. BEGIN
  5050. In.Open(); In.LongInt( iter );
  5051. In.LongInt( MinM ); In.LongInt( MaxM ); In.LongInt( MinN );
  5052. In.LongInt( MaxN ); In.LongInt( MinK ); In.LongInt( MaxK );
  5053. FOR i := 1 TO iter DO KernelLog.Int( i, 10 ); KernelLog.Ln; DEC( iter );
  5054. M := Ran( MinM, MaxM ); N := Ran( MinN, MaxN );
  5055. K := Ran( MinK, MaxK );
  5056. IF N < 5 THEN N := 5 END;
  5057. IF K < 4 THEN K := 4 END;
  5058. BM := ran.Dice( M ) + 1; BN := ran.Dice( N ) + 1; BK := ran.Dice( K ) + 1;
  5059. BN := Align( BN, 5 );
  5060. IF BN > N THEN DEC( BN, 5 ) END;
  5061. BK := Align( BK, 4 );
  5062. IF BK > K THEN DEC( BK, 4 ) END;
  5063. DoTestX( M, N, K, BM, BN, BK, TRUE , 1 );
  5064. END;
  5065. END RandTestX;
  5066. *)
  5067. (*
  5068. PROCEDURE Times*;
  5069. VAR all: HUGEINT;
  5070. BEGIN
  5071. all := allocT + copyT + zeroT + compT; KernelLog.String( "alloc=" );
  5072. KernelLog.LongRealFix( allocT / 1000000, 0, 20 ); KernelLog.String( "[" );
  5073. KernelLog.Int( ENTIER( 100 * allocT / all + 0.5 ), 5 ); KernelLog.String( "%]" );
  5074. KernelLog.Ln; KernelLog.String( "copy=" );
  5075. KernelLog.LongRealFix( copyT / 1000000, 0, 20 ); KernelLog.String( "[" );
  5076. KernelLog.Int( ENTIER( 100 * copyT / all + 0.5 ), 5 ); KernelLog.String( "%]" );
  5077. KernelLog.Ln; KernelLog.String( "zero=" );
  5078. KernelLog.LongRealFix( zeroT / 1000000, 0, 20 ); KernelLog.String( "[" );
  5079. KernelLog.Int( ENTIER( 100 * zeroT / all + 0.5 ), 5 ); KernelLog.String( "%]" );
  5080. KernelLog.Ln; KernelLog.String( "comp=" );
  5081. KernelLog.LongRealFix( compT / 1000000, 0, 20 ); KernelLog.String( "[" );
  5082. KernelLog.Int( ENTIER( 100 * compT / all + 0.5 ), 5 ); KernelLog.String( "%]" );
  5083. KernelLog.Ln;
  5084. END Times;
  5085. *)
  5086. (*
  5087. PROCEDURE TestRMM*;
  5088. VAR M, N, K, L2BlockM, L2BlockN, L2BlockK: LONGINT; matrixA, matrixB, matrixC: LONGINT;
  5089. check, iter: LONGINT;
  5090. BEGIN
  5091. In.Open; In.LongInt( M ); In.LongInt( N ); In.LongInt( K );
  5092. In.LongInt( L2BlockM ); In.LongInt( L2BlockN ); In.LongInt( L2BlockK );
  5093. In.LongInt( iter ); In.LongInt( check );
  5094. IF L2BlockM = 0 THEN
  5095. MagicBlockR( M DIV nrProcesses, N, K, L2BlockM, L2BlockN, L2BlockK );
  5096. END;
  5097. DoTestR( M, N, K, L2BlockM, L2BlockN, L2BlockK, check = 1, iter ); Times();
  5098. END TestRMM;
  5099. PROCEDURE TestXMM*;
  5100. VAR M, N, K, L2BlockM, L2BlockN, L2BlockK: LONGINT; matrixA, matrixB, matrixC: LONGINT;
  5101. iter, check: LONGINT;
  5102. BEGIN
  5103. In.Open; In.LongInt( M ); In.LongInt( N ); In.LongInt( K );
  5104. In.LongInt( L2BlockM ); In.LongInt( L2BlockN ); In.LongInt( L2BlockK );
  5105. In.LongInt( iter ); In.LongInt( check );
  5106. IF L2BlockM = 0 THEN
  5107. MagicBlockX( M DIV nrProcesses, N, K, L2BlockM, L2BlockN, L2BlockK );
  5108. END;
  5109. DoTestX( M, N, K, L2BlockM, L2BlockN, L2BlockK, check = 1, iter ); Times();
  5110. END TestXMM;
  5111. *)
  5112. (****** matrix multiplication using fast scalar product ******)
  5113. PROCEDURE MatMulAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  5114. BEGIN
  5115. SYSTEM.PUT( dadr, 0.0D0 ); (* initialization of scalar product to 0 *)
  5116. SPAXAXLoopA( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  5117. END MatMulAXAXLoopA;
  5118. PROCEDURE MatMulAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  5119. BEGIN
  5120. SYSTEM.PUT( dadr, 0.0D0 ); (* initialization of scalar product to 0 *)
  5121. SPAXAXLoopSSE( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  5122. END MatMulAXAXLoopSSE;
  5123. PROCEDURE MatMulARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  5124. BEGIN
  5125. SYSTEM.PUT( dadr, 0.0E0 ); (* initialization of scalar product to 0 *)
  5126. SPARARLoopA( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  5127. END MatMulARARLoopA;
  5128. PROCEDURE MatMulARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  5129. BEGIN
  5130. SYSTEM.PUT( dadr, 0.0E0 ); (* initialization of scalar product to 0 *)
  5131. SPARARLoopSSE( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  5132. END MatMulARARLoopSSE;
  5133. PROCEDURE MatMulIncAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  5134. BEGIN
  5135. SPAXAXLoopA( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  5136. END MatMulIncAXAXLoopA;
  5137. PROCEDURE MatMulIncAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  5138. BEGIN
  5139. SPAXAXLoopSSE( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  5140. END MatMulIncAXAXLoopSSE;
  5141. PROCEDURE MatMulIncARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  5142. BEGIN
  5143. SPARARLoopA( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  5144. END MatMulIncARARLoopA;
  5145. PROCEDURE MatMulIncARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  5146. BEGIN
  5147. SPARARLoopSSE( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  5148. END MatMulIncARARLoopSSE;
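(* The MatMul...LoopA/LoopSSE procedures above compute one element of C as a
   scalar product of a row of A and a column of B.  The plain variants first
   store 0 at the destination; the ...Inc... variants omit that store and hence
   accumulate into the existing value (C[i,j] := C[i,j] + row*column). *)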
5149. (****** matrix multiplication over rows with transposition of B ******)
  5150. PROCEDURE MatMulHBlockR( MatrixA, MatrixB, MatrixC: ADDRESS;
  5151. (*inc=4*) Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
  5152. add: BOOLEAN );
  5153. VAR fromA, toA, fromB, toB: SIZE; BlockSize: SIZE;
  5154. (*
  5155. computation of C[i,j] = Sum{k=0..Cols-1} A[i,k]*B[j,k], i.e. A*B`
  5156. *)
  5157. PROCEDURE Block( fromA, toA, fromB, toB: SIZE );
  5158. VAR i, j: SIZE; adrA, adrB, adrC: ADDRESS;
  5159. BEGIN
  5160. FOR i := fromA TO toA - 1 DO
  5161. adrA := MatrixA + i * Stride;
  5162. FOR j := fromB TO toB - 1 DO
  5163. adrB := MatrixB + j * Stride;
  5164. adrC := MatrixC + i * StrideC + j * IncC;
  5165. AlignedSPRSSE( adrA, adrB, adrC, Cols, add );
  5166. END;
  5167. END;
  5168. END Block;
  5169. BEGIN
  5170. IF cBlockSize = 0 THEN
  5171. BlockSize := L2CacheSize DIV Stride DIV 4;
  5172. ELSE BlockSize := cBlockSize;
  5173. END;
  5174. lastUsedBlockSize := BlockSize;
  5175. fromA := 0;
  5176. REPEAT
  5177. toA := fromA + BlockSize;
  5178. IF toA > RowsA THEN toA := RowsA END;
  5179. fromB := 0;
  5180. REPEAT
  5181. toB := fromB + BlockSize;
  5182. IF toB > RowsB THEN toB := RowsB END;
  5183. Block( fromA, toA, fromB, toB ); fromB := toB;
  5184. UNTIL toB = RowsB;
  5185. fromA := toA;
  5186. UNTIL toA = RowsA;
  5187. END MatMulHBlockR;
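(* MatMulHBlockR (and MatMulHBlockX below) expect B already stored transposed,
   so every C[i,j] is a dot product of two rows with identical stride, computed
   by AlignedSPRSSE resp. AlignedSPXSSE.  The REPEAT loops tile the row ranges
   of A and of the transposed B into chunks of BlockSize rows; unless cBlockSize
   overrides it, BlockSize = L2CacheSize DIV Stride DIV 4 (REAL) resp. DIV 8
   (LONGREAL), presumably so that one block of A rows and one block of B rows
   stay resident in the L2 cache together. *)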
  5188. PROCEDURE MatMulHBlockX( MatrixA, MatrixB, MatrixC: ADDRESS;
  5189. (*inc=4*) Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
  5190. add: BOOLEAN );
  5191. VAR fromA, toA, fromB, toB: SIZE; BlockSize: SIZE;
  5192. (*
  5193. computation of C[i,j] = Sum{k=0..Cols-1} A[i,k]*B[j,k], i.e. A*B`
  5194. *)
  5195. PROCEDURE Block( fromA, toA, fromB, toB: SIZE );
  5196. VAR adrA, adrB, adrC: ADDRESS; i, j: SIZE;
  5197. BEGIN
  5198. FOR i := fromA TO toA - 1 DO
  5199. adrA := MatrixA + i * Stride;
  5200. FOR j := fromB TO toB - 1 DO
  5201. adrB := MatrixB + j * Stride;
  5202. adrC := MatrixC + i * StrideC + j * IncC;
  5203. AlignedSPXSSE( adrA, adrB, adrC, Cols, add );
  5204. END;
  5205. END;
  5206. END Block;
  5207. BEGIN
  5208. IF cBlockSize = 0 THEN
  5209. BlockSize := L2CacheSize DIV Stride DIV 8;
  5210. ELSE BlockSize := cBlockSize;
  5211. END;
  5212. lastUsedBlockSize := BlockSize;
  5213. fromA := 0;
  5214. REPEAT
  5215. toA := fromA + BlockSize;
  5216. IF toA > RowsA THEN toA := RowsA END;
  5217. fromB := 0;
  5218. REPEAT
  5219. toB := fromB + BlockSize;
  5220. IF toB > RowsB THEN toB := RowsB END;
  5221. Block( fromA, toA, fromB, toB ); fromB := toB;
  5222. UNTIL toB = RowsB;
  5223. fromA := toA;
  5224. UNTIL toA = RowsA;
  5225. END MatMulHBlockX;
  5226. PROCEDURE CopyDataR( src, dest: ADDRESS; incSrc, strideSrc, incDest, strideDest, rows, cols:SIZE ); (*! optimize *)
  5227. VAR i: SIZE; t: HUGEINT;
  5228. BEGIN
  5229. Tic( t );
  5230. FOR i := 0 TO rows - 1 DO
  5231. Copy4( src, dest, incSrc, incDest, cols );
  5232. INC( src, strideSrc ); INC( dest, strideDest );
  5233. END;
  5234. Toc( t, copyT );
  5235. END CopyDataR;
  5236. PROCEDURE CopyDataX( src, dest: ADDRESS; incSrc, strideSrc, incDest, strideDest, rows, cols:SIZE ); (*! optimize *)
  5237. VAR i: SIZE; t: HUGEINT;
  5238. BEGIN
  5239. Tic( t );
  5240. FOR i := 0 TO rows - 1 DO
  5241. Copy8( src, dest, incSrc, incDest, cols );
  5242. INC( src, strideSrc ); INC( dest, strideDest );
  5243. END;
  5244. Toc( t, copyT );
  5245. END CopyDataX;
  5246. PROCEDURE MatMulARARTransposed( matrixA, matrixB, matrixC: ADDRESS;
  5247. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  5248. add: BOOLEAN ): BOOLEAN;
  5249. VAR stride: SIZE; adrB, adrC: ADDRESS;
  5250. proc: POINTER TO ARRAY OF MatMulHObjR;
  5251. from, to0, i: SIZE; cacheA, cacheB: Cache;
  5252. t: HUGEINT;
  5253. BEGIN
  5254. NEW(proc,nrProcesses);
  5255. ASSERT( ColsA = RowsB );
  5256. (* allocate 128 bit = 16 byte aligned matrix *)
  5257. stride := Align( ColsA * SIZEOF( REAL ), 16 );
  5258. IF (IncA # SIZEOF( REAL )) OR (StrideA # stride) OR
  5259. (matrixA MOD 16 # 0) THEN
  5260. cacheA := cachePool.Acquire( stride * RowsA );
  5261. CopyDataR( matrixA, cacheA.adr, IncA, StrideA,
  5262. SIZEOF( REAL ), stride, RowsA, ColsA ); (* copy to array *)
  5263. matrixA := cacheA.adr;
  5264. ELSE cacheA := NIL;
  5265. END;
  5266. IF (StrideB # SIZEOF( REAL )) OR (IncB # stride) OR
  5267. (matrixB MOD 16 # 0) THEN
  5268. cacheB := cachePool.Acquire( stride * ColsB );
  5269. CopyDataR( matrixB, cacheB.adr, StrideB, IncB,
  5270. SIZEOF( REAL ), stride, ColsB, RowsB ); (* copy to transposed array *)
  5271. matrixB := cacheB.adr;
  5272. ELSE cacheB := NIL;
  5273. END;
  5274. Tic( t );
5275. (*! needs a decision rule whether to split by rows or by columns *)
  5276. IF nrProcesses > 1 THEN
  5277. from := 0;
  5278. FOR i := 0 TO nrProcesses - 1 DO
  5279. (*
  5280. to := RowsA * (i + 1) DIV nrProcesses; adrA := matrixA + from * stride;
  5281. adrC := matrixC + from * StrideC;
  5282. *)
  5283. to0 := ColsB * (i + 1) DIV nrProcesses; (*! move to rows ? *)
  5284. adrB := matrixB + from * stride;
  5285. adrC := matrixC + from * IncC;
  5286. NEW( proc[i], matrixA, adrB, adrC, stride, IncC, StrideC,
  5287. RowsA, to0 - from, RowsB, add );
  5288. from := to0;
  5289. END;
  5290. FOR i := 0 TO nrProcesses - 1 DO proc[i].Wait(); END;
  5291. ELSE
  5292. MatMulHBlockR( matrixA, matrixB, matrixC, stride, IncC,
  5293. StrideC, RowsA, ColsB, RowsB, add );
  5294. END;
  5295. Toc( t, compT ); cachePool.Release( cacheA );
  5296. cachePool.Release( cacheB ); RETURN TRUE;
  5297. END MatMulARARTransposed;
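(* MatMulARARTransposed normalises its operands before calling MatMulHBlockR:
   A must have unit element increment, a row stride padded to a multiple of
   16 bytes and a 16-byte aligned base address, and B is needed in transposed
   form with the same layout; whichever operand does not match is copied with
   CopyDataR into a cachePool buffer.  With nrProcesses > 1 the columns of B
   (rows of the transposed copy) are divided evenly among MatMulHObjR workers.
   MatMulAXAXTransposed below is the LONGREAL twin. *)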
  5298. PROCEDURE MatMulAXAXTransposed( matrixA, matrixB, matrixC: ADDRESS;
  5299. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  5300. add: BOOLEAN ): BOOLEAN;
  5301. VAR stride: SIZE; adrB, adrC: ADDRESS;
  5302. proc: POINTER TO ARRAY OF MatMulHObjX;
  5303. from, to0, i: SIZE; cacheA, cacheB: Cache;
  5304. t: HUGEINT;
  5305. BEGIN
  5306. NEW(proc,nrProcesses);
  5307. ASSERT( ColsA = RowsB );
  5308. stride := Align( ColsA * SIZEOF( LONGREAL ), 16 );
  5309. IF (IncA # SIZEOF( LONGREAL )) OR (StrideA # stride) OR
  5310. (matrixA MOD 16 # 0) THEN
  5311. cacheA := cachePool.Acquire( stride * RowsA );
  5312. CopyDataX( matrixA, cacheA.adr, IncA, StrideA,
  5313. SIZEOF( LONGREAL ), stride, RowsA, ColsA ); (* copy to array *)
  5314. matrixA := cacheA.adr;
  5315. ELSE cacheA := NIL;
  5316. END;
  5317. IF (StrideB # SIZEOF( LONGREAL )) OR (IncB # stride) OR
  5318. (matrixB MOD 16 # 0) THEN
  5319. cacheB := cachePool.Acquire( stride * ColsB );
  5320. CopyDataX( matrixB, cacheB.adr, StrideB, IncB,
  5321. SIZEOF( LONGREAL ), stride, ColsB, RowsB ); (* copy to transposed array *)
  5322. matrixB := cacheB.adr;
  5323. ELSE cacheB := NIL;
  5324. END;
  5325. Tic( t );
  5326. IF nrProcesses > 1 THEN
  5327. from := 0;
  5328. FOR i := 0 TO nrProcesses - 1 DO
  5329. to0 := ColsB * (i + 1) DIV nrProcesses; (*! move to rows ? *)
  5330. adrB := matrixB + from * stride;
  5331. adrC := matrixC + from * IncC;
  5332. NEW( proc[i], matrixA, adrB, adrC, stride, IncC, StrideC,
  5333. RowsA, to0 - from, RowsB, add );
  5334. from := to0;
  5335. END;
  5336. FOR i := 0 TO nrProcesses - 1 DO proc[i].Wait(); END;
  5337. ELSE
  5338. MatMulHBlockX( matrixA, matrixB, matrixC, stride, IncC,
  5339. StrideC, RowsA, ColsB, RowsB, add );
  5340. END;
  5341. Toc( t, compT ); cachePool.Release( cacheA );
  5342. cachePool.Release( cacheB ); RETURN TRUE;
  5343. END MatMulAXAXTransposed;
  5344. (****** strided matrix multiplication with restrictions to increments ******)
  5345. PROCEDURE MatMulARARSSEStride( matrixA, matrixB, matrixC: ADDRESS;
  5346. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  5347. add: BOOLEAN ): BOOLEAN;
  5348. VAR sum: REAL; CbFrom, i, j, k: SIZE; valA, valB: REAL;
  5349. adrA, adrB, adrC: ADDRESS;
  5350. cacheA, cacheB, cacheC: Cache;
  5351. matrixCO, StrideCO, IncCO: SIZE; t: HUGEINT;
  5352. (*VAR fromA, toA: LONGINT; *)
  5353. BEGIN
  5354. IF (IncA # SIZEOF( REAL )) THEN
  5355. cacheA :=
  5356. cachePool.Acquire( RowsA * ColsA * SIZEOF( REAL ) );
  5357. CopyDataR( matrixA, cacheA.adr, IncA, StrideA,
  5358. SIZEOF( REAL ), SIZEOF( REAL ) * ColsA, RowsA,
  5359. ColsA );
  5360. matrixA := cacheA.adr; IncA := SIZEOF( REAL );
  5361. StrideA := SIZEOF( REAL ) * ColsA;
  5362. END;
  5363. IF (IncB # SIZEOF( REAL )) THEN
  5364. cacheB :=
  5365. cachePool.Acquire( RowsB * ColsB * SIZEOF( REAL ) );
  5366. CopyDataR( matrixB, cacheB.adr, IncB, StrideB,
  5367. SIZEOF( REAL ), SIZEOF( REAL ) * ColsB, RowsB,
  5368. ColsB );
  5369. matrixB := cacheB.adr; IncB := SIZEOF( REAL );
  5370. StrideB := SIZEOF( REAL ) * ColsB;
  5371. END;
  5372. IF (IncC # SIZEOF( REAL )) THEN
  5373. cacheC :=
  5374. cachePool.Acquire( RowsA * ColsB * SIZEOF( REAL ) );
  5375. CopyDataR( matrixC, cacheC.adr, IncC, StrideC,
  5376. SIZEOF( REAL ), SIZEOF( REAL ) * ColsB, RowsA,
  5377. ColsB );
  5378. matrixCO := matrixC; StrideCO := StrideC;
  5379. IncCO := IncC; matrixC := cacheC.adr;
  5380. IncC := SIZEOF( REAL ); StrideC := SIZEOF( REAL ) * ColsB;
  5381. END;
  5382. Tic( t );
  5383. CbFrom := 0;
  5384. IF ColsB >= 24 THEN
  5385. SSEMul24BlockR( CbFrom, StrideA, StrideB, StrideC,
  5386. ColsA, RowsA, ColsB, RowsB, matrixA,
  5387. matrixB, matrixC, add );
  5388. END;
  5389. IF ColsB - CbFrom >= 16 THEN
  5390. SSEMul16BlockR( StrideA, StrideB, StrideC, ColsA, RowsA,
  5391. CbFrom, matrixA, matrixB, matrixC, add );
  5392. INC( CbFrom, 16 );
  5393. END;
  5394. IF ColsB - CbFrom >= 8 THEN
  5395. SSEMul8BlockR( StrideA, StrideB, StrideC, ColsA, RowsA,
  5396. CbFrom, matrixA, matrixB, matrixC, add );
  5397. INC( CbFrom, 8 );
  5398. END;
  5399. IF ColsB - CbFrom >= 4 THEN
  5400. SSEMul4BlockR( StrideA, StrideB, StrideC, ColsA, RowsA,
  5401. CbFrom, matrixA, matrixB, matrixC, add );
  5402. INC( CbFrom, 4 );
  5403. END;
  5404. IF ColsB - CbFrom > 0 THEN
  5405. (* do it in Oberon *)
  5406. FOR i := 0 TO RowsA - 1 DO
  5407. adrC := matrixC + i * StrideC + CbFrom * IncC;
  5408. FOR j := CbFrom TO ColsB - 1 DO
  5409. adrA := matrixA + i * StrideA;
  5410. adrB := matrixB + j * IncB;
  5411. IF add THEN SYSTEM.GET( adrC, sum )
  5412. ELSE sum := 0
  5413. END;
  5414. FOR k := 0 TO RowsB - 1 DO
  5415. SYSTEM.GET( adrA, valA );
  5416. SYSTEM.GET( adrB, valB );
  5417. sum := sum + (* A[i, k] * B[k, j] *) valA * valB;
  5418. INC( adrA, IncA ); INC( adrB, StrideB );
  5419. END;
  5420. SYSTEM.PUT( adrC, sum ); INC( adrC, IncC );
  5421. (* C[i, j] := sum; *)
  5422. END;
  5423. END;
  5424. END;
  5425. Toc( t, compT );
  5426. IF cacheC # NIL THEN
  5427. CopyDataR( matrixC, matrixCO, IncC, StrideC, IncCO,
  5428. StrideCO, RowsA, ColsB );
  5429. END;
  5430. cachePool.Release( cacheA );
  5431. cachePool.Release( cacheB );
  5432. cachePool.Release( cacheC );
  5433. RETURN TRUE;
  5434. END MatMulARARSSEStride;
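(* MatMulARARSSEStride requires unit element increments: any operand with
   IncA/IncB/IncC # SIZEOF( REAL ) is first compacted with CopyDataR (and C is
   copied back at the end).  The columns of C are then processed in panels of
   24, 16, 8 and 4 by the SSEMul*BlockR kernels, and the remaining ColsB - CbFrom
   columns are finished by the plain Oberon triple loop.  The LONGREAL variant
   below works the same way with panels of 12, 8, 4 and 2. *)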
  5435. PROCEDURE MatMulAXAXSSEStride( matrixA, matrixB, matrixC: ADDRESS;
  5436. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  5437. add: BOOLEAN ): BOOLEAN;
  5438. VAR sum: LONGREAL; CbFrom, i, j, k: SIZE;
  5439. valA, valB: LONGREAL; adrA, adrB, adrC: ADDRESS;
  5440. cacheA, cacheB, cacheC: Cache;
  5441. matrixCO, StrideCO, IncCO:SIZE; t: HUGEINT;
  5442. BEGIN
  5443. IF (IncA # SIZEOF( LONGREAL )) THEN
  5444. cacheA :=
  5445. cachePool.Acquire( RowsA * ColsA * SIZEOF( LONGREAL ) );
  5446. CopyDataX( matrixA, cacheA.adr, IncA, StrideA,
  5447. SIZEOF( LONGREAL ),
  5448. SIZEOF( LONGREAL ) * ColsA, RowsA, ColsA );
  5449. matrixA := cacheA.adr;
  5450. StrideA := SIZEOF( LONGREAL ) * ColsA;
  5451. IncA := SIZEOF( LONGREAL );
  5452. END;
  5453. IF (IncB # SIZEOF( LONGREAL )) THEN
  5454. cacheB :=
  5455. cachePool.Acquire( RowsB * ColsB * SIZEOF( LONGREAL ) );
  5456. CopyDataX( matrixB, cacheB.adr, IncB, StrideB,
  5457. SIZEOF( LONGREAL ),
  5458. SIZEOF( LONGREAL ) * ColsB, RowsB, ColsB );
  5459. matrixB := cacheB.adr;
  5460. StrideB := SIZEOF( LONGREAL ) * ColsB;
  5461. IncB := SIZEOF( LONGREAL );
  5462. END;
  5463. IF (IncC # SIZEOF( LONGREAL )) THEN
  5464. cacheC :=
  5465. cachePool.Acquire( RowsA * ColsB * SIZEOF( LONGREAL ) );
  5466. CopyDataX( matrixC, cacheC.adr, IncC, StrideC,
  5467. SIZEOF( LONGREAL ),
  5468. SIZEOF( LONGREAL ) * ColsB, RowsA, ColsB );
  5469. matrixCO := matrixC; StrideCO := StrideC;
  5470. IncCO := IncC; StrideC := SIZEOF( LONGREAL ) * ColsB;
  5471. IncC := SIZEOF( LONGREAL ); matrixC := cacheC.adr;
  5472. END;
  5473. Tic( t );
  5474. CbFrom := 0;
  5475. IF ColsB >= 12 THEN
  5476. SSEMul12BlockX( CbFrom, StrideA, StrideB, StrideC,
  5477. ColsA, RowsA, ColsB, RowsB, matrixA,
  5478. matrixB, matrixC, add );
  5479. END;
  5480. IF ColsB - CbFrom >= 8 THEN
  5481. SSEMul8BlockX( StrideA, StrideB, StrideC, ColsA, RowsA,
  5482. CbFrom, matrixA, matrixB, matrixC, add );
  5483. INC( CbFrom, 8 );
  5484. END;
  5485. IF ColsB - CbFrom >= 4 THEN
  5486. SSEMul4BlockX( StrideA, StrideB, StrideC, ColsA, RowsA,
  5487. CbFrom, matrixA, matrixB, matrixC, add );
  5488. INC( CbFrom, 4 );
  5489. END;
  5490. IF ColsB - CbFrom >= 2 THEN
  5491. SSEMul2BlockX( StrideA, StrideB, StrideC, ColsA, RowsA,
  5492. CbFrom, matrixA, matrixB, matrixC, add );
  5493. INC( CbFrom, 2 );
  5494. END;
  5495. IF ColsB - CbFrom > 0 THEN
  5496. (* do it in Oberon *)
  5497. FOR i := 0 TO RowsA - 1 DO
  5498. adrC := matrixC + i * StrideC + CbFrom * IncC;
  5499. FOR j := CbFrom TO ColsB - 1 DO
  5500. adrA := matrixA + i * StrideA;
  5501. adrB := matrixB + j * IncB;
  5502. IF add THEN SYSTEM.GET( adrC, sum )
  5503. ELSE sum := 0
  5504. END;
  5505. FOR k := 0 TO RowsB - 1 DO
  5506. SYSTEM.GET( adrA, valA );
  5507. SYSTEM.GET( adrB, valB );
  5508. sum := sum + (* A[i, k] * B[k, j] *) valA * valB;
  5509. INC( adrA, IncA ); INC( adrB, StrideB );
  5510. END;
  5511. SYSTEM.PUT( adrC, sum ); INC( adrC, IncC );
  5512. (* C[i, j] := sum; *)
  5513. END;
  5514. END;
  5515. END;
  5516. Toc( t, compT );
  5517. IF cacheC # NIL THEN
  5518. CopyDataX( matrixC, matrixCO, IncC, StrideC, IncCO,
  5519. StrideCO, RowsA, ColsB );
  5520. END;
  5521. cachePool.Release( cacheA );
  5522. cachePool.Release( cacheB );
  5523. cachePool.Release( cacheC );
  5524. RETURN TRUE;
  5525. END MatMulAXAXSSEStride;
5526. (****** naive Oberon matrix multiplication ******)
  5527. PROCEDURE MatMulARARNaiive( matrixA, matrixB, matrixC: ADDRESS;
  5528. IncA, StrideA, IncB, StrideB, IncC, StrideC, M, N, K: SIZE;
  5529. add: BOOLEAN );
  5530. (*
  5531. A is M x K matrix, M=rows (A); K=cols(A);
  5532. B is K x N matrix; K=rows(B); N = cols(B);
  5533. C is M x N matrix; M=rows(C); N=cols(C);
  5534. *)
  5535. VAR adrA, adrB, innerB, adrC: ADDRESS; i, j, k: SIZE;
  5536. val1, val2, sum: REAL; t: HUGEINT;
  5537. BEGIN
  5538. Tic( t );
  5539. FOR i := 1 TO M DO
  5540. adrC := matrixC; adrB := matrixB;
  5541. FOR j := 1 TO N DO
  5542. adrA := matrixA; innerB := adrB;
  5543. IF add THEN SYSTEM.GET( adrC, sum ) ELSE sum := 0 END;
  5544. FOR k := 1 TO K DO
  5545. SYSTEM.GET( adrA, val1 );
  5546. SYSTEM.GET( innerB, val2 );
  5547. sum := sum + val1 * val2; INC( adrA, IncA );
  5548. INC( innerB, StrideB );
  5549. END;
  5550. SYSTEM.PUT( adrC, sum ); INC( adrB, IncB );
  5551. INC( adrC, IncC );
  5552. END;
  5553. INC( matrixA, StrideA ); INC( matrixC, StrideC );
  5554. END;
  5555. Toc( t, compT );
  5556. END MatMulARARNaiive;
  5557. PROCEDURE MatMulAXAXNaiive( matrixA, matrixB, matrixC: ADDRESS;
  5558. IncA, StrideA, IncB, StrideB, IncC, StrideC, M, N, K: SIZE;
  5559. add: BOOLEAN );
  5560. (*
  5561. A is M x K matrix, M=rows (A); K=cols(A);
  5562. B is K x N matrix; K=rows(B); N = cols(B);
  5563. C is M x N matrix; M=rows(C); N=cols(C);
  5564. *)
  5565. VAR adrA, adrB, innerB, adrC: ADDRESS; i, j, k: SIZE;
  5566. val1, val2, sum: LONGREAL; t: HUGEINT;
  5567. BEGIN
  5568. Tic( t );
  5569. FOR i := 1 TO M DO
  5570. adrC := matrixC; adrB := matrixB;
  5571. FOR j := 1 TO N DO
  5572. adrA := matrixA; innerB := adrB;
  5573. IF add THEN SYSTEM.GET( adrC, sum ) ELSE sum := 0 END;
  5574. FOR k := 1 TO K DO
  5575. SYSTEM.GET( adrA, val1 );
  5576. SYSTEM.GET( innerB, val2 );
  5577. sum := sum + val1 * val2; INC( adrA, IncA );
  5578. INC( innerB, StrideB );
  5579. END;
  5580. SYSTEM.PUT( adrC, sum ); INC( adrB, IncB );
  5581. INC( adrC, IncC );
  5582. END;
  5583. INC( matrixA, StrideA ); INC( matrixC, StrideC );
  5584. END;
  5585. Toc( t, compT );
  5586. END MatMulAXAXNaiive;
  5587. (*
  5588. PROCEDURE Toggle( VAR A, B: LONGINT );
  5589. VAR temp: LONGINT;
  5590. BEGIN
  5591. temp := A; A := B; B := temp;
  5592. END Toggle;
  5593. PROCEDURE Transpose( VAR matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, M, N, K: LONGINT );
  5594. (*
  5595. prepare computation of C=A*B via C = (B` * A`)`
  5596. *)
  5597. BEGIN
  5598. Toggle( matrixA, matrixB ); Toggle( IncA, StrideB ); Toggle( StrideA, IncB );
  5599. Toggle( IncC, StrideC ); Toggle( M, N );
  5600. END Transpose;
  5601. *)
  5602. (*
  5603. *)
  5604. PROCEDURE BestMethod( M, N, K: SIZE ): LONGINT;
  5605. BEGIN
  5606. IF M = 1 THEN
  5607. IF N < 32 THEN RETURN cMatMulScalarProduct
  5608. ELSIF N < 256 THEN
  5609. IF K < 256 THEN RETURN cMatMulScalarProduct
  5610. ELSE RETURN cMatMulStride
  5611. END;
  5612. ELSE RETURN cMatMulStride
  5613. END;
  5614. ELSIF N = 1 THEN
  5615. IF (M > 1024) & (K > 1024) THEN
  5616. RETURN cMatMulTransposed
  5617. ELSE RETURN cMatMulScalarProduct
  5618. END;
  5619. ELSIF K = 1 THEN
  5620. IF N < 32 THEN
  5621. IF M < 256 THEN RETURN cMatMulNaive
  5622. ELSE RETURN cMatMulStride
  5623. END;
  5624. ELSIF N < 256 THEN
  5625. IF M < 32 THEN RETURN cMatMulNaive
  5626. ELSE RETURN cMatMulStride
  5627. END;
  5628. ELSE RETURN cMatMulStride
  5629. END;
  5630. ELSIF M < 32 THEN
  5631. IF N < 32 THEN RETURN cMatMulScalarProduct
  5632. ELSIF N < 256 THEN
  5633. IF K < 32 THEN RETURN cMatMulScalarProduct
  5634. ELSE RETURN cMatMulStride
  5635. END;
  5636. ELSE RETURN cMatMulStride
  5637. END;
  5638. ELSIF M < 256 THEN
  5639. IF N < 32 THEN
  5640. IF K < 32 THEN RETURN cMatMulScalarProduct
  5641. ELSE RETURN cMatMulStride
  5642. END;
  5643. ELSE
  5644. IF K < 256 THEN RETURN cMatMulStride
  5645. ELSE RETURN cMatMulBlocked
  5646. END;
  5647. END;
  5648. ELSE
  5649. IF N < 32 THEN RETURN cMatMulStride ELSE
  5650. IF K < 256 THEN RETURN cMatMulStride
  5651. ELSE RETURN cMatMulBlocked
  5652. END;
  5653. END;
  5654. END;
  5655. RETURN cMatMulStride;
  5656. END BestMethod;
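(* BestMethod is a static dispatch heuristic used by the MatMulR/X and
   MatMulIncR/X wrappers below: vector-like shapes (M = 1, N = 1 or K = 1) and
   small matrices go to the scalar-product or naive code, mid-sized problems to
   the strided SSE code, and large problems with large K to the blocked,
   cache-copying code.  The thresholds (32, 256, 1024) are tuning constants
   rather than hard requirements. *)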
5657. (*
5658.            (N)               (K)               (N)
5659.          CCCCCC            AAAAA             BBBBB
5660.          CCCCCC            AAAAA             BBBBB
5661.    (M)   CCCCCC   =  (M)   AAAAA   *   (K)   BBBBB
5662.          CCCCCC            AAAAA             BBBBB
5663.          CCCCCC            AAAAA             BBBBB
5664. *)
  5665. PROCEDURE MatMulR( matrixA, matrixB, matrixC: ADDRESS;
  5666. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
5667. (*! heuristics for the choice between methods need improvement *)
5668. (*! transpose if superior *)
5669. (*! provide a special variant for small [up to 4x4] matrices *)
  5670. VAR M, N, K: SIZE;
  5671. BEGIN
  5672. ASSERT( ColsA = RowsB );
  5673. M := RowsA; N := ColsB; K := ColsA;
  5674. CASE BestMethod( M, N, K ) OF
  5675. | cMatMulScalarProduct:
  5676. RETURN FALSE;
  5677. | cMatMulNaive:
  5678. RETURN MatMulRNaive( matrixA, matrixB, matrixC, IncA,
  5679. StrideA, IncB, StrideB, IncC,
  5680. StrideC, RowsA, ColsA, RowsB,
  5681. ColsB );
  5682. | cMatMulTransposed:
  5683. RETURN MatMulARARTransposed( matrixA, matrixB,
  5684. matrixC, IncA,
  5685. StrideA, IncB,
  5686. StrideB, IncC,
  5687. StrideC, RowsA,
  5688. ColsA, RowsB,
  5689. ColsB, FALSE );
  5690. | cMatMulStride:
  5691. RETURN MatMulARARSSEStride( matrixA, matrixB,
  5692. matrixC, IncA, StrideA,
  5693. IncB, StrideB, IncC,
  5694. StrideC, RowsA,
  5695. ColsA, RowsB, ColsB,
  5696. FALSE );
  5697. | cMatMulBlocked:
  5698. RETURN MatMulARARBlocked( matrixA, matrixB,
  5699. matrixC, IncA, StrideA,
  5700. IncB, StrideB, IncC,
  5701. StrideC, RowsA, ColsA,
  5702. RowsB, ColsB, FALSE );
  5703. ELSE
  5704. RETURN FALSE (* use scalar product for each row and column *)
  5705. END;
  5706. END MatMulR;
  5707. PROCEDURE MatMulX( matrixA, matrixB, matrixC: ADDRESS;
  5708. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5709. VAR M, N, K: SIZE;
  5710. BEGIN
  5711. ASSERT( ColsA = RowsB ); M := RowsA; N := ColsB;
  5712. K := ColsA;
  5713. (*
  5714. KernelLog.String("MatMulX, M,N,K = "); KernelLog.Int(M,10); KernelLog.Int(N,10); KernelLog.Int(K,10); KernelLog.Ln;
  5715. KernelLog.String("Method= "); KernelLog.Int( BestMethod(M,N,K),10); KernelLog.Ln;
  5716. *)
  5717. CASE BestMethod( M, N, K ) OF
  5718. | cMatMulScalarProduct:
  5719. RETURN FALSE;
  5720. | cMatMulNaive:
  5721. RETURN MatMulXNaive( matrixA, matrixB, matrixC, IncA,
  5722. StrideA, IncB, StrideB, IncC,
  5723. StrideC, RowsA, ColsA, RowsB,
  5724. ColsB );
  5725. | cMatMulTransposed:
  5726. RETURN MatMulAXAXTransposed( matrixA, matrixB,
  5727. matrixC, IncA,
  5728. StrideA, IncB, StrideB,
  5729. IncC, StrideC, RowsA,
  5730. ColsA, RowsB, ColsB,
  5731. FALSE );
  5732. | cMatMulStride:
  5733. RETURN MatMulAXAXSSEStride( matrixA, matrixB,
  5734. matrixC, IncA, StrideA,
  5735. IncB, StrideB, IncC,
  5736. StrideC, RowsA, ColsA,
  5737. RowsB, ColsB,
  5738. FALSE );
  5739. | cMatMulBlocked:
  5740. RETURN MatMulAXAXBlocked( matrixA, matrixB,
  5741. matrixC, IncA, StrideA,
  5742. IncB, StrideB, IncC,
  5743. StrideC, RowsA, ColsA,
  5744. RowsB, ColsB, FALSE );
  5745. ELSE
  5746. RETURN FALSE (* use scalar product for each row and column *)
  5747. END;
  5748. END MatMulX;
  5749. PROCEDURE MatMulIncR( matrixA, matrixB, matrixC: ADDRESS;
  5750. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
5751. (*! heuristics for the choice between methods need improvement *)
5752. (*! transpose if superior *)
5753. (*! provide a special variant for small [up to 4x4] matrices *)
  5754. VAR M, N, K: SIZE;
  5755. BEGIN
  5756. ASSERT( ColsA = RowsB ); M := RowsA; N := ColsB;
  5757. K := ColsA;
  5758. CASE BestMethod( M, N, K ) OF
  5759. | cMatMulScalarProduct:
  5760. RETURN FALSE;
  5761. | cMatMulNaive:
  5762. RETURN MatMulIncRNaive( matrixA, matrixB, matrixC,
  5763. IncA, StrideA, IncB, StrideB,
  5764. IncC, StrideC, RowsA, ColsA,
  5765. RowsB, ColsB );
  5766. | cMatMulTransposed:
  5767. RETURN MatMulARARTransposed( matrixA, matrixB,
  5768. matrixC, IncA,
  5769. StrideA, IncB,
  5770. StrideB, IncC,
  5771. StrideC, RowsA,
  5772. ColsA, RowsB,
  5773. ColsB, TRUE );
  5774. | cMatMulStride:
  5775. RETURN MatMulARARSSEStride( matrixA, matrixB,
  5776. matrixC, IncA, StrideA,
  5777. IncB, StrideB, IncC,
  5778. StrideC, RowsA,
  5779. ColsA, RowsB, ColsB,
  5780. TRUE );
  5781. | cMatMulBlocked:
  5782. RETURN MatMulARARBlocked( matrixA, matrixB,
  5783. matrixC, IncA, StrideA,
  5784. IncB, StrideB, IncC,
  5785. StrideC, RowsA, ColsA,
  5786. RowsB, ColsB, TRUE );
  5787. ELSE
  5788. RETURN FALSE (* use scalar product for each row and column *)
  5789. END;
  5790. END MatMulIncR;
  5791. PROCEDURE MatMulIncX( matrixA, matrixB, matrixC: ADDRESS;
  5792. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5793. VAR M, N, K: SIZE;
  5794. BEGIN
  5795. ASSERT( ColsA = RowsB ); M := RowsA; N := ColsB;
  5796. K := ColsA;
  5797. CASE BestMethod( M, N, K ) OF
  5798. | cMatMulScalarProduct:
  5799. RETURN FALSE;
  5800. | cMatMulNaive:
  5801. RETURN MatMulIncXNaive( matrixA, matrixB, matrixC,
  5802. IncA, StrideA, IncB, StrideB,
  5803. IncC, StrideC, RowsA, ColsA,
  5804. RowsB, ColsB );
  5805. | cMatMulTransposed:
  5806. RETURN MatMulAXAXTransposed( matrixA, matrixB,
  5807. matrixC, IncA,
  5808. StrideA, IncB, StrideB,
  5809. IncC, StrideC, RowsA,
  5810. ColsA, RowsB, ColsB,
  5811. TRUE );
  5812. | cMatMulStride:
  5813. RETURN MatMulAXAXSSEStride( matrixA, matrixB,
  5814. matrixC, IncA, StrideA,
  5815. IncB, StrideB, IncC,
  5816. StrideC, RowsA, ColsA,
  5817. RowsB, ColsB, TRUE );
  5818. | cMatMulBlocked:
  5819. RETURN MatMulAXAXBlocked( matrixA, matrixB,
  5820. matrixC, IncA, StrideA,
  5821. IncB, StrideB, IncC,
  5822. StrideC, RowsA, ColsA,
  5823. RowsB, ColsB, TRUE );
  5824. ELSE
  5825. RETURN FALSE (* use scalar product for each row and column *)
  5826. END;
  5827. END MatMulIncX;
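(* All four dispatchers (MatMulR, MatMulX, MatMulIncR, MatMulIncX) return FALSE
   both in the cMatMulScalarProduct case and in the ELSE branch; FALSE tells the
   caller that no optimised routine ran, so the generic scalar-product fallback
   must be used.  The Inc variants pass add = TRUE to the selected routine and
   therefore add to C instead of overwriting it. *)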
  5828. PROCEDURE MatMulARARBlocked( matrixA, matrixB, matrixC: ADDRESS;
  5829. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  5830. add: BOOLEAN ): BOOLEAN;
  5831. VAR M, N, K, L2M, L2N, L2K: SIZE;
  5832. BEGIN
  5833. ASSERT( ColsA = RowsB ); M := RowsA; N := ColsB;
  5834. K := ColsA; MagicBlockR( M, N, K, L2M, L2N, L2K );
  5835. (*
  5836. MatMulARARNaiive( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB,
  5837. IncC, StrideC, RowsA, ColsB, ColsA );
  5838. *)
  5839. MultiplyR( matrixA, matrixB, matrixC, RowsA, ColsB, ColsA,
  5840. L2M, L2N, L2K, IncA, StrideA, IncB, StrideB, IncC,
  5841. StrideC, add );
  5842. RETURN TRUE;
  5843. END MatMulARARBlocked;
  5844. PROCEDURE MatMulAXAXBlocked( matrixA, matrixB, matrixC: ADDRESS;
  5845. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  5846. add: BOOLEAN ): BOOLEAN;
  5847. VAR M, N, K, L2M, L2N, L2K: SIZE;
  5848. BEGIN
  5849. ASSERT( ColsA = RowsB ); M := RowsA; N := ColsB;
  5850. K := ColsA; MagicBlockX( M, N, K, L2M, L2N, L2K );
  5851. (*
  5852. MatMulARARNaiive( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB,
  5853. IncC, StrideC, RowsA, ColsB, ColsA );
  5854. *)
  5855. MultiplyX( matrixA, matrixB, matrixC, RowsA, ColsB, ColsA,
  5856. L2M, L2N, L2K, IncA, StrideA, IncB, StrideB, IncC,
  5857. StrideC, add );
  5858. RETURN TRUE;
  5859. END MatMulAXAXBlocked;
PROCEDURE MatMulRNaive( matrixA, matrixB, matrixC: ADDRESS;
	IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
BEGIN
	MatMulARARNaiive( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC,
		RowsA, ColsB, ColsA, FALSE );
	RETURN TRUE;
END MatMulRNaive;
PROCEDURE MatMulXNaive( matrixA, matrixB, matrixC: ADDRESS;
	IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
BEGIN
	MatMulAXAXNaiive( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC,
		RowsA, ColsB, ColsA, FALSE );
	RETURN TRUE;
END MatMulXNaive;
PROCEDURE MatMulIncRNaive( matrixA, matrixB, matrixC: ADDRESS;
	IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
BEGIN
	MatMulARARNaiive( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC,
		RowsA, ColsB, ColsA, TRUE );
	RETURN TRUE;
END MatMulIncRNaive;
PROCEDURE MatMulIncXNaive( matrixA, matrixB, matrixC: ADDRESS;
	IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
BEGIN
	MatMulAXAXNaiive( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC,
		RowsA, ColsB, ColsA, TRUE );
	RETURN TRUE;
END MatMulIncXNaive;
PROCEDURE MatMulXTransposed( matrixA, matrixB, matrixC: ADDRESS;
	IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
BEGIN
	RETURN MatMulAXAXTransposed( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB,
		IncC, StrideC, RowsA, ColsA, RowsB, ColsB, FALSE );
END MatMulXTransposed;
PROCEDURE MatMulIncXTransposed( matrixA, matrixB, matrixC: ADDRESS;
	IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
BEGIN
	RETURN MatMulAXAXTransposed( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB,
		IncC, StrideC, RowsA, ColsA, RowsB, ColsB, TRUE );
END MatMulIncXTransposed;
PROCEDURE MatMulRTransposed( matrixA, matrixB, matrixC: ADDRESS;
	IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
BEGIN
	RETURN MatMulARARTransposed( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB,
		IncC, StrideC, RowsA, ColsA, RowsB, ColsB, FALSE );
END MatMulRTransposed;
PROCEDURE MatMulIncRTransposed( matrixA, matrixB, matrixC: ADDRESS;
	IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
BEGIN
	RETURN MatMulARARTransposed( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB,
		IncC, StrideC, RowsA, ColsA, RowsB, ColsB, TRUE );
END MatMulIncRTransposed;
PROCEDURE MatMulXSSEStride( matrixA, matrixB, matrixC: ADDRESS;
	IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
BEGIN
	RETURN MatMulAXAXSSEStride( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB,
		IncC, StrideC, RowsA, ColsA, RowsB, ColsB, FALSE );
END MatMulXSSEStride;
PROCEDURE MatMulIncXSSEStride( matrixA, matrixB, matrixC: ADDRESS;
	IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
BEGIN
	RETURN MatMulAXAXSSEStride( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB,
		IncC, StrideC, RowsA, ColsA, RowsB, ColsB, TRUE );
END MatMulIncXSSEStride;
PROCEDURE MatMulRSSEStride( matrixA, matrixB, matrixC: ADDRESS;
	IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
BEGIN
	RETURN MatMulARARSSEStride( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB,
		IncC, StrideC, RowsA, ColsA, RowsB, ColsB, FALSE );
END MatMulRSSEStride;
PROCEDURE MatMulIncRSSEStride( matrixA, matrixB, matrixC: ADDRESS;
	IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
BEGIN
	RETURN MatMulARARSSEStride( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB,
		IncC, StrideC, RowsA, ColsA, RowsB, ColsB, TRUE );
END MatMulIncRSSEStride;
PROCEDURE MatMulRBlocked( matrixA, matrixB, matrixC: ADDRESS;
	IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
BEGIN
	RETURN MatMulARARBlocked( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB,
		IncC, StrideC, RowsA, ColsA, RowsB, ColsB, FALSE );
END MatMulRBlocked;
PROCEDURE MatMulIncRBlocked( matrixA, matrixB, matrixC: ADDRESS;
	IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
BEGIN
	RETURN MatMulARARBlocked( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB,
		IncC, StrideC, RowsA, ColsA, RowsB, ColsB, TRUE );
END MatMulIncRBlocked;
PROCEDURE MatMulXBlocked( matrixA, matrixB, matrixC: ADDRESS;
	IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
BEGIN
	RETURN MatMulAXAXBlocked( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB,
		IncC, StrideC, RowsA, ColsA, RowsB, ColsB, FALSE );
END MatMulXBlocked;
PROCEDURE MatMulIncXBlocked( matrixA, matrixB, matrixC: ADDRESS;
	IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
BEGIN
	RETURN MatMulAXAXBlocked( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB,
		IncC, StrideC, RowsA, ColsA, RowsB, ColsB, TRUE );
END MatMulIncXBlocked;
PROCEDURE SetMatMulMethod*( i: LONGINT );
BEGIN
	KernelLog.String( "ArrayBaseOptimized, method = " );
	IF i = cMatMulDynamic THEN
		KernelLog.String( "dynamic." );
		ArrayBase.matMulIncR := MatMulIncR;  ArrayBase.matMulIncX := MatMulIncX;
		ArrayBase.matMulR := MatMulR;  ArrayBase.matMulX := MatMulX;
	ELSIF i = cMatMulScalarProduct THEN
		KernelLog.String( "scalarproduct." );
		ArrayBase.matMulIncR := NIL;  ArrayBase.matMulIncX := NIL;
		ArrayBase.matMulR := NIL;  ArrayBase.matMulX := NIL;
	ELSIF i = cMatMulNaive THEN
		KernelLog.String( "naive." );
		ArrayBase.matMulR := MatMulRNaive;  ArrayBase.matMulX := MatMulXNaive;
		ArrayBase.matMulIncR := MatMulIncRNaive;  ArrayBase.matMulIncX := MatMulIncXNaive;
	ELSIF i = cMatMulTransposed THEN
		KernelLog.String( "transposed." );
		ArrayBase.matMulR := MatMulRTransposed;  ArrayBase.matMulX := MatMulXTransposed;
		ArrayBase.matMulIncR := MatMulIncRTransposed;  ArrayBase.matMulIncX := MatMulIncXTransposed;
	ELSIF i = cMatMulStride THEN
		KernelLog.String( "stride." );
		ArrayBase.matMulR := MatMulRSSEStride;  ArrayBase.matMulX := MatMulXSSEStride;
		ArrayBase.matMulIncR := MatMulIncRSSEStride;  ArrayBase.matMulIncX := MatMulIncXSSEStride;
	ELSIF i = cMatMulBlocked THEN
		KernelLog.String( "blocked." );
		ArrayBase.matMulR := MatMulRBlocked;  ArrayBase.matMulX := MatMulXBlocked;
		ArrayBase.matMulIncR := MatMulIncRBlocked;  ArrayBase.matMulIncX := MatMulIncXBlocked;
	END;
	KernelLog.Ln;
END SetMatMulMethod;
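(* Usage sketch (added comment): the method can be chosen programmatically, e.g.
       SetMatMulMethod( cMatMulBlocked );
   or from the shell via the InstallMatMul command with one of the keywords
   "dynamic", "scalarproduct", "naive", "transposed", "stride" or "blocked"
   (see InstallMatMul below and the command lines at the end of the file). *)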
(* optimizations for small arrays (Alexey Morozov) *)
(* assumes that the arrays do not overlap *)
(* FIXME: use MOVAPS when Felix fixes problems with alignment!!! *)
PROCEDURE MatMulR2x2( dadr, ladr, radr: ADDRESS );
CODE{SYSTEM.i386, SYSTEM.SSE2}
	MOV EBX, [EBP+radr]	; EBX := ADDR(right)
	MOV EAX, [EBP+ladr]	; EAX := ADDR(left)
	MOV ECX, [EBP+dadr]	; ECX := ADDR(dest)
	MOVUPS XMM0, [EAX]	; XMM0 := [a00,a01,a10,a11]
	MOVUPS XMM1, [EBX]	; XMM1 := [b00,b01,b10,b11]
	MOVAPS XMM2, XMM1
	SHUFPS XMM2, XMM1, 204	; XMM2 := [b00,b11,b00,b11]
	MULPS XMM2, XMM0	; XMM2 := [a00*b00,a01*b11,a10*b00,a11*b11]
	SHUFPS XMM0, XMM0, 177	; XMM0 := [a01,a00,a11,a10]
	SHUFPS XMM1, XMM1, 102	; XMM1 := [b10,b01,b10,b01]
	MULPS XMM1, XMM0	; XMM1 := [a01*b10,a00*b01,a11*b10,a10*b01]
	ADDPS XMM1, XMM2	; XMM1 := [c00,c01,c10,c11]
	MOVUPS [ECX], XMM1
END MatMulR2x2;
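(* Reference (added): the kernel above computes the row-major 2x2 product
       c00 = a00*b00 + a01*b10,   c01 = a00*b01 + a01*b11,
       c10 = a10*b00 + a11*b10,   c11 = a10*b01 + a11*b11.
   The scalar sketch below is illustration only and is not installed anywhere; the name
   MatMulR2x2Ref is ours, and it assumes SYSTEM.GET/PUT on 32-bit REAL elements. *)
PROCEDURE MatMulR2x2Ref( dadr, ladr, radr: ADDRESS );
VAR a00, a01, a10, a11, b00, b01, b10, b11: REAL;
BEGIN
	SYSTEM.GET( ladr, a00 );  SYSTEM.GET( ladr + 4, a01 );
	SYSTEM.GET( ladr + 8, a10 );  SYSTEM.GET( ladr + 12, a11 );
	SYSTEM.GET( radr, b00 );  SYSTEM.GET( radr + 4, b01 );
	SYSTEM.GET( radr + 8, b10 );  SYSTEM.GET( radr + 12, b11 );
	SYSTEM.PUT( dadr, a00*b00 + a01*b10 );  SYSTEM.PUT( dadr + 4, a00*b01 + a01*b11 );
	SYSTEM.PUT( dadr + 8, a10*b00 + a11*b10 );  SYSTEM.PUT( dadr + 12, a10*b01 + a11*b11 );
END MatMulR2x2Ref;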
(* based on weighted sum of rows (Alexey Morozov) *)
(* FIXME: use MOVAPS when Felix fixes problems with alignment!!! *)
PROCEDURE MatMulR3x3( dadr, ladr, radr: ADDRESS );
CODE{SYSTEM.i386, SYSTEM.SSE2}
	MOV EBX, [EBP+radr]	; EBX := ADDR(right)
	MOV EAX, [EBP+ladr]	; EAX := ADDR(left)
	MOV ECX, [EBP+dadr]	; ECX := ADDR(dest)
	MOVUPS XMM0, [EBX]	; XMM0 := [b00,b01,b02,-]
	MOVUPS XMM1, [EBX+12]	; XMM1 := [b10,b11,b12,-]
	; last element is out of range, is it still OK?
	MOVUPS XMM2, [EBX+24]	; XMM2 := [b20,b21,b22,-]
	;MOVLPS XMM2, [EBX+24]
	;MOVSS XMM3, [EBX+32]
	;MOVLHPS XMM2, XMM3
	MOVSS XMM3, [EAX]
	SHUFPS XMM3, XMM3, 0	; XMM3 := [a00,a00,a00,-]
	MOVAPS XMM4, XMM0
	MULPS XMM4, XMM3
	MOVSS XMM3, [EAX+4]
	SHUFPS XMM3, XMM3, 0	; XMM3 := [a01,a01,a01,-]
	MULPS XMM3, XMM1
	ADDPS XMM4, XMM3
	MOVSS XMM3, [EAX+8]
	SHUFPS XMM3, XMM3, 0	; XMM3 := [a02,a02,a02,-]
	MULPS XMM3, XMM2
	ADDPS XMM4, XMM3
	MOVUPS [ECX], XMM4	; C[0,*] := a00*B[0,*] + a01*B[1,*] + a02*B[2,*]
	;***************************************************;
	MOVSS XMM3, [EAX+12]
	SHUFPS XMM3, XMM3, 0	; XMM3 := [a10,a10,a10,-]
	MOVAPS XMM4, XMM0
	MULPS XMM4, XMM3
	MOVSS XMM3, [EAX+16]
	SHUFPS XMM3, XMM3, 0	; XMM3 := [a11,a11,a11,-]
	MULPS XMM3, XMM1
	ADDPS XMM4, XMM3
	MOVSS XMM3, [EAX+20]
	SHUFPS XMM3, XMM3, 0	; XMM3 := [a12,a12,a12,-]
	MULPS XMM3, XMM2
	ADDPS XMM4, XMM3
	MOVUPS [ECX+12], XMM4	; C[1,*] := a10*B[0,*] + a11*B[1,*] + a12*B[2,*]
	;***************************************************;
	MOVSS XMM3, [EAX+24]
	SHUFPS XMM3, XMM3, 0	; XMM3 := [a20,a20,a20,-]
	MOVAPS XMM4, XMM0
	MULPS XMM4, XMM3
	MOVSS XMM3, [EAX+28]
	SHUFPS XMM3, XMM3, 0	; XMM3 := [a21,a21,a21,-]
	MULPS XMM3, XMM1
	ADDPS XMM4, XMM3
	MOVSS XMM3, [EAX+32]
	SHUFPS XMM3, XMM3, 0	; XMM3 := [a22,a22,a22,-]
	MULPS XMM3, XMM2
	ADDPS XMM4, XMM3
	;MOVUPS [ECX+24], XMM4
	MOVLPS [ECX+24], XMM4	; store c20,c21
	MOVHLPS XMM4, XMM4
	MOVSS [ECX+32], XMM4	; store c22 without writing past the end of C
END MatMulR3x3;
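(* Summary (added comment): each output row of the 3x3 kernel is built as a weighted sum of the rows of B,
       C[i,*] = a_i0 * B[0,*] + a_i1 * B[1,*] + a_i2 * B[2,*],   i = 0..2,
   e.g. for row 0:  [c00,c01,c02] = a00*[b00,b01,b02] + a01*[b10,b11,b12] + a02*[b20,b21,b22].
   Each scalar a_ij is broadcast into a whole register (SHUFPS ..., 0) before the MULPS/ADDPS pair.
   Note that the unaligned 16-byte loads of the 12-byte rows read one float past each row,
   and past the end of B for the last row, as already flagged in the comment above. *)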
(* based on Strassen algorithm (Alexey Morozov) *)
(* FIXME: use MOVAPS when Felix fixes issues with alignment!!! *)
PROCEDURE MatMulR4x4( dadr, ladr, radr: ADDRESS );
CODE{SYSTEM.i386, SYSTEM.SSE2}
	MOV EBX, [EBP+radr]	; EBX := ADDR(right)
	MOV EAX, [EBP+ladr]	; EAX := ADDR(left)
	MOV ECX, [EBP+dadr]	; ECX := ADDR(dest)
	; load A00
	MOVLPS XMM0, [EAX]	; XMM0 := [a00,a01,-,-]
	MOVHPS XMM0, [EAX+16]	; XMM0 := [a00,a01,a10,a11]
	; load A01
	MOVLPS XMM1, [EAX+8]	; XMM1 := [a02,a03,-,-]
	MOVHPS XMM1, [EAX+24]	; XMM1 := [a02,a03,a12,a13]
	; load B00
	MOVLPS XMM2, [EBX]	; XMM2 := [b00,b01,-,-]
	MOVHPS XMM2, [EBX+16]	; XMM2 := [b00,b01,b10,b11]
	; load B01
	MOVLPS XMM3, [EBX+8]	; XMM3 := [b02,b03,-,-]
	MOVHPS XMM3, [EBX+24]	; XMM3 := [b02,b03,b12,b13]
	; load B10
	MOVLPS XMM4, [EBX+32]	; XMM4 := [b20,b21,-,-]
	MOVHPS XMM4, [EBX+48]	; XMM4 := [b20,b21,b30,b31]
	; load B11
	MOVLPS XMM5, [EBX+40]	; XMM5 := [b22,b23,-,-]
	MOVHPS XMM5, [EBX+56]	; XMM5 := [b22,b23,b32,b33]
	;****************************************************;
	; multiply A00(D)*B00(E) (use MatMulR2x2 code)
	MOVAPS XMM6, XMM2
	SHUFPS XMM6, XMM6, 204	; XMM6 := [e00,e11,e00,e11]
	MULPS XMM6, XMM0
	SHUFPS XMM0, XMM0, 177	; XMM0 := [d01,d00,d11,d10]
	MOVAPS XMM7, XMM2
	SHUFPS XMM7, XMM7, 102	; XMM7 := [e10,e01,e10,e01]
	MULPS XMM7, XMM0
	ADDPS XMM7, XMM6
	; multiply A01(D)*B10(E)
	MOVAPS XMM0, XMM4
	SHUFPS XMM0, XMM0, 204	; XMM0 := [e00,e11,e00,e11]
	MULPS XMM0, XMM1
	SHUFPS XMM1, XMM1, 177	; XMM1 := [d01,d00,d11,d10]
	MOVAPS XMM6, XMM4
	SHUFPS XMM6, XMM6, 102	; XMM6 := [e10,e01,e10,e01]
	MULPS XMM6, XMM1
	ADDPS XMM6, XMM0
	ADDPS XMM7, XMM6
	MOVLPS [ECX], XMM7	; store C00
	MOVHPS [ECX+16], XMM7
	;****************************************************;
	; reload A00
	MOVLPS XMM0, [EAX]	; XMM0 := [a00,a01,-,-]
	MOVHPS XMM0, [EAX+16]	; XMM0 := [a00,a01,a10,a11]
	; reload A01
	MOVLPS XMM1, [EAX+8]	; XMM1 := [a02,a03,-,-]
	MOVHPS XMM1, [EAX+24]	; XMM1 := [a02,a03,a12,a13]
	; multiply A00(D)*B01(E) (use MatMulR2x2 code)
	MOVAPS XMM6, XMM3
	SHUFPS XMM6, XMM6, 204	; XMM6 := [e00,e11,e00,e11]
	MULPS XMM6, XMM0
	SHUFPS XMM0, XMM0, 177	; XMM0 := [d01,d00,d11,d10]
	MOVAPS XMM7, XMM3
	SHUFPS XMM7, XMM7, 102	; XMM7 := [e10,e01,e10,e01]
	MULPS XMM7, XMM0
	ADDPS XMM7, XMM6
	; multiply A01(D)*B11(E)
	MOVAPS XMM0, XMM5
	SHUFPS XMM0, XMM0, 204	; XMM0 := [e00,e11,e00,e11]
	MULPS XMM0, XMM1
	SHUFPS XMM1, XMM1, 177	; XMM1 := [d01,d00,d11,d10]
	MOVAPS XMM6, XMM5
	SHUFPS XMM6, XMM6, 102	; XMM6 := [e10,e01,e10,e01]
	MULPS XMM6, XMM1
	ADDPS XMM6, XMM0
	ADDPS XMM7, XMM6
	MOVLPS [ECX+8], XMM7	; store C01
	MOVHPS [ECX+24], XMM7
	;****************************************************;
	; load A10
	MOVLPS XMM0, [EAX+32]	; XMM0 := [a20,a21,-,-]
	MOVHPS XMM0, [EAX+48]	; XMM0 := [a20,a21,a30,a31]
	; load A11
	MOVLPS XMM1, [EAX+40]	; XMM1 := [a22,a23,-,-]
	MOVHPS XMM1, [EAX+56]	; XMM1 := [a22,a23,a32,a33]
	; multiply A10(D)*B00(E) (use MatMulR2x2 code)
	MOVAPS XMM6, XMM2
	SHUFPS XMM6, XMM6, 204	; XMM6 := [e00,e11,e00,e11]
	MULPS XMM6, XMM0
	SHUFPS XMM0, XMM0, 177	; XMM0 := [d01,d00,d11,d10]
	MOVAPS XMM7, XMM2
	SHUFPS XMM7, XMM7, 102	; XMM7 := [e10,e01,e10,e01]
	MULPS XMM7, XMM0
	ADDPS XMM7, XMM6
	; multiply A11(D)*B10(E)
	MOVAPS XMM0, XMM4
	SHUFPS XMM0, XMM0, 204	; XMM0 := [e00,e11,e00,e11]
	MULPS XMM0, XMM1
	SHUFPS XMM1, XMM1, 177	; XMM1 := [d01,d00,d11,d10]
	MOVAPS XMM6, XMM4
	SHUFPS XMM6, XMM6, 102	; XMM6 := [e10,e01,e10,e01]
	MULPS XMM6, XMM1
	ADDPS XMM6, XMM0
	ADDPS XMM7, XMM6
	MOVLPS [ECX+32], XMM7	; store C10
	MOVHPS [ECX+48], XMM7
	;****************************************************;
	; reload A10
	MOVLPS XMM0, [EAX+32]	; XMM0 := [a20,a21,-,-]
	MOVHPS XMM0, [EAX+48]	; XMM0 := [a20,a21,a30,a31]
	; reload A11
	MOVLPS XMM1, [EAX+40]	; XMM1 := [a22,a23,-,-]
	MOVHPS XMM1, [EAX+56]	; XMM1 := [a22,a23,a32,a33]
	; multiply A10(D)*B01(E) (use MatMulR2x2 code)
	MOVAPS XMM6, XMM3
	SHUFPS XMM6, XMM6, 204	; XMM6 := [e00,e11,e00,e11]
	MULPS XMM6, XMM0
	SHUFPS XMM0, XMM0, 177	; XMM0 := [d01,d00,d11,d10]
	MOVAPS XMM7, XMM3
	SHUFPS XMM7, XMM7, 102	; XMM7 := [e10,e01,e10,e01]
	MULPS XMM7, XMM0
	ADDPS XMM7, XMM6
	; multiply A11(D)*B11(E)
	MOVAPS XMM0, XMM5
	SHUFPS XMM0, XMM0, 204	; XMM0 := [e00,e11,e00,e11]
	MULPS XMM0, XMM1
	SHUFPS XMM1, XMM1, 177	; XMM1 := [d01,d00,d11,d10]
	MOVAPS XMM6, XMM5
	SHUFPS XMM6, XMM6, 102	; XMM6 := [e10,e01,e10,e01]
	MULPS XMM6, XMM1
	ADDPS XMM6, XMM0
	ADDPS XMM7, XMM6
	MOVLPS [ECX+40], XMM7	; store C11
	MOVHPS [ECX+56], XMM7
END MatMulR4x4;
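(* Summary (added comment): the 4x4 kernel works on 2x2 blocks,
       C00 = A00*B00 + A01*B10,   C01 = A00*B01 + A01*B11,
       C10 = A10*B00 + A11*B10,   C11 = A10*B01 + A11*B11,
   evaluating every block product with the same shuffle pattern as MatMulR2x2 and keeping the
   four blocks of B resident in XMM2..XMM5 throughout. All eight 2x2 block products are
   computed explicitly, i.e. this is plain block multiplication rather than Strassen's
   seven-multiplication scheme, despite the reference in the comment above. *)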
(* FIXME: use MOVAPS when Felix fixes issues with alignment!!! *)
(* FIXME: speed it up when horizontal add is available!!! *)
PROCEDURE MatVecMulR2x2( dadr, ladr, radr: ADDRESS );
CODE{SYSTEM.i386, SYSTEM.SSE2}
	MOV EBX, [EBP+radr]	; EBX := ADDR(right)
	MOV EAX, [EBP+ladr]	; EAX := ADDR(left)
	MOV ECX, [EBP+dadr]	; ECX := ADDR(dest)
	; load the whole matrix
	MOVUPS XMM0, [EAX]	; XMM0 := [a00,a01,a10,a11]
	MOVLPS XMM1, [EBX]	; XMM1 := [b0,b1,-,-]
	MOVLHPS XMM1, XMM1	; XMM1 := [b0,b1,b0,b1]
	MULPS XMM0, XMM1	; XMM0 := [a00*b0,a01*b1,a10*b0,a11*b1]
	MOVAPS XMM1, XMM0
	SHUFPS XMM0, XMM0, 8	; XMM0 := [a00*b0,a10*b0,-,-]
	SHUFPS XMM1, XMM1, 13	; XMM1 := [a01*b1,a11*b1,-,-]
	ADDPS XMM0, XMM1	; XMM0 := [d0,d1,-,-]
	MOVLPS [ECX], XMM0
END MatVecMulR2x2;
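(* Summary (added comment): d := A*b for a 2x2 matrix and a 2-vector,
       d0 = a00*b0 + a01*b1,   d1 = a10*b0 + a11*b1.
   MOVLHPS duplicates [b0,b1] into both halves so a single MULPS forms all four products;
   the two SHUFPS/ADDPS steps then combine them pairwise, since plain SSE has no
   horizontal add (hence the FIXME above). *)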
(* PH *)
(* to do: use MOVAPS when Felix fixes issues with alignment *)
PROCEDURE MatVecMulR4x4( dadr, ladr, radr: ADDRESS );
CODE{SYSTEM.i386, SYSTEM.SSE3}
	MOV EBX, [EBP+radr]	; EBX := ADDR(right)
	MOV EAX, [EBP+ladr]	; EAX := ADDR(left)
	MOV ECX, [EBP+dadr]	; ECX := ADDR(dest)
	MOVUPS XMM0, [EBX]	; XMM0 := [b0,b1,b2,b3]
	MOVUPS XMM1, [EAX]	; XMM1 := [a00,a01,a02,a03]
	MOVUPS XMM2, [EAX+16]	; XMM2 := [a10,a11,a12,a13]
	MOVUPS XMM3, [EAX+32]	; XMM3 := [a20,a21,a22,a23]
	MOVUPS XMM4, [EAX+48]	; XMM4 := [a30,a31,a32,a33]
	MULPS XMM1, XMM0
	MULPS XMM2, XMM0
	HADDPS XMM1, XMM2	; adjacent pairs are horizontally added
	MULPS XMM3, XMM0
	MULPS XMM4, XMM0
	HADDPS XMM3, XMM4	; adjacent pairs are horizontally added
	HADDPS XMM1, XMM3	; adjacent pairs are horizontally added
	MOVUPS [ECX], XMM1
END MatVecMulR4x4;
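(* Summary (added comment): d := A*b for a 4x4 matrix. Each MULPS forms the elementwise
   products of one row of A with b; the first two HADDPS steps reduce rows 0/1 and 2/3 to
   partial sums, and the final HADDPS leaves the four dot products [d0,d1,d2,d3] in XMM1,
   which is stored with a single unaligned write. This variant requires SSE3 (HADDPS). *)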
PROCEDURE InstallMatMul*( context: Commands.Context );
VAR type: LONGINT;  string: ARRAY 32 OF CHAR;
BEGIN
	context.arg.String( string );
	IF string = "dynamic" THEN type := cMatMulDynamic
	ELSIF string = "scalarproduct" THEN type := cMatMulScalarProduct
	ELSIF string = "naive" THEN type := cMatMulNaive
	ELSIF string = "transposed" THEN type := cMatMulTransposed
	ELSIF string = "stride" THEN type := cMatMulStride
	ELSIF string = "blocked" THEN type := cMatMulBlocked
	ELSE
		KernelLog.String( "unknown method: " );  KernelLog.String( string );  KernelLog.Ln;
		type := cMatMulDynamic;
	END;
	SetMatMulMethod( type );
END InstallMatMul;
PROCEDURE InstallAsm*;
BEGIN
	KernelLog.String( "ASM " );
	ArrayBase.loopSPAXAX := SPAXAXLoopA;
	ArrayBase.loopSPARAR := SPARARLoopA;
	ArrayBase.loopAddAXAX := AddAXAXLoopA;
	ArrayBase.loopAddARAR := AddARARLoopA;
	ArrayBase.loopSubAXAX := SubAXAXLoopA;
	ArrayBase.loopSubARAR := SubARARLoopA;
	ArrayBase.loopMatMulAXAX := MatMulAXAXLoopA;
	ArrayBase.loopMatMulARAR := MatMulARARLoopA;
	ArrayBase.loopMulAXSX := MulAXSXLoopA;
	ArrayBase.loopIncMulAXSX := IncMulAXSXLoopA;
	ArrayBase.loopMulARSR := MulARSRLoopA;
	ArrayBase.loopIncMulARSR := IncMulARSRLoopA;
	ArrayBase.loopMatMulIncAXAX := MatMulIncAXAXLoopA;
	ArrayBase.loopMatMulIncARAR := MatMulIncARARLoopA;
	ArrayBase.transpose4 := Transpose4;
	ArrayBase.transpose8 := Transpose8;
END InstallAsm;
PROCEDURE InstallSSE*;
BEGIN
	IF Machine.SSESupport THEN
		KernelLog.String( "SSE " );
		ArrayBase.loopSPARAR := SPARARLoopSSE;
		ArrayBase.loopAddARAR := AddARARLoopSSE;
		ArrayBase.loopSubARAR := SubARARLoopSSE;
		ArrayBase.loopMulARSR := MulARSRLoopSSE;
		ArrayBase.loopIncMulARSR := IncMulARSRLoopSSE;
		ArrayBase.loopMatMulARAR := MatMulARARLoopSSE;
		ArrayBase.matMulR := MatMulR;
		ArrayBase.loopMatMulIncARAR := MatMulIncARARLoopSSE;
		ArrayBase.matMulIncR := MatMulIncR;
		(* optimizations for small matrices (Alexey Morozov) *)
		ArrayBase.matMulR2x2 := MatMulR2x2;
		ArrayBase.matMulR3x3 := MatMulR3x3;
		ArrayBase.matMulR4x4 := MatMulR4x4;
		ArrayBase.matVecMulR2x2 := MatVecMulR2x2;
	END;
END InstallSSE;
PROCEDURE InstallSSE2*;  (* extra for testing, will be merged with Install in later versions *)
BEGIN
	IF Machine.SSE2Support THEN
		KernelLog.String( "SSE2 " );
		ArrayBase.loopSPAXAX := SPAXAXLoopSSE;
		ArrayBase.loopAddAXAX := AddAXAXLoopSSE;
		ArrayBase.loopSubAXAX := SubAXAXLoopSSE;
		ArrayBase.loopMulAXSX := MulAXSXLoopSSE;
		ArrayBase.loopIncMulAXSX := IncMulAXSXLoopSSE;
		ArrayBase.loopMatMulAXAX := MatMulAXAXLoopSSE;
		ArrayBase.matMulX := MatMulX;
		ArrayBase.loopMatMulIncAXAX := MatMulIncAXAXLoopSSE;
		ArrayBase.matMulIncX := MatMulIncX;
	END;
END InstallSSE2;
(*! to do: currently this only works for Win, not for the native kernel, because SSE3Support is not yet implemented in BIOS.I386.Machine.Mod *)
PROCEDURE InstallSSE3*;  (* extra for testing, will be merged with Install in later versions *)
BEGIN
	IF Machine.SSE3Support THEN
		KernelLog.String( "SSE3 " );
		(* optimizations for small matrices *)
		ArrayBase.matVecMulR4x4 := MatVecMulR4x4;
	END;
END InstallSSE3;
PROCEDURE Install*;
BEGIN
	KernelLog.String( "ArrayBaseOptimized: installing runtime library optimizations:" );
	InstallAsm;  InstallSSE;  InstallSSE2;  InstallSSE3;
	KernelLog.String( " done." );  KernelLog.Ln;
END Install;
PROCEDURE SetParameters*( context: Commands.Context );
BEGIN
	context.arg.SkipWhitespace;  context.arg.Int( cBlockSize, TRUE );
	context.arg.SkipWhitespace;  context.arg.Int( nrProcesses, TRUE );
	IF nrProcesses > maxProcesses THEN nrProcesses := maxProcesses
	ELSIF nrProcesses = 0 THEN nrProcesses := LONGINT( Machine.NumberOfProcessors() )
	END;
	KernelLog.String( "BlockSize=" );  KernelLog.Int( cBlockSize, 0 );
	KernelLog.String( ", NrProcesses = " );  KernelLog.Int( nrProcesses, 0 );  KernelLog.Ln;
END SetParameters;
BEGIN
	cBlockSize := 0;  (* automatic *)
	nrProcesses := LONGINT( Machine.NumberOfProcessors() );  (* automatic *)
	allocT := 0;  copyT := 0;  compT := 0;
	NEW( cachePool );
END FoxArrayBaseOptimized.

System.Free ArrayBaseOptimized ~
ArrayBaseOptimized.Install ~
ArrayBaseOptimized.InstallSSE2 ~
ArrayBaseOptimized.InstallSSE ~
ArrayBaseOptimized.InstallAsm ~
ArrayBaseOptimized.InstallMatMul dynamic ~
ArrayBaseOptimized.InstallMatMul scalarproduct ~
ArrayBaseOptimized.InstallMatMul transposed ~
ArrayBaseOptimized.InstallMatMul naive ~
ArrayBaseOptimized.InstallMatMul stride ~
ArrayBaseOptimized.InstallMatMul blocked ~
ArrayBaseOptimized.SetParameters 0 1 ~	(* BlockSize, NrProcesses *)