I386.FoxArrayBaseOptimized.Mod
MODULE FoxArrayBaseOptimized; (** AUTHOR "fof"; PURPOSE ""; **)
IMPORT SYSTEM, ArrayBase := FoxArrayBase, Machine, KernelLog, Commands ;
CONST
L2CacheSize = 512 * 1024; (* L1CacheSize = 16 * 1024; *)
(* parameters for blocking matrix multiplication *)
L1BlockN = 5; (* L1 block size -> nr of columns in a block that can be processed using L1 cache *)
L2BARatio = 1;
L0BlockKR = 4; (* L0 block size -> nr of elements that can be processed at once for type REAL *)
L1MaxBlockKR = 336; (* L1CacheSize/SIZEOF(REAL)/2/6*)
L2BlockSize = 81920;
L0BlockKX = 2; (* L0 block size -> nr of elements that can be processed at once for type LONGREAL *)
L1MaxBlockKX = 256; (* > L1CacheSize/SIZEOF(LONGREAL)/2/6*)
(*
DefaultL2CacheSize = 81920;
L2SizeR = L2CacheSize DIV 8; MaxBlockKR = 336; (* ca L1CacheSize/SIZEOF(REAL)/2/6*) (* nr of elements that can be processed using L2 cache *)
L2SizeX = L2CacheSize DIV 8; MaxBlockKX = 256; (* bit more than L1CacheSize/SIZEOF(LONGREAL)/2/6*) (* nr of elements that can be processed using L2 cache *)
*)
debug = FALSE; parallel = TRUE; SSE = TRUE;
MaxCachePoolSize = 0 (* disabled *) (* 646*1024*1024 *) (* enabled *) ;
maxProcesses = 48;
cMatMulDynamic* = -1; cMatMulScalarProduct* = 0;
cMatMulNaive* = 1; cMatMulTransposed* = 2;
cMatMulStride* = 3; cMatMulBlocked* = 4;
VAR
cBlockSize*: LONGINT; nrProcesses*: LONGINT;
lastUsedBlockSize*: SIZE;
allocT-, copyT-, zeroT-, compT-: HUGEINT;
TYPE
Cache = POINTER TO RECORD
p: ANY;
adr: ADDRESS; size: SIZE;
prev, next: Cache;
END;
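(* CachePool keeps previously allocated, 16-byte aligned scratch buffers in a doubly linked list
that is ordered by size between two sentinel nodes. Acquire returns the smallest cached block
that is large enough, allocating a fresh one if none fits; Release sorts a block back into the
list (with MaxCachePoolSize = 0 pooling is effectively disabled and released blocks are dropped). *)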
CachePool = OBJECT
(*! provide heuristics for overall size *)
VAR first, last: Cache;
PROCEDURE & Init*;
BEGIN
NEW( first ); first.size := 0; (* sentinel *)
NEW( last ); last.size := MAX( SIZE ); (* sentinel *)
first.next := last; first.prev := NIL; last.prev := first; last.next := NIL;
END Init;
PROCEDURE Acquire( size: SIZE ): Cache;
VAR c: Cache; t: HUGEINT;
BEGIN {EXCLUSIVE}
IF size = 0 THEN RETURN first END;
Tic( t );
c := last;
WHILE (c.prev.size >= size) DO
c := c.prev;
END;
IF c = last THEN
NEW( c ); SYSTEM.NEW( c.p, size + 16 );
c.adr := Align( c.p, 16 );
c.size := size;
ELSE
c.prev.next := c.next;
c.next.prev := c.prev;
c.prev := NIL; c.next := NIL;
END;
Toc( t, allocT ); RETURN c;
END Acquire;
PROCEDURE Release( c: Cache );
VAR t: Cache;
BEGIN {EXCLUSIVE}
IF (c=first) OR (c=NIL) THEN RETURN END;
ASSERT(c.size > 0);
IF c.size > MaxCachePoolSize THEN RETURN END;
t := first;
WHILE (t.size <= c.size) DO t := t.next; END;
c.prev := t.prev; c.next := t; t.prev := c; c.prev.next := c;
END Release;
END CachePool;
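(* ComputationObj is the base class for the worker threads of the parallel matrix multiplication:
the active body runs Compute (overridden by the subclasses below) and then sets done;
Wait blocks the caller until the computation has finished. *)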
ComputationObj = OBJECT
VAR done: BOOLEAN;
PROCEDURE & Init*;
BEGIN
done := FALSE;
END Init;
PROCEDURE Compute; (*abstract*)
END Compute;
PROCEDURE Wait;
BEGIN {EXCLUSIVE}
AWAIT( done );
END Wait;
BEGIN {ACTIVE, EXCLUSIVE}
Compute; done := TRUE;
END ComputationObj;
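(* The following worker objects wrap the blocked multiplication kernels so that independent parts
of the result matrix can be computed concurrently: MatMulHObjR/X delegate to MatMulHBlockR/X,
MultiplyObjectR/X delegate to L3BlockR/X. *)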
MatMulHObjR = OBJECT (ComputationObj)
VAR MatrixA, MatrixB, MatrixC: ADDRESS; Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
add: BOOLEAN;
PROCEDURE & InitR*( MatrixA, MatrixB, MatrixC: ADDRESS; Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
add: BOOLEAN );
BEGIN
Init; SELF.MatrixA := MatrixA; SELF.MatrixB := MatrixB;
SELF.MatrixC := MatrixC; SELF.Stride := Stride;
SELF.IncC := IncC; SELF.StrideC := StrideC;
SELF.RowsA := RowsA; SELF.RowsB := RowsB;
SELF.Cols := Cols; SELF.add := add;
END InitR;
PROCEDURE Compute;
BEGIN
MatMulHBlockR( MatrixA, MatrixB, MatrixC, Stride, IncC,
StrideC, RowsA, RowsB, Cols, add );
END Compute;
END MatMulHObjR;
MatMulHObjX = OBJECT (ComputationObj)
VAR MatrixA, MatrixB, MatrixC: ADDRESS; Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
add: BOOLEAN;
PROCEDURE & InitX*( MatrixA, MatrixB, MatrixC: ADDRESS; Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
add: BOOLEAN );
BEGIN
Init; SELF.MatrixA := MatrixA; SELF.MatrixB := MatrixB;
SELF.MatrixC := MatrixC; SELF.Stride := Stride;
SELF.IncC := IncC; SELF.StrideC := StrideC;
SELF.RowsA := RowsA; SELF.RowsB := RowsB;
SELF.Cols := Cols; SELF.add := add;
END InitX;
PROCEDURE Compute;
BEGIN
MatMulHBlockX( MatrixA, MatrixB, MatrixC, Stride, IncC,
StrideC, RowsA, RowsB, Cols, add );
END Compute;
END MatMulHObjX;
MultiplyObjectR = OBJECT (ComputationObj);
VAR adrA, adrB: ADDRESS; C, M, N, K, IncC, StrideC, L2BlockM, L2BlockN, L2BlockK: SIZE;
start, finished: BOOLEAN;
PROCEDURE & InitR*( adrA, adrB: ADDRESS; C, M, N, K, IncC, StrideC, L2BlockM, L2BlockN, L2BlockK: SIZE );
BEGIN
Init; start := FALSE; finished := FALSE;
SELF.adrA := adrA; SELF.adrB := adrB; SELF.C := C;
SELF.M := M; SELF.N := N; SELF.K := K;
SELF.IncC := IncC; SELF.StrideC := StrideC;
SELF.L2BlockM := L2BlockM;
SELF.L2BlockN := L2BlockN;
SELF.L2BlockK := L2BlockK;
END InitR;
PROCEDURE Compute;
BEGIN
L3BlockR( adrA, adrB, C, M, N, K, IncC, StrideC, L2BlockM,
L2BlockN, L2BlockK );
END Compute;
END MultiplyObjectR;
MultiplyObjectX = OBJECT (ComputationObj);
VAR adrA, adrB: ADDRESS; C, M, N, K, IncC, StrideC, L2BlockM, L2BlockN, L2BlockK: SIZE;
start, finished: BOOLEAN;
PROCEDURE & InitX*( adrA, adrB: ADDRESS; C, M, N, K, IncC, StrideC, L2BlockM, L2BlockN, L2BlockK: SIZE );
BEGIN
Init; start := FALSE; finished := FALSE;
SELF.adrA := adrA; SELF.adrB := adrB; SELF.C := C;
SELF.M := M; SELF.N := N; SELF.K := K;
SELF.IncC := IncC; SELF.StrideC := StrideC;
SELF.L2BlockM := L2BlockM;
SELF.L2BlockN := L2BlockN;
SELF.L2BlockK := L2BlockK;
END InitX;
PROCEDURE Compute;
BEGIN
L3BlockX( adrA, adrB, C, M, N, K, IncC, StrideC, L2BlockM,
L2BlockN, L2BlockK );
END Compute;
END MultiplyObjectX;
VAR
(* ran: Random.Generator; (* testing *)*)
cachePool: CachePool;
(*********** Part 0: assembler routines ***************)
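(* The code procedures below are written in IA-32 assembly. Parameters are read directly from
the stack via [ESP+name]; since these "-" code procedures have no activation frame, each
routine removes its own parameters with a final ADD ESP, n. The ...R variants operate on
32 bit REAL data, the ...X variants on 64 bit LONGREAL data; the SSE variants use aligned
loads (MOVAPS/MOVAPD) and therefore expect 16-byte aligned operands. *)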
PROCEDURE -L1Block1XA( adrA, adrB, adrC: ADDRESS; K: SIZE );
CODE {SYSTEM.i386, SYSTEM.FPU}
MOV EAX, [ESP+K] ; EAX IS counter
MOV EDX, [ESP+adrC]
MOV ECX, [ESP+adrB] ; ECX IS POINTER TO data OF matrix B
MOV EBX, [ESP+adrA] ; EBX IS POINTER TO data OF matrix A
FLD QWORD [EDX] ; S.GET(dadr, x)
loop8:
CMP EAX, 8
JL loop1
FLD QWORD[EBX] ; S.GET(ladr, x)
ADD EBX, 8 ; INC(ladr, incl)
FLD QWORD[ECX] ; S.GET(ladr, y)
ADD ECX, 8 ; INC(radr, incr)
FMULP ; x := x*y
FADDP ; z := z+x
FLD QWORD[EBX] ; S.GET(ladr, x)
ADD EBX, 8 ; INC(ladr, incl)
FLD QWORD[ECX] ; S.GET(ladr, y)
ADD ECX, 8 ; INC(radr, incr)
FMULP ; x := x*y
FADDP ; z := z+x
FLD QWORD[EBX] ; S.GET(ladr, x)
ADD EBX, 8 ; INC(ladr, incl)
FLD QWORD[ECX] ; S.GET(ladr, y)
ADD ECX, 8 ; INC(radr, incr)
FMULP ; x := x*y
FADDP ; z := z+x
FLD QWORD[EBX] ; S.GET(ladr, x)
ADD EBX, 8 ; INC(ladr, incl)
FLD QWORD[ECX] ; S.GET(ladr, y)
ADD ECX, 8 ; INC(radr, incr)
FMULP ; x := x*y
FADDP ; z := z+x
FLD QWORD[EBX] ; S.GET(ladr, x)
ADD EBX, 8 ; INC(ladr, incl)
FLD QWORD[ECX] ; S.GET(ladr, y)
ADD ECX, 8 ; INC(radr, incr)
FMULP ; x := x*y
FADDP ; z := z+x
FLD QWORD[EBX] ; S.GET(ladr, x)
ADD EBX, 8 ; INC(ladr, incl)
FLD QWORD[ECX] ; S.GET(ladr, y)
ADD ECX, 8 ; INC(radr, incr)
FMULP ; x := x*y
FADDP ; z := z+x
FLD QWORD[EBX] ; S.GET(ladr, x)
ADD EBX, 8 ; INC(ladr, incl)
FLD QWORD[ECX] ; S.GET(ladr, y)
ADD ECX, 8 ; INC(radr, incr)
FMULP ; x := x*y
FADDP ; z := z+x
FLD QWORD[EBX] ; S.GET(ladr, x)
ADD EBX, 8 ; INC(ladr, incl)
FLD QWORD[ECX] ; S.GET(ladr, y)
ADD ECX, 8 ; INC(radr, incr)
FMULP ; x := x*y
FADDP ; z := z+x
SUB EAX, 8 ; DEC(len)
JMP loop8 ;
loop1:
CMP EAX, 0 ; WHILE len > 0 DO
JLE endL
FLD QWORD[EBX] ; S.GET(ladr, x)
ADD EBX, 8 ; INC(ladr, incl)
FLD QWORD[ECX] ; S.GET(ladr, y)
ADD ECX, 8 ; INC(radr, incr)
FMULP ; x := x*y
FADDP ; z := z+x
DEC EAX ; DEC(len)
JMP loop1 ;
endL:
FSTP QWORD[EDX] ; S.PUT(dadr, x)
FWAIT ;
ADD ESP, 16 ;
END L1Block1XA;
PROCEDURE -L1Block1XSSE( adrA, adrB, adrC: ADDRESS; K: SIZE );
(*
matrixA, matrixB must be stored in special format
K>0 guaranteed
*)
CODE {SYSTEM.i386, SYSTEM.SSE2}
MOV EBX, [ESP+adrA] ; EBX IS POINTER TO data OF matrix A
MOV ECX, [ESP+adrB] ; ECX IS POINTER TO data OF matrix B
MOV EDX, [ESP+K] ; EDX IS counter
XORPD XMM2, XMM2 ;
kLoop8: ;
CMP EDX, 8 ;
JL kLoop2 ;
MOVAPD XMM7, [EBX] ;
MOVAPD XMM0, [ECX] ;
ADD ECX, 16 ;
ADD EBX, 16 ;
MOVAPD XMM6, [EBX] ;
MOVAPD XMM1, [ECX] ;
ADD ECX, 16 ;
ADD EBX, 16 ;
MULPD XMM0, XMM7 ;
ADDPD XMM2, XMM0 ;
MOVAPD XMM5, [EBX] ;
MOVAPD XMM3, [ECX] ;
ADD ECX, 16 ;
ADD EBX, 16 ;
MULPD XMM1, XMM6 ;
ADDPD XMM2, XMM1 ;
MOVAPD XMM7, [EBX] ;
MOVAPD XMM0, [ECX] ;
ADD ECX, 16 ;
ADD EBX, 16 ;
MULPD XMM3, XMM5 ;
ADDPD XMM2, XMM3 ;
MULPD XMM0, XMM7 ;
ADDPD XMM2, XMM0 ;
SUB EDX, 8 ;
JMP kLoop8 ;
kLoop2: ;
CMP EDX, 0 ;
JLE horizontalAdd ;
MOVAPD XMM7, [EBX] ;
MOVAPD XMM0, [ECX] ;
ADD ECX, 16 ;
ADD EBX, 16 ;
MULPD XMM0, XMM7 ;
ADDPD XMM2, XMM0 ;
SUB EDX, 2
JMP kLoop2 ;
horizontalAdd:
MOV EDI, [ESP+adrC] ;
MOVAPD XMM1, XMM2 ;
SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
ADDPD XMM2, XMM1 ;
ADDSD XMM2, [EDI] ;
MOVSD [EDI], XMM2 ;
endL:
ADD ESP, 16 ;
END L1Block1XSSE;
PROCEDURE -L1Block5XSSE( adrA, adrB, adrC: ADDRESS; IncC, K: SIZE );
(*
matrixA and matrix B are stored in special format !
K > 0 is guaranteed
*)
CODE {SYSTEM.i386, SYSTEM.SSE2}
MOV EBX, [ESP+adrA] ; EBX IS POINTER TO data OF matrix A
MOV ECX, [ESP+adrB] ; ECX IS POINTER TO data OF matrix B
MOV EDX, [ESP+K] ; EDX IS counter
XORPD XMM2, XMM2 ;
XORPD XMM3, XMM3 ;
XORPD XMM4, XMM4 ;
XORPD XMM5, XMM5 ;
XORPD XMM6, XMM6 ;
kLoop8: ;
CMP EDX, 8 ;
JL kLoop2
; (*-- 0 -- *) ;
MOVAPD XMM7, [EBX] ; get 4 elements OF A
ADD EBX, 16 ;
MOVAPD XMM0, [ECX] ; get 4 elements OF B
ADD ECX, 16 ;
MOVAPD XMM1, [ECX] ; get 4 elements OF B
ADD ECX, 16 ;
MULPD XMM0, XMM7 ;
ADDPD XMM2, XMM0 ;
MOVAPD XMM0, [ECX] ;
ADD ECX, 16 ;
MULPD XMM1, XMM7 ;
ADDPD XMM3, XMM1 ;
MOVAPD XMM1, [ECX] ;
ADD ECX, 16 ;
MULPD XMM0, XMM7 ;
ADDPD XMM4, XMM0 ;
MOVAPD XMM0, [ECX] ;
ADD ECX, 16 ;
MULPD XMM1, XMM7 ;
ADDPD XMM5, XMM1 ;
MOVAPD XMM1, [ECX] ;
ADD ECX, 16 ;
MULPD XMM0, XMM7 ;
ADDPD XMM6, XMM0
; (*-- 2 -- *) ;
MOVAPD XMM7, [EBX] ;
ADD EBX, 16 ;
MOVAPD XMM0, [ECX] ;
ADD ECX, 16 ;
MULPD XMM1, XMM7 ;
ADDPD XMM2, XMM1 ;
MOVAPD XMM1, [ECX] ;
ADD ECX, 16 ;
MULPD XMM0, XMM7 ;
ADDPD XMM3, XMM0 ;
MOVAPD XMM0, [ECX] ;
ADD ECX, 16 ;
MULPD XMM1, XMM7 ;
ADDPD XMM4, XMM1 ;
MOVAPD XMM1, [ECX] ;
ADD ECX, 16 ;
MULPD XMM0, XMM7 ;
ADDPD XMM5, XMM0 ;
MOVAPD XMM0, [ECX] ;
ADD ECX, 16 ;
MULPD XMM1, XMM7 ;
ADDPD XMM6, XMM1
; (*-- 4 -- *) ;
MOVAPD XMM7, [EBX] ;
ADD EBX, 16 ;
MOVAPD XMM1, [ECX] ;
ADD ECX, 16 ;
MULPD XMM0, XMM7 ;
ADDPD XMM2, XMM0 ;
MOVAPD XMM0, [ECX] ;
ADD ECX, 16 ;
MULPD XMM1, XMM7 ;
ADDPD XMM3, XMM1 ;
MOVAPD XMM1, [ECX] ;
ADD ECX, 16 ;
MULPD XMM0, XMM7 ;
ADDPD XMM4, XMM0 ;
MOVAPD XMM0, [ECX] ;
ADD ECX, 16 ;
MULPD XMM1, XMM7 ;
ADDPD XMM5, XMM1 ;
MOVAPD XMM1, [ECX] ;
ADD ECX, 16 ;
MULPD XMM0, XMM7 ;
ADDPD XMM6, XMM0
; (*-- 6 -- *) ;
MOVAPD XMM7, [EBX] ;
ADD EBX, 16 ;
MOVAPD XMM0, [ECX] ;
ADD ECX, 16 ;
MULPD XMM1, XMM7 ;
ADDPD XMM2, XMM1 ;
MOVAPD XMM1, [ECX] ;
ADD ECX, 16 ;
MULPD XMM0, XMM7 ;
ADDPD XMM3, XMM0 ;
MOVAPD XMM0, [ECX] ;
ADD ECX, 16 ;
MULPD XMM1, XMM7 ;
ADDPD XMM4, XMM1 ;
MOVAPD XMM1, [ECX] ;
ADD ECX, 16 ;
MULPD XMM0, XMM7 ;
ADDPD XMM5, XMM0 ;
MULPD XMM1, XMM7 ;
ADDPD XMM6, XMM1 ;
SUB EDX, 8
JMP kLoop8 ;
kLoop2: ;
CMP EDX, 0 ;
JLE horizontalAdd ;
MOVAPD XMM7, [EBX] ;
ADD EBX, 16 ;
MOVAPD XMM0, [ECX] ;
ADD ECX, 16 ;
MOVAPD XMM1, [ECX] ;
ADD ECX, 16 ;
MULPD XMM0, XMM7 ;
ADDPD XMM2, XMM0 ;
MOVAPD XMM0, [ECX] ;
ADD ECX, 16 ;
MULPD XMM1, XMM7 ;
ADDPD XMM3, XMM1 ;
MOVAPD XMM1, [ECX] ;
ADD ECX, 16 ;
MULPD XMM0, XMM7 ;
ADDPD XMM4, XMM0 ;
MOVAPD XMM0, [ECX] ;
ADD ECX, 16 ;
MULPD XMM1, XMM7 ;
ADDPD XMM5, XMM1 ;
MULPD XMM0, XMM7 ;
ADDPD XMM6, XMM0 ;
SUB EDX, 2
JMP kLoop2 ;
horizontalAdd: ; add and store
MOV EDI, [ESP+adrC] ;
MOV EAX, [ESP+IncC] ;
MOVAPD XMM1, XMM2 ;
SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
ADDPD XMM2, XMM1 ;
ADDSD XMM2, [EDI] ;
MOVSD [EDI], XMM2 ;
ADD EDI, EAX ;
MOVAPD XMM1, XMM3 ;
SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
ADDPD XMM3, XMM1 ;
ADDSD XMM3, [EDI] ;
MOVSD [EDI], XMM3 ;
ADD EDI, EAX ;
MOVAPD XMM1, XMM4 ;
SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
ADDPD XMM4, XMM1 ;
ADDSD XMM4, [EDI] ;
MOVSD [EDI], XMM4 ;
ADD EDI, EAX ;
MOVAPD XMM1, XMM5 ;
SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
ADDPD XMM5, XMM1 ;
ADDSD XMM5, [EDI] ;
MOVSD [EDI], XMM5 ;
ADD EDI, EAX ;
MOVAPD XMM1, XMM6 ;
SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
ADDPD XMM6, XMM1 ;
ADDSD XMM6, [EDI] ;
MOVSD [EDI], XMM6 ;
endL:
ADD ESP, 20 ;
END L1Block5XSSE;
PROCEDURE -L1Block1RA( adrA, adrB, adrC: ADDRESS; K: SIZE );
CODE {SYSTEM.i386, SYSTEM.FPU}
MOV EAX, [ESP+K] ; EAX IS counter
MOV EDX, [ESP+adrC]
MOV ECX, [ESP+adrB] ; ECX IS POINTER TO data OF matrix B
MOV EBX, [ESP+adrA] ; EBX IS POINTER TO data OF matrix A
FLD DWORD [EDX] ; S.GET(dadr, x)
loop16:
CMP EAX, 16
JL loop1
FLD DWORD[EBX] ; S.GET(ladr, x)
ADD EBX, 4 ; INC(ladr, incl)
FLD DWORD[ECX] ; S.GET(ladr, y)
ADD ECX, 4 ; INC(radr, incr)
FMULP ; x := x*y
FADDP ; z := z+x
FLD DWORD[EBX] ; S.GET(ladr, x)
ADD EBX, 4 ; INC(ladr, incl)
FLD DWORD[ECX] ; S.GET(ladr, y)
ADD ECX, 4 ; INC(radr, incr)
FMULP ; x := x*y
FADDP ; z := z+x
FLD DWORD[EBX] ; S.GET(ladr, x)
ADD EBX, 4 ; INC(ladr, incl)
FLD DWORD[ECX] ; S.GET(ladr, y)
ADD ECX, 4 ; INC(radr, incr)
FMULP ; x := x*y
FADDP ; z := z+x
FLD DWORD[EBX] ; S.GET(ladr, x)
ADD EBX, 4 ; INC(ladr, incl)
FLD DWORD[ECX] ; S.GET(ladr, y)
ADD ECX, 4 ; INC(radr, incr)
FMULP ; x := x*y
FADDP ; z := z+x
FLD DWORD[EBX] ; S.GET(ladr, x)
ADD EBX, 4 ; INC(ladr, incl)
FLD DWORD[ECX] ; S.GET(ladr, y)
ADD ECX, 4 ; INC(radr, incr)
FMULP ; x := x*y
FADDP ; z := z+x
FLD DWORD[EBX] ; S.GET(ladr, x)
ADD EBX, 4 ; INC(ladr, incl)
FLD DWORD[ECX] ; S.GET(ladr, y)
ADD ECX, 4 ; INC(radr, incr)
FMULP ; x := x*y
FADDP ; z := z+x
FLD DWORD[EBX] ; S.GET(ladr, x)
ADD EBX, 4 ; INC(ladr, incl)
FLD DWORD[ECX] ; S.GET(ladr, y)
ADD ECX, 4 ; INC(radr, incr)
FMULP ; x := x*y
FADDP ; z := z+x
FLD DWORD[EBX] ; S.GET(ladr, x)
ADD EBX, 4 ; INC(ladr, incl)
FLD DWORD[ECX] ; S.GET(ladr, y)
ADD ECX, 4 ; INC(radr, incr)
FMULP ; x := x*y
FADDP ; z := z+x
FLD DWORD[EBX] ; S.GET(ladr, x)
ADD EBX, 4 ; INC(ladr, incl)
FLD DWORD[ECX] ; S.GET(ladr, y)
ADD ECX, 4 ; INC(radr, incr)
FMULP ; x := x*y
FADDP ; z := z+x
FLD DWORD[EBX] ; S.GET(ladr, x)
ADD EBX, 4 ; INC(ladr, incl)
FLD DWORD[ECX] ; S.GET(ladr, y)
ADD ECX, 4 ; INC(radr, incr)
FMULP ; x := x*y
FADDP ; z := z+x
FLD DWORD[EBX] ; S.GET(ladr, x)
ADD EBX, 4 ; INC(ladr, incl)
FLD DWORD[ECX] ; S.GET(ladr, y)
ADD ECX, 4 ; INC(radr, incr)
FMULP ; x := x*y
FADDP ; z := z+x
FLD DWORD[EBX] ; S.GET(ladr, x)
ADD EBX, 4 ; INC(ladr, incl)
FLD DWORD[ECX] ; S.GET(ladr, y)
ADD ECX, 4 ; INC(radr, incr)
FMULP ; x := x*y
FADDP ; z := z+x
FLD DWORD[EBX] ; S.GET(ladr, x)
ADD EBX, 4 ; INC(ladr, incl)
FLD DWORD[ECX] ; S.GET(ladr, y)
ADD ECX, 4 ; INC(radr, incr)
FMULP ; x := x*y
FADDP ; z := z+x
FLD DWORD[EBX] ; S.GET(ladr, x)
ADD EBX, 4 ; INC(ladr, incl)
FLD DWORD[ECX] ; S.GET(ladr, y)
ADD ECX, 4 ; INC(radr, incr)
FMULP ; x := x*y
FADDP ; z := z+x
FLD DWORD[EBX] ; S.GET(ladr, x)
ADD EBX, 4 ; INC(ladr, incl)
FLD DWORD[ECX] ; S.GET(ladr, y)
ADD ECX, 4 ; INC(radr, incr)
FMULP ; x := x*y
FADDP ; z := z+x
FLD DWORD[EBX] ; S.GET(ladr, x)
ADD EBX, 4 ; INC(ladr, incl)
FLD DWORD[ECX] ; S.GET(ladr, y)
ADD ECX, 4 ; INC(radr, incr)
FMULP ; x := x*y
FADDP ; z := z+x
SUB EAX, 16 ; DEC(len)
JMP loop16 ;
loop1:
CMP EAX, 0 ; WHILE len > 0 DO
JLE endL
FLD DWORD[EBX] ; S.GET(ladr, x)
ADD EBX, 4 ; INC(ladr, incl)
FLD DWORD[ECX] ; S.GET(ladr, y)
ADD ECX, 4 ; INC(radr, incr)
FMULP ; x := x*y
FADDP ; z := z+x
DEC EAX ; DEC(len)
JMP loop1 ;
endL:
FSTP DWORD[EDX] ; S.PUT(dadr, x)
FWAIT ;
ADD ESP, 16 ;
END L1Block1RA;
PROCEDURE -L1Block1RSSE( adrA, adrB, adrC: ADDRESS; K: SIZE );
(*
matrixA, matrixB must be stored in special format
K>0 guaranteed
*)
CODE {SYSTEM.i386, SYSTEM.SSE}
MOV EBX, [ESP+adrA] ; EBX IS POINTER TO data OF matrix A
MOV ECX, [ESP+adrB] ; ECX IS POINTER TO data OF matrix B
MOV EDX, [ESP+K] ; EDX IS counter
XORPS XMM2, XMM2 ;
kLoop16: ;
CMP EDX, 16 ;
JL kLoop4 ;
MOVAPS XMM7, [EBX] ;
MOVAPS XMM0, [ECX] ;
ADD ECX, 16 ;
ADD EBX, 16 ;
MOVAPS XMM6, [EBX] ;
MOVAPS XMM1, [ECX] ;
ADD ECX, 16 ;
ADD EBX, 16 ;
MULPS XMM0, XMM7 ;
ADDPS XMM2, XMM0 ;
MOVAPS XMM5, [EBX] ;
MOVAPS XMM3, [ECX] ;
ADD ECX, 16 ;
ADD EBX, 16 ;
MULPS XMM1, XMM6 ;
ADDPS XMM2, XMM1 ;
MOVAPS XMM7, [EBX] ;
MOVAPS XMM0, [ECX] ;
ADD ECX, 16 ;
ADD EBX, 16 ;
MULPS XMM3, XMM5 ;
ADDPS XMM2, XMM3 ;
MULPS XMM0, XMM7 ;
ADDPS XMM2, XMM0 ;
SUB EDX, 16 ;
JMP kLoop16 ;
kLoop4: ;
CMP EDX, 0 ;
JLE horizontalAdd ;
MOVAPS XMM7, [EBX] ;
MOVAPS XMM0, [ECX] ;
ADD ECX, 16 ;
ADD EBX, 16 ;
MULPS XMM0, XMM7 ;
ADDPS XMM2, XMM0 ;
SUB EDX, 4
JMP kLoop4 ;
horizontalAdd:
MOV EDI, [ESP+adrC] ;
MOVLHPS XMM1, XMM2 ;
ADDPS XMM1, XMM2 ;
SHUFPS XMM2, XMM1, 48 ;
ADDPS XMM2, XMM1 ;
MOVHLPS XMM2, XMM2 ;
ADDSS XMM2, [EDI] ;
MOVSS [EDI], XMM2 ;
endL:
ADD ESP, 16 ;
END L1Block1RSSE;
PROCEDURE -L1Block5RSSE( adrA, adrB, adrC: ADDRESS; IncC, K: SIZE );
(*
matrixA and matrix B are stored in special format !
K > 0 is guaranteed
*)
CODE {SYSTEM.i386, SYSTEM.SSE}
MOV EBX, [ESP+adrA] ; EBX IS POINTER TO data OF matrix A
MOV ECX, [ESP+adrB] ; ECX IS POINTER TO data OF matrix B
MOV EDX, [ESP+K] ; EDX IS counter
XORPS XMM2, XMM2 ;
XORPS XMM3, XMM3 ;
XORPS XMM4, XMM4 ;
XORPS XMM5, XMM5 ;
XORPS XMM6, XMM6 ;
kLoop16: ;
CMP EDX, 16 ;
JL kLoop4 ; (*-- 0 -- *)
MOVAPS XMM7, [EBX] ; get 4 elements OF A
ADD EBX, 16 ;
MOVAPS XMM0, [ECX] ; get 4 elements OF B
ADD ECX, 16 ;
MOVAPS XMM1, [ECX] ; get 4 elements OF B
ADD ECX, 16 ;
MULPS XMM0, XMM7 ;
ADDPS XMM2, XMM0 ;
MOVAPS XMM0, [ECX] ;
ADD ECX, 16 ;
MULPS XMM1, XMM7 ;
ADDPS XMM3, XMM1 ;
MOVAPS XMM1, [ECX] ;
ADD ECX, 16 ;
MULPS XMM0, XMM7 ;
ADDPS XMM4, XMM0 ;
MOVAPS XMM0, [ECX] ;
ADD ECX, 16 ;
MULPS XMM1, XMM7 ;
ADDPS XMM5, XMM1 ;
MOVAPS XMM1, [ECX] ;
ADD ECX, 16 ;
MULPS XMM0, XMM7 ;
ADDPS XMM6, XMM0
; (*-- 4 -- *) ;
MOVAPS XMM7, [EBX] ;
ADD EBX, 16 ;
MOVAPS XMM0, [ECX] ;
ADD ECX, 16 ;
MULPS XMM1, XMM7 ;
ADDPS XMM2, XMM1 ;
MOVAPS XMM1, [ECX] ;
ADD ECX, 16 ;
MULPS XMM0, XMM7 ;
ADDPS XMM3, XMM0 ;
MOVAPS XMM0, [ECX] ;
ADD ECX, 16 ;
MULPS XMM1, XMM7 ;
ADDPS XMM4, XMM1 ;
MOVAPS XMM1, [ECX] ;
ADD ECX, 16 ;
MULPS XMM0, XMM7 ;
ADDPS XMM5, XMM0 ;
MOVAPS XMM0, [ECX] ;
ADD ECX, 16 ;
MULPS XMM1, XMM7 ;
ADDPS XMM6, XMM1
; (*-- 8 -- *) ;
MOVAPS XMM7, [EBX] ;
ADD EBX, 16 ;
MOVAPS XMM1, [ECX] ;
ADD ECX, 16 ;
MULPS XMM0, XMM7 ;
ADDPS XMM2, XMM0 ;
MOVAPS XMM0, [ECX] ;
ADD ECX, 16 ;
MULPS XMM1, XMM7 ;
ADDPS XMM3, XMM1 ;
MOVAPS XMM1, [ECX] ;
ADD ECX, 16 ;
MULPS XMM0, XMM7 ;
ADDPS XMM4, XMM0 ;
MOVAPS XMM0, [ECX] ;
ADD ECX, 16 ;
MULPS XMM1, XMM7 ;
ADDPS XMM5, XMM1 ;
MOVAPS XMM1, [ECX] ;
ADD ECX, 16 ;
MULPS XMM0, XMM7 ;
ADDPS XMM6, XMM0
; (*-- 12 -- *) ;
MOVAPS XMM7, [EBX] ;
ADD EBX, 16 ;
MOVAPS XMM0, [ECX] ;
ADD ECX, 16 ;
MULPS XMM1, XMM7 ;
ADDPS XMM2, XMM1 ;
MOVAPS XMM1, [ECX] ;
ADD ECX, 16 ;
MULPS XMM0, XMM7 ;
ADDPS XMM3, XMM0 ;
MOVAPS XMM0, [ECX] ;
ADD ECX, 16 ;
MULPS XMM1, XMM7 ;
ADDPS XMM4, XMM1 ;
MOVAPS XMM1, [ECX] ;
ADD ECX, 16 ;
MULPS XMM0, XMM7 ;
ADDPS XMM5, XMM0 ;
MULPS XMM1, XMM7 ;
ADDPS XMM6, XMM1 ;
SUB EDX, 16
JMP kLoop16 ;
kLoop4: ;
CMP EDX, 0 ;
JLE horizontalAdd ;
MOVAPS XMM7, [EBX] ;
ADD EBX, 16 ;
MOVAPS XMM0, [ECX] ;
ADD ECX, 16 ;
MOVAPS XMM1, [ECX] ;
ADD ECX, 16 ;
MULPS XMM0, XMM7 ;
ADDPS XMM2, XMM0 ;
MOVAPS XMM0, [ECX] ;
ADD ECX, 16 ;
MULPS XMM1, XMM7 ;
ADDPS XMM3, XMM1 ;
MOVAPS XMM1, [ECX] ;
ADD ECX, 16 ;
MULPS XMM0, XMM7 ;
ADDPS XMM4, XMM0 ;
MOVAPS XMM0, [ECX] ;
ADD ECX, 16 ;
MULPS XMM1, XMM7 ;
ADDPS XMM5, XMM1 ;
MULPS XMM0, XMM7 ;
ADDPS XMM6, XMM0 ;
SUB EDX, 4
JMP kLoop4 ;
horizontalAdd: ; add and store
MOV EDI, [ESP+adrC] ;
MOV EAX, [ESP+IncC] ;
MOVLHPS XMM1, XMM2 ;
ADDPS XMM1, XMM2 ;
SHUFPS XMM2, XMM1, 48 ;
ADDPS XMM2, XMM1 ;
MOVHLPS XMM2, XMM2 ;
ADDSS XMM2, [EDI] ;
MOVSS [EDI], XMM2 ;
ADD EDI, EAX ;
MOVLHPS XMM1, XMM3 ;
ADDPS XMM1, XMM3 ;
SHUFPS XMM3, XMM1, 48 ;
ADDPS XMM3, XMM1 ;
MOVHLPS XMM3, XMM3 ;
ADDSS XMM3, [EDI] ;
MOVSS [EDI], XMM3 ;
ADD EDI, EAX ;
MOVLHPS XMM1, XMM4 ;
ADDPS XMM1, XMM4 ;
SHUFPS XMM4, XMM1, 48 ;
ADDPS XMM4, XMM1 ;
MOVHLPS XMM4, XMM4 ;
ADDSS XMM4, [EDI] ;
MOVSS [EDI], XMM4 ;
ADD EDI, EAX ;
MOVLHPS XMM1, XMM5 ;
ADDPS XMM1, XMM5 ;
SHUFPS XMM5, XMM1, 48 ;
ADDPS XMM5, XMM1 ;
MOVHLPS XMM5, XMM5 ;
ADDSS XMM5, [EDI] ;
MOVSS [EDI], XMM5 ;
ADD EDI, EAX ;
MOVLHPS XMM1, XMM6 ;
ADDPS XMM1, XMM6 ;
SHUFPS XMM6, XMM1, 48 ;
ADDPS XMM6, XMM1 ;
MOVHLPS XMM6, XMM6 ;
ADDSS XMM6, [EDI] ;
MOVSS [EDI], XMM6 ;
endL:
ADD ESP, 20 ;
END L1Block5RSSE;
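(* Align4 and Align2 round a value up to the next multiple of 4 resp. 2:
EAX := adr + ((-adr) MOD 4) (resp. MOD 2). *)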
PROCEDURE -Align4( adr: ADDRESS ): ADDRESS;
CODE {SYSTEM.i386}
MOV EAX, [ESP+adr] ;
NEG EAX ;
AND EAX, 3H ;
ADD EAX, [ESP+adr] ;
ADD ESP, 4
END Align4;
PROCEDURE -Align2( adr: ADDRESS ): ADDRESS;
CODE {SYSTEM.i386}
MOV EAX, [ESP+adr] ;
NEG EAX ;
AND EAX, 1H ;
ADD EAX, [ESP+adr] ;
ADD ESP, 4
END Align2;
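(* ZeroR/ZeroX clear count contiguous 32 bit resp. 64 bit elements using REP STOSD;
ZeroRI/ZeroXI do the same for strided data and fall back to REP STOSD when the
increment equals the element size. *)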
PROCEDURE -ZeroR( adr: ADDRESS; count: SIZE );
(** For 32 bit types *)
CODE {SYSTEM.i386}
MOV EDI, [ESP+adr] ; address OF dest index
MOV ECX, [ESP+count] ; counter
MOV EAX, 0 ; value
CLD ; incremental
REP ;
STOSD ;
ADD ESP, 8 ;
END ZeroR;
PROCEDURE -ZeroX( adr: ADDRESS; count: SIZE );
(** For 64 bit types *)
CODE {SYSTEM.i386}
MOV EDI, [ESP+adr] ; address OF dest index
MOV ECX, [ESP+count] ; counter
SHL ECX, 1 ;
MOV EAX, 0 ; value
CLD ; incremental
REP ;
STOSD ;
ADD ESP, 8 ;
END ZeroX;
PROCEDURE -ZeroRI( adr: ADDRESS; inc, count: SIZE );
(** For 32 bit types *)
CODE {SYSTEM.i386}
MOV EDI, [ESP+adr] ; address OF dest index
MOV EBX, [ESP+inc] ;
MOV ECX, [ESP+count] ; counter
CMP EBX, 4 ;
JE fastzero ;
MOV EAX, 0 ;
loopL:
CMP ECX, 0 ;
JLE endL ;
MOV [EDI], EAX ;
ADD EDI, EBX ;
DEC ECX ;
JMP loopL ;
fastzero:
MOV EAX, 0 ; value
CLD ; incremental
REP ;
STOSD ;
endL:
ADD ESP, 12 ;
END ZeroRI;
PROCEDURE -ZeroXI( adr: ADDRESS; inc, count: SIZE );
(** For 64 bit types *)
CODE {SYSTEM.i386}
MOV EDI, [ESP+adr] ; address OF dest index
MOV EBX, [ESP+inc] ;
MOV ECX, [ESP+count] ; counter
MOV EAX, 0 ;
CMP EBX, 8 ;
JE fastzero ;
loopL:
CMP ECX, 0 ;
JLE endL ;
MOV [EDI], EAX ;
MOV [EDI+4], EAX ;
ADD EDI, EBX ;
DEC ECX ;
JMP loopL ;
fastzero:
SHL ECX, 1 ;
CLD ; incremental
REP ;
STOSD ;
endL:
ADD ESP, 12 ;
END ZeroXI;
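(* MovR/MovX copy count 32 bit resp. 64 bit elements from a strided source to a contiguous
destination, using REP MOVSD when the source increment equals the element size.
MovR5 copies a 5 x 4 tile of strided REAL values into an interleaved block layout
(one group of 4 consecutive values per destination column), the packed format consumed
by the 5-column SSE kernels above. *)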
  932. PROCEDURE -MovR( from, to0, frominc, count: SIZE );
  933. CODE {SYSTEM.i386}
  934. MOV EDI, [ESP+to0] ; TO
  935. MOV ESI, [ESP+from] ; from
  936. MOV ECX, [ESP+count] ; count
  937. MOV EBX, [ESP+frominc] ; inc
  938. CMP EBX, 4 ;
  939. JE fastmove ;
  940. loopL:
  941. CMP ECX, 0 ;
  942. JLE endL ;
  943. MOV EAX, [ESI] ;
  944. MOV [EDI], EAX ;
  945. ADD ESI, EBX ;
  946. ADD EDI, 4 ;
  947. DEC ECX ;
  948. JMP loopL ;
  949. fastmove:
  950. CLD ; incremental
  951. REP ;
  952. MOVSD ; move rest IN one byte steps
  953. endL:
  954. ADD ESP, 16 ;
  955. END MovR;
  956. PROCEDURE -MovX( from, to0: ADDRESS; frominc, count:SIZE );
  957. CODE {SYSTEM.i386}
  958. MOV EDI, [ESP+to0] ; TO
  959. MOV ESI, [ESP+from] ; from
  960. MOV ECX, [ESP+count] ; count
  961. MOV EBX, [ESP+frominc] ; inc
  962. CMP EBX, 8 ;
  963. JE fastmove ;
  964. loopL:
  965. CMP ECX, 0 ;
  966. JLE endL ;
  967. MOV EAX, [ESI] ;
  968. MOV [EDI], EAX ;
  969. MOV EAX, [ESI+4] ;
  970. MOV [EDI+4], EAX ;
  971. ADD ESI, EBX ;
  972. ADD EDI, 8 ;
  973. DEC ECX ;
  974. JMP loopL ;
  975. fastmove:
  976. SHL ECX, 1 ;
  977. CLD ; incremental
  978. REP ;
979. MOVSD ; move all elements IN doubleword (4 byte) steps
  980. endL:
  981. ADD ESP, 16 ;
  982. END MovX;
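(* MovR copies count 32 bit elements from a source with byte increment frominc to a contiguous
   destination; MovX does the same for 64 bit elements. Equivalent Oberon sketch (hypothetical
   helper, illustration only; assumes S = SYSTEM):

	PROCEDURE MovRSketch( from, to0: ADDRESS; frominc, count: SIZE );
	VAR x: SIGNED32;
	BEGIN
		WHILE count > 0 DO
			S.GET( from, x ); S.PUT( to0, x );
			INC( from, frominc ); INC( to0, 4 ); DEC( count )
		END
	END MovRSketch;
*)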
  983. PROCEDURE -MovR5( src: ADDRESS; inc, stride: SIZE; dest: ADDRESS; count: SIZE);
  984. CODE {SYSTEM.i386}
  985. MOV ESI, [ESP+src] ; src
  986. MOV EBX, [ESP+inc] ; inc
  987. MOV ECX, [ESP+stride] ; stride
  988. MOV EDI, [ESP+dest] ; dest
  989. loopL:
  990. MOV EAX, [ESP+count] ; count
  991. CMP EAX, 0 ;
  992. JLE endL ;
  993. SUB EAX, 4 ;
  994. MOV [ESP+count], EAX ;
  995. MOV EDX, ESI ;
  996. MOV EAX, [EDX] ;
  997. MOV [EDI], EAX ;
  998. ADD EDX, EBX ;
  999. MOV EAX, [EDX] ;
  1000. MOV [EDI+16], EAX ;
  1001. ADD EDX, EBX ;
  1002. MOV EAX, [EDX] ;
  1003. MOV [EDI+32], EAX ;
  1004. ADD EDX, EBX ;
  1005. MOV EAX, [EDX] ;
  1006. MOV [EDI+48], EAX ;
  1007. ADD EDX, EBX ;
  1008. MOV EAX, [EDX] ;
  1009. MOV [EDI+64], EAX ;
  1010. ADD ESI, ECX ;
  1011. ADD EDI, 4 ;
  1012. MOV EDX, ESI ;
  1013. MOV EAX, [EDX] ;
  1014. MOV [EDI], EAX ;
  1015. ADD EDX, EBX ;
  1016. MOV EAX, [EDX] ;
  1017. MOV [EDI+16], EAX ;
  1018. ADD EDX, EBX ;
  1019. MOV EAX, [EDX] ;
  1020. MOV [EDI+32], EAX ;
  1021. ADD EDX, EBX ;
  1022. MOV EAX, [EDX] ;
  1023. MOV [EDI+48], EAX ;
  1024. ADD EDX, EBX ;
  1025. MOV EAX, [EDX] ;
  1026. MOV [EDI+64], EAX ;
  1027. ADD ESI, ECX ;
  1028. ADD EDI, 4 ;
  1029. MOV EDX, ESI ;
  1030. MOV EAX, [EDX] ;
  1031. MOV [EDI], EAX ;
  1032. ADD EDX, EBX ;
  1033. MOV EAX, [EDX] ;
  1034. MOV [EDI+16], EAX ;
  1035. ADD EDX, EBX ;
  1036. MOV EAX, [EDX] ;
  1037. MOV [EDI+32], EAX ;
  1038. ADD EDX, EBX ;
  1039. MOV EAX, [EDX] ;
  1040. MOV [EDI+48], EAX ;
  1041. ADD EDX, EBX ;
  1042. MOV EAX, [EDX] ;
  1043. MOV [EDI+64], EAX ;
  1044. ADD ESI, ECX ;
  1045. ADD EDI, 4 ;
  1046. MOV EDX, ESI ;
  1047. MOV EAX, [EDX] ;
  1048. MOV [EDI], EAX ;
  1049. ADD EDX, EBX ;
  1050. MOV EAX, [EDX] ;
  1051. MOV [EDI+16], EAX ;
  1052. ADD EDX, EBX ;
  1053. MOV EAX, [EDX] ;
  1054. MOV [EDI+32], EAX ;
  1055. ADD EDX, EBX ;
  1056. MOV EAX, [EDX] ;
  1057. MOV [EDI+48], EAX ;
  1058. ADD EDX, EBX ;
  1059. MOV EAX, [EDX] ;
  1060. MOV [EDI+64], EAX ;
  1061. ADD ESI, ECX ;
  1062. ADD EDI, 4 ;
  1063. ADD EDI, 64 ;
  1064. JMP loopL ;
  1065. endL:
  1066. ADD ESP, 20 ;
  1067. END MovR5;
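(* MovR5 apparently packs a 5 x count tile of 32 bit elements (element (k, j) located at
   src + k*inc + j*stride, k = 0..4) into dest in groups of 4 columns: within each 80 byte group,
   row k occupies offset k*16 with the 4 column values stored contiguously; count is expected to
   be a multiple of 4, presumably to prepare operands for the blocked kernels above. *)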
  1068. (* *)
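(* The Add*Loop* procedures below compute an element-wise sum d[i] := l[i] + r[i] over len
   elements, with independent byte increments for both sources and the destination
   (AXAX: 64 bit LONGREAL, ARAR: 32 bit REAL). Oberon sketch of the FPU (A) variant
   (hypothetical helper, illustration only; assumes S = SYSTEM):

	PROCEDURE AddAXAXSketch( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
	VAR x, y: LONGREAL;
	BEGIN
		WHILE len > 0 DO
			S.GET( ladr, x ); S.GET( radr, y ); S.PUT( dadr, x + y );
			INC( ladr, linc ); INC( radr, rinc ); INC( dadr, dinc ); DEC( len )
		END
	END AddAXAXSketch;
*)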
  1069. PROCEDURE AddAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1070. CODE {SYSTEM.i386, SYSTEM.FPU}
  1071. MOV EAX, [EBP+len] ;
  1072. MOV EBX, [EBP+ladr] ;
  1073. MOV ECX, [EBP+radr] ;
  1074. MOV EDX, [EBP+dadr] ;
  1075. start:
  1076. CMP EAX, 0 ;
  1077. JLE endL ;
  1078. FLD QWORD [EBX] ;
  1079. ADD EBX, [EBP+linc] ;
  1080. FLD QWORD [ECX] ;
  1081. ADD ECX, [EBP+rinc] ;
  1082. FADDP ;
  1083. FSTP QWORD [EDX] ;
  1084. ADD EDX, [EBP+dinc] ;
  1085. DEC EAX ;
  1086. JMP start ;
  1087. endL:
  1088. FWAIT ;
  1089. END AddAXAXLoopA;
  1090. PROCEDURE AddARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1091. CODE {SYSTEM.i386, SYSTEM.FPU}
  1092. MOV EAX, [EBP+len] ;
  1093. MOV EBX, [EBP+ladr] ;
  1094. MOV ECX, [EBP+radr] ;
  1095. MOV EDX, [EBP+dadr] ;
  1096. start:
  1097. CMP EAX, 0 ;
  1098. JLE endL ;
  1099. FLD DWORD [EBX] ;
  1100. ADD EBX, [EBP+linc] ;
  1101. FLD DWORD [ECX] ;
  1102. ADD ECX, [EBP+rinc] ;
  1103. FADDP ;
  1104. FSTP DWORD [EDX] ;
  1105. ADD EDX, [EBP+dinc] ;
  1106. DEC EAX ;
  1107. JMP start ;
  1108. endL:
  1109. FWAIT ;
  1110. END AddARARLoopA;
  1111. PROCEDURE AddAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1112. CODE {SYSTEM.i386, SYSTEM.SSE2}
  1113. MOV EAX, [EBP+len] ;
  1114. CMP EAX, 0 ;
  1115. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  1116. MOV EBX, [EBP+ladr] ;
  1117. MOV ECX, [EBP+radr] ;
  1118. MOV EDX, [EBP+dadr] ;
  1119. ; check IF data are contiguous IN memory
1120. CMP [EBP+linc], 8 ; check left FOR continuity
1121. JNE single ; not continuous- > simplest method
1122. CMP [EBP+rinc], 8 ; check right FOR continuity
1123. JNE single ; not continuous- > simplest method
1124. CMP [EBP+dinc], 8 ; check destination FOR continuity
  1125. JNE single ; not continuous- > simplest method
  1126. ; check FOR alignment
  1127. MOV ESI, EBX ;
  1128. AND ESI, 7 ; ladr MOD 8
  1129. CMP ESI, 0 ; = 0- > 64 Bit alignment
  1130. JNE unaligned ; not 64 bit aligned
  1131. MOV ESI, ECX ;
  1132. AND ESI, 7 ; radr MOD 8
  1133. CMP ESI, 0 ; = 0- > 64 Bit alignment
  1134. JNE unaligned ; not 64 bit aligned
  1135. MOV ESI, EDX ;
  1136. AND ESI, 7 ; dadr MOD 8
  1137. CMP ESI, 0 ; = 0- > 64 Bit alignment
  1138. JNE unaligned ; not 64 bit aligned
  1139. MOV ESI, EBX ;
  1140. AND ESI, 8 ; 16 byte alignment
  1141. MOV EDI, ECX ;
  1142. AND EDI, 8 ; 16 byte alignment
  1143. CMP ESI, EDI ;
  1144. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and radr
  1145. MOV EDI, EDX ;
  1146. AND EDI, 8 ; 16 byte alignment
  1147. CMP ESI, EDI ;
  1148. JNE unaligned ; different 16 byte = 128 bit alignment OF dadr and radr
  1149. CMP ESI, 8 ;
1150. JNE aligned ; ladr, radr and dadr already 128 bit aligned
1151. ; one single element processing TO achieve 128 bit alignment
  1152. MOVSD XMM1, [EBX] ;
  1153. MOVSD XMM0, [ECX] ;
  1154. ADDSD XMM0, XMM1 ;
  1155. MOVSD [EDX], XMM0 ;
  1156. ADD EBX, 8 ; now EBX IS 16 byte aligned
1157. ADD ECX, 8 ; now ECX IS 16 byte aligned
  1158. ADD EDX, 8 ; now EDX IS 16 byte aligned ;
  1159. DEC EAX ; one element has been processed
  1160. aligned:
  1161. aligned8:
  1162. CMP EAX, 8 ;
1163. JL aligned2 ; len < 8- > continue with pairs
  1164. MOVAPD XMM0, [EBX] ;
  1165. MOVAPD XMM1, [EBX+16] ;
  1166. MOVAPD XMM2, [EBX+32] ;
  1167. MOVAPD XMM3, [EBX+48] ;
  1168. ADD EBX, 64 ;
  1169. MOVAPD XMM4, [ECX] ;
  1170. MOVAPD XMM5, [ECX+16] ;
  1171. MOVAPD XMM6, [ECX+32] ;
  1172. MOVAPD XMM7, [ECX+48] ;
  1173. ADD ECX, 64 ;
  1174. ADDPD XMM0, XMM4 ;
  1175. ADDPD XMM1, XMM5 ;
  1176. ADDPD XMM2, XMM6 ;
  1177. ADDPD XMM3, XMM7 ;
  1178. MOVAPD [EDX], XMM0 ;
  1179. MOVAPD [EDX+16], XMM1 ;
  1180. MOVAPD [EDX+32], XMM2 ;
  1181. MOVAPD [EDX+48], XMM3 ;
  1182. ADD EDX, 64 ;
  1183. SUB EAX, 8 ;
  1184. JMP aligned8 ;
  1185. ; LOOP FOR 2 pieces aligned
  1186. aligned2: ;
  1187. CMP EAX, 2 ;
  1188. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1189. MOVAPD XMM0, [EBX] ;
  1190. ADD EBX, 16 ;
  1191. MOVAPD XMM1, [ECX] ;
  1192. ADD ECX, 16 ;
  1193. ADDPD XMM0, XMM1 ;
  1194. MOVAPD [EDX], XMM0 ;
  1195. ADD EDX, 16 ;
  1196. SUB EAX, 2 ;
  1197. JMP aligned2 ;
  1198. ; LOOP FOR 8 unaligned pieces(14 pieces not better!)
  1199. unaligned: ;
  1200. unaligned8: ;
  1201. CMP EAX, 8 ;
1202. JL unaligned2 ; len < 8- > continue with pairs
  1203. MOVUPD XMM0, [EBX] ;
  1204. MOVUPD XMM1, [EBX+16] ;
  1205. MOVUPD XMM2, [EBX+32] ;
  1206. MOVUPD XMM3, [EBX+48] ;
  1207. ADD EBX, 64 ;
  1208. MOVUPD XMM4, [ECX] ;
  1209. MOVUPD XMM5, [ECX+16] ;
  1210. MOVUPD XMM6, [ECX+32] ;
  1211. MOVUPD XMM7, [ECX+48] ;
  1212. ADD ECX, 64 ;
  1213. ADDPD XMM0, XMM4 ;
  1214. ADDPD XMM1, XMM5 ;
  1215. ADDPD XMM2, XMM6 ;
  1216. ADDPD XMM3, XMM7 ;
  1217. MOVUPD [EDX], XMM0 ;
  1218. MOVUPD [EDX+16], XMM1 ;
  1219. MOVUPD [EDX+32], XMM2 ;
  1220. MOVUPD [EDX+48], XMM3 ;
  1221. ADD EDX, 64 ;
  1222. SUB EAX, 8 ;
  1223. JMP unaligned8 ;
  1224. ; LOOP FOR 2 pieces aligned
  1225. unaligned2: ;
  1226. CMP EAX, 2 ;
  1227. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1228. MOVUPD XMM0, [EBX] ;
  1229. ADD EBX, 16 ;
  1230. MOVUPD XMM1, [ECX] ;
  1231. ADD ECX, 16 ;
  1232. ADDPD XMM0, XMM1 ;
  1233. MOVUPD [EDX], XMM0 ;
  1234. ADD EDX, 16 ;
  1235. SUB EAX, 2 ;
  1236. JMP unaligned2 ;
  1237. ; one piece left OR non-contiguous data
  1238. single:
  1239. singlepieces: ;
  1240. CMP EAX, 0 ;
  1241. JLE endL ; len <= 0- > EXIT
  1242. MOVSD XMM0, [EBX]
  1243. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  1244. MOVSD XMM1, [ECX]
  1245. ADD ECX, [EBP+rinc] ; INC(ladr, incl)
  1246. ADDSD XMM0, XMM1 ;
  1247. MOVSD [EDX], XMM0
  1248. ADD EDX, [EBP+dinc] ; INC(radr, incr)
  1249. DEC EAX ; DEC(len)
  1250. JMP singlepieces ;
  1251. endL:
  1252. END AddAXAXLoopSSE;
  1253. PROCEDURE AddARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1254. CODE {SYSTEM.i386, SYSTEM.SSE2}
  1255. MOV EAX, [EBP+len] ;
  1256. CMP EAX, 0 ;
  1257. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  1258. MOV EBX, [EBP+ladr] ;
  1259. MOV ECX, [EBP+radr] ;
  1260. MOV EDX, [EBP+dadr] ;
  1261. ; check IF data are contiguous IN memory
1262. CMP [EBP+linc], 4 ; check left FOR continuity
1263. JNE single ; not continuous- > simplest method
1264. CMP [EBP+rinc], 4 ; check right FOR continuity
1265. JNE single ; not continuous- > simplest method
1266. CMP [EBP+dinc], 4 ; check destination FOR continuity
  1267. JNE single ; not continuous- > simplest method
  1268. ; check FOR alignment
  1269. MOV ESI, EBX ;
  1270. AND ESI, 3 ; ladr MOD 4
  1271. CMP ESI, 0 ; = 0- > 32 Bit alignment
  1272. JNE unaligned ; not 32 bit aligned
  1273. MOV ESI, ECX ;
  1274. AND ESI, 3 ; radr MOD 4
  1275. CMP ESI, 0 ; = 0- > 32 Bit alignment
  1276. JNE unaligned ; not 32 bit aligned
  1277. MOV ESI, EDX ;
  1278. AND ESI, 3 ; dadr MOD 4
  1279. CMP ESI, 0 ; = 0- > 32 Bit alignment
  1280. JNE unaligned ; not 32 bit aligned
  1281. MOV ESI, EBX ;
  1282. AND ESI, 8+4 ; 16 byte alignment?
  1283. MOV EDI, ECX ;
  1284. AND EDI, 8+4 ; 16 byte alignment?
  1285. CMP ESI, EDI ;
  1286. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and radr
  1287. MOV EDI, EDX ;
  1288. AND EDI, 8+4 ; 16 byte alignment
  1289. CMP ESI, EDI ;
  1290. JNE unaligned ; different 16 byte = 128 bit alignment OF dadr and radr
  1291. CMP ESI, 0 ;
  1292. JE aligned ; already aligned
  1293. align:
1294. ; one single element processing UNTIL 128 bit alignment achieved
  1295. MOVSS XMM1, [EBX] ;
  1296. MOVSS XMM0, [ECX] ;
  1297. ADDSS XMM0, XMM1 ;
  1298. MOVSS [EDX], XMM0 ;
  1299. ADD EBX, 4 ;
  1300. ADD ECX, 4 ;
  1301. ADD EDX, 4 ;
  1302. DEC EAX ; one element has been processed ;
  1303. CMP EAX, 0 ; all elements already processed?
  1304. JLE single ;
  1305. MOV ESI, EBX ;
  1306. AND ESI, 8+4 ;
  1307. CMP ESI, 0 ;
  1308. JNE align ;
  1309. aligned:
  1310. aligned16:
  1311. CMP EAX, 16 ;
  1312. JL aligned4 ; len < 16- > EXIT TO singlepieces
  1313. MOVAPS XMM0, [EBX] ;
  1314. MOVAPS XMM1, [EBX+16] ;
  1315. MOVAPS XMM2, [EBX+32] ;
  1316. MOVAPS XMM3, [EBX+48] ;
  1317. ADD EBX, 64 ;
  1318. MOVAPS XMM4, [ECX] ;
  1319. MOVAPS XMM5, [ECX+16] ;
  1320. MOVAPS XMM6, [ECX+32] ;
  1321. MOVAPS XMM7, [ECX+48] ;
  1322. ADD ECX, 64 ;
  1323. ADDPS XMM0, XMM4 ;
  1324. ADDPS XMM1, XMM5 ;
  1325. ADDPS XMM2, XMM6 ;
  1326. ADDPS XMM3, XMM7 ;
  1327. MOVAPS [EDX], XMM0 ;
  1328. MOVAPS [EDX+16], XMM1 ;
  1329. MOVAPS [EDX+32], XMM2 ;
  1330. MOVAPS [EDX+48], XMM3 ;
  1331. ADD EDX, 64 ;
  1332. SUB EAX, 16 ;
  1333. JMP aligned16 ;
  1334. ; LOOP FOR 2 pieces aligned
  1335. aligned4: ;
  1336. CMP EAX, 4 ;
  1337. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1338. MOVAPS XMM0, [EBX] ;
  1339. ADD EBX, 16 ;
  1340. MOVAPS XMM1, [ECX] ;
  1341. ADD ECX, 16 ;
  1342. ADDPS XMM0, XMM1 ;
  1343. MOVAPS [EDX], XMM0 ;
  1344. ADD EDX, 16 ;
  1345. SUB EAX, 4 ;
  1346. JMP aligned4 ;
  1347. ; LOOP FOR 8 unaligned pieces(14 pieces not better!)
  1348. unaligned: ;
  1349. unaligned16: ;
  1350. CMP EAX, 16 ;
  1351. JL unaligned4 ; len < 4- > EXIT TO singlepieces
  1352. MOVUPS XMM0, [EBX] ;
  1353. MOVUPS XMM1, [EBX+16] ;
  1354. MOVUPS XMM2, [EBX+32] ;
  1355. MOVUPS XMM3, [EBX+48] ;
  1356. ADD EBX, 64 ;
  1357. MOVUPS XMM4, [ECX] ;
  1358. MOVUPS XMM5, [ECX+16] ;
  1359. MOVUPS XMM6, [ECX+32] ;
  1360. MOVUPS XMM7, [ECX+48] ;
  1361. ADD ECX, 64 ;
  1362. ADDPS XMM0, XMM4 ;
  1363. ADDPS XMM1, XMM5 ;
  1364. ADDPS XMM2, XMM6 ;
  1365. ADDPS XMM3, XMM7 ;
  1366. MOVUPS [EDX], XMM0 ;
  1367. MOVUPS [EDX+16], XMM1 ;
  1368. MOVUPS [EDX+32], XMM2 ;
  1369. MOVUPS [EDX+48], XMM3 ;
  1370. ADD EDX, 64 ;
  1371. SUB EAX, 16 ;
  1372. JMP unaligned16 ;
  1373. ; LOOP FOR 2 pieces aligned
  1374. unaligned4: ;
  1375. CMP EAX, 4 ;
  1376. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1377. MOVUPS XMM0, [EBX] ;
  1378. ADD EBX, 16 ;
  1379. MOVUPS XMM1, [ECX] ;
  1380. ADD ECX, 16 ;
  1381. ADDPS XMM0, XMM1 ;
  1382. MOVUPS [EDX], XMM0 ;
  1383. ADD EDX, 16 ;
  1384. SUB EAX, 4 ;
  1385. JMP unaligned4 ;
  1386. ; one piece left OR non-contiguous data
  1387. single:
  1388. singlepieces: ;
  1389. CMP EAX, 0 ;
  1390. JLE endL ; len <= 0- > EXIT
  1391. MOVSS XMM0, [EBX]
  1392. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  1393. MOVSS XMM1, [ECX]
  1394. ADD ECX, [EBP+rinc] ; INC(ladr, incl)
  1395. ADDSS XMM0, XMM1 ;
  1396. MOVSS [EDX], XMM0
  1397. ADD EDX, [EBP+dinc] ; INC(radr, incr)
  1398. DEC EAX ; DEC(len)
  1399. JMP singlepieces ;
  1400. endL:
  1401. END AddARARLoopSSE;
  1402. (* *)
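(* The Sub*Loop* procedures mirror the Add*Loop* procedures above with subtraction,
   i.e. d[i] := l[i] - r[i] (left operand minus right operand). *)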
  1403. PROCEDURE SubAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1404. CODE {SYSTEM.i386, SYSTEM.FPU}
  1405. MOV EAX, [EBP+len] ;
  1406. MOV EBX, [EBP+ladr] ;
  1407. MOV ECX, [EBP+radr] ;
  1408. MOV EDX, [EBP+dadr] ;
  1409. start:
  1410. CMP EAX, 0 ;
  1411. JLE endL ;
  1412. FLD QWORD [EBX] ;
  1413. ADD EBX, [EBP+linc] ;
  1414. FLD QWORD [ECX] ;
  1415. ADD ECX, [EBP+rinc] ;
  1416. FSUBP ;
  1417. FSTP QWORD [EDX] ;
  1418. ADD EDX, [EBP+dinc] ;
  1419. DEC EAX ;
  1420. JMP start ;
  1421. endL:
  1422. FWAIT ;
  1423. END SubAXAXLoopA;
  1424. PROCEDURE SubARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1425. CODE {SYSTEM.i386, SYSTEM.FPU}
  1426. MOV EAX, [EBP+len] ;
  1427. MOV EBX, [EBP+ladr] ;
  1428. MOV ECX, [EBP+radr] ;
  1429. MOV EDX, [EBP+dadr] ;
  1430. start:
  1431. CMP EAX, 0 ;
  1432. JLE endL ;
  1433. FLD DWORD [EBX] ;
  1434. ADD EBX, [EBP+linc] ;
  1435. FLD DWORD [ECX] ;
  1436. ADD ECX, [EBP+rinc] ;
  1437. FSUBP ;
  1438. FSTP DWORD [EDX] ;
  1439. ADD EDX, [EBP+dinc] ;
  1440. DEC EAX ;
  1441. JMP start ;
  1442. endL:
  1443. FWAIT ;
  1444. END SubARARLoopA;
  1445. PROCEDURE SubAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1446. CODE {SYSTEM.i386, SYSTEM.SSE2}
  1447. MOV EAX, [EBP+len] ;
  1448. CMP EAX, 0 ;
  1449. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  1450. MOV EBX, [EBP+ladr] ;
  1451. MOV ECX, [EBP+radr] ;
  1452. MOV EDX, [EBP+dadr] ;
  1453. ; check IF data are contiguous IN memory
1454. CMP [EBP+linc], 8 ; check left FOR continuity
1455. JNE single ; not continuous- > simplest method
1456. CMP [EBP+rinc], 8 ; check right FOR continuity
1457. JNE single ; not continuous- > simplest method
1458. CMP [EBP+dinc], 8 ; check destination FOR continuity
  1459. JNE single ; not continuous- > simplest method
  1460. ; check FOR alignment
  1461. MOV ESI, EBX ;
  1462. AND ESI, 7 ; ladr MOD 8
  1463. CMP ESI, 0 ; = 0- > 64 Bit alignment
  1464. JNE unaligned ; not 64 bit aligned
  1465. MOV ESI, ECX ;
  1466. AND ESI, 7 ; radr MOD 8
  1467. CMP ESI, 0 ; = 0- > 64 Bit alignment
  1468. JNE unaligned ; not 64 bit aligned
  1469. MOV ESI, EDX ;
  1470. AND ESI, 7 ; dadr MOD 8
  1471. CMP ESI, 0 ; = 0- > 64 Bit alignment
  1472. JNE unaligned ; not 64 bit aligned
  1473. MOV ESI, EBX ;
  1474. AND ESI, 8 ; 16 byte alignment
  1475. MOV EDI, ECX ;
  1476. AND EDI, 8 ; 16 byte alignment
  1477. CMP ESI, EDI ;
  1478. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and radr
  1479. MOV EDI, EDX ;
  1480. AND EDI, 8 ; 16 byte alignment
  1481. CMP ESI, EDI ;
  1482. JNE unaligned ; different 16 byte = 128 bit alignment OF dadr and radr
  1483. CMP ESI, 8 ;
1484. JNE aligned ; ladr, radr and dadr already 128 bit aligned
1485. ; one single element processing TO achieve 128 bit alignment
1486. MOVSD XMM0, [EBX] ;
1487. MOVSD XMM1, [ECX] ;
1488. SUBSD XMM0, XMM1 ; left - right, matching the main loop
  1489. MOVSD [EDX], XMM0 ;
  1490. ADD EBX, 8 ; now EBX IS 16 byte aligned
1491. ADD ECX, 8 ; now ECX IS 16 byte aligned
  1492. ADD EDX, 8 ; now EDX IS 16 byte aligned ;
  1493. DEC EAX ; one element has been processed
  1494. aligned:
  1495. aligned8:
  1496. CMP EAX, 8 ;
1497. JL aligned2 ; len < 8- > continue with pairs
  1498. MOVAPD XMM0, [EBX] ;
  1499. MOVAPD XMM1, [EBX+16] ;
  1500. MOVAPD XMM2, [EBX+32] ;
  1501. MOVAPD XMM3, [EBX+48] ;
  1502. ADD EBX, 64 ;
  1503. MOVAPD XMM4, [ECX] ;
  1504. MOVAPD XMM5, [ECX+16] ;
  1505. MOVAPD XMM6, [ECX+32] ;
  1506. MOVAPD XMM7, [ECX+48] ;
  1507. ADD ECX, 64 ;
  1508. SUBPD XMM0, XMM4 ;
  1509. SUBPD XMM1, XMM5 ;
  1510. SUBPD XMM2, XMM6 ;
  1511. SUBPD XMM3, XMM7 ;
  1512. MOVAPD [EDX], XMM0 ;
  1513. MOVAPD [EDX+16], XMM1 ;
  1514. MOVAPD [EDX+32], XMM2 ;
  1515. MOVAPD [EDX+48], XMM3 ;
  1516. ADD EDX, 64 ;
  1517. SUB EAX, 8 ;
  1518. JMP aligned8 ;
  1519. ; LOOP FOR 2 pieces aligned
  1520. aligned2: ;
  1521. CMP EAX, 2 ;
  1522. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1523. MOVAPD XMM0, [EBX] ;
  1524. ADD EBX, 16 ;
  1525. MOVAPD XMM1, [ECX] ;
  1526. ADD ECX, 16 ;
  1527. SUBPD XMM0, XMM1 ;
  1528. MOVAPD [EDX], XMM0 ;
  1529. ADD EDX, 16 ;
  1530. SUB EAX, 2 ;
  1531. JMP aligned2 ;
  1532. ; LOOP FOR 8 unaligned pieces(14 pieces not better!)
  1533. unaligned: ;
  1534. unaligned8: ;
  1535. CMP EAX, 8 ;
1536. JL unaligned2 ; len < 8- > continue with pairs
  1537. MOVUPD XMM0, [EBX] ;
  1538. MOVUPD XMM1, [EBX+16] ;
  1539. MOVUPD XMM2, [EBX+32] ;
  1540. MOVUPD XMM3, [EBX+48] ;
  1541. ADD EBX, 64 ;
  1542. MOVUPD XMM4, [ECX] ;
  1543. MOVUPD XMM5, [ECX+16] ;
  1544. MOVUPD XMM6, [ECX+32] ;
  1545. MOVUPD XMM7, [ECX+48] ;
  1546. ADD ECX, 64 ;
  1547. SUBPD XMM0, XMM4 ;
  1548. SUBPD XMM1, XMM5 ;
  1549. SUBPD XMM2, XMM6 ;
  1550. SUBPD XMM3, XMM7 ;
  1551. MOVUPD [EDX], XMM0 ;
  1552. MOVUPD [EDX+16], XMM1 ;
  1553. MOVUPD [EDX+32], XMM2 ;
  1554. MOVUPD [EDX+48], XMM3 ;
  1555. ADD EDX, 64 ;
  1556. SUB EAX, 8 ;
  1557. JMP unaligned8 ;
  1558. ; LOOP FOR 2 pieces aligned
  1559. unaligned2: ;
  1560. CMP EAX, 2 ;
  1561. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1562. MOVUPD XMM0, [EBX] ;
  1563. ADD EBX, 16 ;
  1564. MOVUPD XMM1, [ECX] ;
  1565. ADD ECX, 16 ;
  1566. SUBPD XMM0, XMM1 ;
  1567. MOVUPD [EDX], XMM0 ;
  1568. ADD EDX, 16 ;
  1569. SUB EAX, 2 ;
  1570. JMP unaligned2 ;
  1571. ; one piece left OR non-contiguous data
  1572. single:
  1573. singlepieces: ;
  1574. CMP EAX, 0 ;
  1575. JLE endL ; len <= 0- > EXIT
  1576. MOVSD XMM0, [EBX]
  1577. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  1578. MOVSD XMM1, [ECX]
  1579. ADD ECX, [EBP+rinc] ; INC(ladr, incl)
  1580. SUBSD XMM0, XMM1 ;
  1581. MOVSD [EDX], XMM0
  1582. ADD EDX, [EBP+dinc] ; INC(radr, incr)
  1583. DEC EAX ; DEC(len)
  1584. JMP singlepieces ;
  1585. endL:
  1586. END SubAXAXLoopSSE;
  1587. PROCEDURE SubARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1588. CODE {SYSTEM.i386, SYSTEM.SSE2}
  1589. MOV EAX, [EBP+len] ;
  1590. CMP EAX, 0 ;
  1591. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  1592. MOV EBX, [EBP+ladr] ;
  1593. MOV ECX, [EBP+radr] ;
  1594. MOV EDX, [EBP+dadr] ;
  1595. ; check IF data are contiguous IN memory
1596. CMP [EBP+linc], 4 ; check left FOR continuity
1597. JNE single ; not continuous- > simplest method
1598. CMP [EBP+rinc], 4 ; check right FOR continuity
1599. JNE single ; not continuous- > simplest method
1600. CMP [EBP+dinc], 4 ; check destination FOR continuity
  1601. JNE single ; not continuous- > simplest method
  1602. ; check FOR alignment
  1603. MOV ESI, EBX ;
  1604. AND ESI, 3 ; ladr MOD 4
  1605. CMP ESI, 0 ; = 0- > 32 Bit alignment
  1606. JNE unaligned ; not 32 bit aligned
  1607. MOV ESI, ECX ;
  1608. AND ESI, 3 ; radr MOD 4
  1609. CMP ESI, 0 ; = 0- > 32 Bit alignment
  1610. JNE unaligned ; not 32 bit aligned
  1611. MOV ESI, EDX ;
  1612. AND ESI, 3 ; dadr MOD 4
  1613. CMP ESI, 0 ; = 0- > 32 Bit alignment
  1614. JNE unaligned ; not 32 bit aligned
  1615. MOV ESI, EBX ;
  1616. AND ESI, 8+4 ; 16 byte alignment?
  1617. MOV EDI, ECX ;
  1618. AND EDI, 8+4 ; 16 byte alignment?
  1619. CMP ESI, EDI ;
  1620. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and radr
  1621. MOV EDI, EDX ;
  1622. AND EDI, 8+4 ; 16 byte alignment
  1623. CMP ESI, EDI ;
  1624. JNE unaligned ; different 16 byte = 128 bit alignment OF dadr and radr
  1625. CMP ESI, 0 ;
  1626. JE aligned ; already aligned
  1627. align:
1628. ; one single element processing UNTIL 128 bit alignment achieved
1629. MOVSS XMM0, [EBX] ;
1630. MOVSS XMM1, [ECX] ;
1631. SUBSS XMM0, XMM1 ; left - right, matching the main loop
  1632. MOVSS [EDX], XMM0 ;
  1633. ADD EBX, 4 ;
  1634. ADD ECX, 4 ;
  1635. ADD EDX, 4 ;
  1636. DEC EAX ; one element has been processed ;
  1637. CMP EAX, 0 ; all elements already processed?
  1638. JLE single ;
  1639. MOV ESI, EBX ;
  1640. AND ESI, 8+4 ;
  1641. CMP ESI, 0 ;
  1642. JNE align ;
  1643. aligned:
  1644. aligned16:
  1645. CMP EAX, 16 ;
  1646. JL aligned4 ; len < 16- > EXIT TO singlepieces
  1647. MOVAPS XMM0, [EBX] ;
  1648. MOVAPS XMM1, [EBX+16] ;
  1649. MOVAPS XMM2, [EBX+32] ;
  1650. MOVAPS XMM3, [EBX+48] ;
  1651. ADD EBX, 64 ;
  1652. MOVAPS XMM4, [ECX] ;
  1653. MOVAPS XMM5, [ECX+16] ;
  1654. MOVAPS XMM6, [ECX+32] ;
  1655. MOVAPS XMM7, [ECX+48] ;
  1656. ADD ECX, 64 ;
  1657. SUBPS XMM0, XMM4 ;
  1658. SUBPS XMM1, XMM5 ;
  1659. SUBPS XMM2, XMM6 ;
  1660. SUBPS XMM3, XMM7 ;
  1661. MOVAPS [EDX], XMM0 ;
  1662. MOVAPS [EDX+16], XMM1 ;
  1663. MOVAPS [EDX+32], XMM2 ;
  1664. MOVAPS [EDX+48], XMM3 ;
  1665. ADD EDX, 64 ;
  1666. SUB EAX, 16 ;
  1667. JMP aligned16 ;
  1668. ; LOOP FOR 2 pieces aligned
  1669. aligned4: ;
  1670. CMP EAX, 4 ;
  1671. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1672. MOVAPS XMM0, [EBX] ;
  1673. ADD EBX, 16 ;
  1674. MOVAPS XMM1, [ECX] ;
  1675. ADD ECX, 16 ;
  1676. SUBPS XMM0, XMM1 ;
  1677. MOVAPS [EDX], XMM0 ;
  1678. ADD EDX, 16 ;
  1679. SUB EAX, 4 ;
  1680. JMP aligned4 ;
  1681. ; LOOP FOR 8 unaligned pieces(14 pieces not better!)
  1682. unaligned: ;
  1683. unaligned16: ;
  1684. CMP EAX, 16 ;
  1685. JL unaligned4 ; len < 4- > EXIT TO singlepieces
  1686. MOVUPS XMM0, [EBX] ;
  1687. MOVUPS XMM1, [EBX+16] ;
  1688. MOVUPS XMM2, [EBX+32] ;
  1689. MOVUPS XMM3, [EBX+48] ;
  1690. ADD EBX, 64 ;
  1691. MOVUPS XMM4, [ECX] ;
  1692. MOVUPS XMM5, [ECX+16] ;
  1693. MOVUPS XMM6, [ECX+32] ;
  1694. MOVUPS XMM7, [ECX+48] ;
  1695. ADD ECX, 64 ;
  1696. SUBPS XMM0, XMM4 ;
  1697. SUBPS XMM1, XMM5 ;
  1698. SUBPS XMM2, XMM6 ;
  1699. SUBPS XMM3, XMM7 ;
  1700. MOVUPS [EDX], XMM0 ;
  1701. MOVUPS [EDX+16], XMM1 ;
  1702. MOVUPS [EDX+32], XMM2 ;
  1703. MOVUPS [EDX+48], XMM3 ;
  1704. ADD EDX, 64 ;
  1705. SUB EAX, 16 ;
  1706. JMP unaligned16 ;
  1707. ; LOOP FOR 2 pieces aligned
  1708. unaligned4: ;
  1709. CMP EAX, 4 ;
  1710. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1711. MOVUPS XMM0, [EBX] ;
  1712. ADD EBX, 16 ;
  1713. MOVUPS XMM1, [ECX] ;
  1714. ADD ECX, 16 ;
  1715. SUBPS XMM0, XMM1 ;
  1716. MOVUPS [EDX], XMM0 ;
  1717. ADD EDX, 16 ;
  1718. SUB EAX, 4 ;
  1719. JMP unaligned4 ;
  1720. ; one piece left OR non-contiguous data
  1721. single:
  1722. singlepieces: ;
  1723. CMP EAX, 0 ;
  1724. JLE endL ; len <= 0- > EXIT
  1725. MOVSS XMM0, [EBX]
  1726. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  1727. MOVSS XMM1, [ECX]
  1728. ADD ECX, [EBP+rinc] ; INC(ladr, incl)
  1729. SUBSS XMM0, XMM1 ;
  1730. MOVSS [EDX], XMM0
  1731. ADD EDX, [EBP+dinc] ; INC(radr, incr)
  1732. DEC EAX ; DEC(len)
  1733. JMP singlepieces ;
  1734. endL:
  1735. END SubARARLoopSSE;
  1736. (* *)
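(* The EMul*Loop* procedures compute the element-wise (Hadamard) product d[i] := l[i] * r[i],
   again as an FPU variant and an SSE variant for 64 bit and 32 bit reals. *)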
  1737. PROCEDURE EMulAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1738. CODE {SYSTEM.i386, SYSTEM.FPU}
  1739. MOV EAX, [EBP+len] ;
  1740. MOV EBX, [EBP+ladr] ;
  1741. MOV ECX, [EBP+radr] ;
  1742. MOV EDX, [EBP+dadr] ;
  1743. start:
  1744. CMP EAX, 0 ;
  1745. JLE endL ;
  1746. FLD QWORD [EBX] ;
  1747. ADD EBX, [EBP+linc] ;
  1748. FLD QWORD [ECX] ;
  1749. ADD ECX, [EBP+rinc] ;
  1750. FMULP ;
  1751. FSTP QWORD [EDX] ;
  1752. ADD EDX, [EBP+dinc] ;
  1753. DEC EAX ;
  1754. JMP start ;
  1755. endL:
  1756. FWAIT ;
  1757. END EMulAXAXLoopA;
  1758. PROCEDURE EMulARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1759. CODE {SYSTEM.i386, SYSTEM.FPU}
  1760. MOV EAX, [EBP+len] ;
  1761. MOV EBX, [EBP+ladr] ;
  1762. MOV ECX, [EBP+radr] ;
  1763. MOV EDX, [EBP+dadr] ;
  1764. start:
  1765. CMP EAX, 0 ;
  1766. JLE endL ;
  1767. FLD DWORD [EBX] ;
  1768. ADD EBX, [EBP+linc] ;
  1769. FLD DWORD [ECX] ;
  1770. ADD ECX, [EBP+rinc] ;
  1771. FMULP ;
  1772. FSTP DWORD [EDX] ;
  1773. ADD EDX, [EBP+dinc] ;
  1774. DEC EAX ;
  1775. JMP start ;
  1776. endL:
  1777. FWAIT ;
  1778. END EMulARARLoopA;
  1779. PROCEDURE EMulAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1780. CODE {SYSTEM.i386, SYSTEM.SSE2}
  1781. MOV EAX, [EBP+len] ;
  1782. CMP EAX, 0 ;
  1783. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  1784. MOV EBX, [EBP+ladr] ;
  1785. MOV ECX, [EBP+radr] ;
  1786. MOV EDX, [EBP+dadr] ;
  1787. ; check IF data are contiguous IN memory
1788. CMP [EBP+linc], 8 ; check left FOR continuity
1789. JNE single ; not continuous- > simplest method
1790. CMP [EBP+rinc], 8 ; check right FOR continuity
1791. JNE single ; not continuous- > simplest method
1792. CMP [EBP+dinc], 8 ; check destination FOR continuity
  1793. JNE single ; not continuous- > simplest method
  1794. ; check FOR alignment
  1795. MOV ESI, EBX ;
  1796. AND ESI, 7 ; ladr MOD 8
  1797. CMP ESI, 0 ; = 0- > 64 Bit alignment
  1798. JNE unaligned ; not 64 bit aligned
  1799. MOV ESI, ECX ;
  1800. AND ESI, 7 ; radr MOD 8
  1801. CMP ESI, 0 ; = 0- > 64 Bit alignment
  1802. JNE unaligned ; not 64 bit aligned
  1803. MOV ESI, EDX ;
  1804. AND ESI, 7 ; dadr MOD 8
  1805. CMP ESI, 0 ; = 0- > 64 Bit alignment
  1806. JNE unaligned ; not 64 bit aligned
  1807. MOV ESI, EBX ;
  1808. AND ESI, 8 ; 16 byte alignment
  1809. MOV EDI, ECX ;
  1810. AND EDI, 8 ; 16 byte alignment
  1811. CMP ESI, EDI ;
  1812. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and radr
  1813. MOV EDI, EDX ;
  1814. AND EDI, 8 ; 16 byte alignment
  1815. CMP ESI, EDI ;
  1816. JNE unaligned ; different 16 byte = 128 bit alignment OF dadr and radr
  1817. CMP ESI, 8 ;
1818. JNE aligned ; ladr, radr and dadr already 128 bit aligned
1819. ; one single element processing TO achieve 128 bit alignment
  1820. MOVSD XMM1, [EBX] ;
  1821. MOVSD XMM0, [ECX] ;
  1822. MULSD XMM0, XMM1 ;
  1823. MOVSD [EDX], XMM0 ;
  1824. ADD EBX, 8 ; now EBX IS 16 byte aligned
1825. ADD ECX, 8 ; now ECX IS 16 byte aligned
  1826. ADD EDX, 8 ; now EDX IS 16 byte aligned ;
  1827. DEC EAX ; one element has been processed
  1828. aligned:
  1829. aligned8:
  1830. CMP EAX, 8 ;
1831. JL aligned2 ; len < 8- > continue with pairs
  1832. MOVAPD XMM0, [EBX] ;
  1833. MOVAPD XMM1, [EBX+16] ;
  1834. MOVAPD XMM2, [EBX+32] ;
  1835. MOVAPD XMM3, [EBX+48] ;
  1836. ADD EBX, 64 ;
  1837. MOVAPD XMM4, [ECX] ;
  1838. MOVAPD XMM5, [ECX+16] ;
  1839. MOVAPD XMM6, [ECX+32] ;
  1840. MOVAPD XMM7, [ECX+48] ;
  1841. ADD ECX, 64 ;
  1842. MULPD XMM0, XMM4 ;
  1843. MULPD XMM1, XMM5 ;
  1844. MULPD XMM2, XMM6 ;
  1845. MULPD XMM3, XMM7 ;
  1846. MOVAPD [EDX], XMM0 ;
  1847. MOVAPD [EDX+16], XMM1 ;
  1848. MOVAPD [EDX+32], XMM2 ;
  1849. MOVAPD [EDX+48], XMM3 ;
  1850. ADD EDX, 64 ;
  1851. SUB EAX, 8 ;
  1852. JMP aligned8 ;
  1853. ; LOOP FOR 2 pieces aligned
  1854. aligned2: ;
  1855. CMP EAX, 2 ;
  1856. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1857. MOVAPD XMM0, [EBX] ;
  1858. ADD EBX, 16 ;
  1859. MOVAPD XMM1, [ECX] ;
  1860. ADD ECX, 16 ;
  1861. MULPD XMM0, XMM1 ;
  1862. MOVAPD [EDX], XMM0 ;
  1863. ADD EDX, 16 ;
  1864. SUB EAX, 2 ;
  1865. JMP aligned2 ;
  1866. ; LOOP FOR 8 unaligned pieces(14 pieces not better!)
  1867. unaligned: ;
  1868. unaligned8: ;
  1869. CMP EAX, 8 ;
1870. JL unaligned2 ; len < 8- > continue with pairs
  1871. MOVUPD XMM0, [EBX] ;
  1872. MOVUPD XMM1, [EBX+16] ;
  1873. MOVUPD XMM2, [EBX+32] ;
  1874. MOVUPD XMM3, [EBX+48] ;
  1875. ADD EBX, 64 ;
  1876. MOVUPD XMM4, [ECX] ;
  1877. MOVUPD XMM5, [ECX+16] ;
  1878. MOVUPD XMM6, [ECX+32] ;
  1879. MOVUPD XMM7, [ECX+48] ;
  1880. ADD ECX, 64 ;
  1881. MULPD XMM0, XMM4 ;
  1882. MULPD XMM1, XMM5 ;
  1883. MULPD XMM2, XMM6 ;
  1884. MULPD XMM3, XMM7 ;
  1885. MOVUPD [EDX], XMM0 ;
  1886. MOVUPD [EDX+16], XMM1 ;
  1887. MOVUPD [EDX+32], XMM2 ;
  1888. MOVUPD [EDX+48], XMM3 ;
  1889. ADD EDX, 64 ;
  1890. SUB EAX, 8 ;
  1891. JMP unaligned8 ;
  1892. ; LOOP FOR 2 pieces aligned
  1893. unaligned2: ;
  1894. CMP EAX, 2 ;
  1895. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1896. MOVUPD XMM0, [EBX] ;
  1897. ADD EBX, 16 ;
  1898. MOVUPD XMM1, [ECX] ;
  1899. ADD ECX, 16 ;
  1900. MULPD XMM0, XMM1 ;
  1901. MOVUPD [EDX], XMM0 ;
  1902. ADD EDX, 16 ;
  1903. SUB EAX, 2 ;
  1904. JMP unaligned2 ;
  1905. ; one piece left OR non-contiguous data
  1906. single:
  1907. singlepieces: ;
  1908. CMP EAX, 0 ;
  1909. JLE endL ; len <= 0- > EXIT
  1910. MOVSD XMM0, [EBX]
  1911. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  1912. MOVSD XMM1, [ECX]
  1913. ADD ECX, [EBP+rinc] ; INC(ladr, incl)
  1914. MULSD XMM0, XMM1 ;
  1915. MOVSD [EDX], XMM0
  1916. ADD EDX, [EBP+dinc] ; INC(radr, incr)
  1917. DEC EAX ; DEC(len)
  1918. JMP singlepieces ;
  1919. endL:
  1920. END EMulAXAXLoopSSE;
  1921. PROCEDURE EMulARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1922. CODE {SYSTEM.i386, SYSTEM.SSE2}
  1923. MOV EAX, [EBP+len] ;
  1924. CMP EAX, 0 ;
  1925. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  1926. MOV EBX, [EBP+ladr] ;
  1927. MOV ECX, [EBP+radr] ;
  1928. MOV EDX, [EBP+dadr] ;
  1929. ; check IF data are contiguous IN memory
1930. CMP [EBP+linc], 4 ; check left FOR continuity
1931. JNE single ; not continuous- > simplest method
1932. CMP [EBP+rinc], 4 ; check right FOR continuity
1933. JNE single ; not continuous- > simplest method
1934. CMP [EBP+dinc], 4 ; check destination FOR continuity
  1935. JNE single ; not continuous- > simplest method
  1936. ; check FOR alignment
  1937. MOV ESI, EBX ;
  1938. AND ESI, 3 ; ladr MOD 4
  1939. CMP ESI, 0 ; = 0- > 32 Bit alignment
  1940. JNE unaligned ; not 32 bit aligned
  1941. MOV ESI, ECX ;
  1942. AND ESI, 3 ; radr MOD 4
  1943. CMP ESI, 0 ; = 0- > 32 Bit alignment
  1944. JNE unaligned ; not 32 bit aligned
  1945. MOV ESI, EDX ;
  1946. AND ESI, 3 ; dadr MOD 4
  1947. CMP ESI, 0 ; = 0- > 32 Bit alignment
  1948. JNE unaligned ; not 32 bit aligned
  1949. MOV ESI, EBX ;
  1950. AND ESI, 8+4 ; 16 byte alignment?
  1951. MOV EDI, ECX ;
  1952. AND EDI, 8+4 ; 16 byte alignment?
  1953. CMP ESI, EDI ;
  1954. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and radr
  1955. MOV EDI, EDX ;
  1956. AND EDI, 8+4 ; 16 byte alignment
  1957. CMP ESI, EDI ;
  1958. JNE unaligned ; different 16 byte = 128 bit alignment OF dadr and radr
  1959. CMP ESI, 0 ;
  1960. JE aligned ; already aligned
  1961. align:
1962. ; one single element processing UNTIL 128 bit alignment achieved
  1963. MOVSS XMM1, [EBX] ;
  1964. MOVSS XMM0, [ECX] ;
  1965. MULSS XMM0, XMM1 ;
  1966. MOVSS [EDX], XMM0 ;
  1967. ADD EBX, 4 ;
  1968. ADD ECX, 4 ;
  1969. ADD EDX, 4 ;
  1970. DEC EAX ; one element has been processed ;
  1971. CMP EAX, 0 ; all elements already processed?
  1972. JLE single ;
  1973. MOV ESI, EBX ;
  1974. AND ESI, 8+4 ;
  1975. CMP ESI, 0 ;
  1976. JNE align ;
  1977. aligned:
  1978. aligned16:
  1979. CMP EAX, 16 ;
  1980. JL aligned4 ; len < 16- > EXIT TO singlepieces
  1981. MOVAPS XMM0, [EBX] ;
  1982. MOVAPS XMM1, [EBX+16] ;
  1983. MOVAPS XMM2, [EBX+32] ;
  1984. MOVAPS XMM3, [EBX+48] ;
  1985. ADD EBX, 64 ;
  1986. MOVAPS XMM4, [ECX] ;
  1987. MOVAPS XMM5, [ECX+16] ;
  1988. MOVAPS XMM6, [ECX+32] ;
  1989. MOVAPS XMM7, [ECX+48] ;
  1990. ADD ECX, 64 ;
  1991. MULPS XMM0, XMM4 ;
  1992. MULPS XMM1, XMM5 ;
  1993. MULPS XMM2, XMM6 ;
  1994. MULPS XMM3, XMM7 ;
  1995. MOVAPS [EDX], XMM0 ;
  1996. MOVAPS [EDX+16], XMM1 ;
  1997. MOVAPS [EDX+32], XMM2 ;
  1998. MOVAPS [EDX+48], XMM3 ;
  1999. ADD EDX, 64 ;
  2000. SUB EAX, 16 ;
  2001. JMP aligned16 ;
  2002. ; LOOP FOR 2 pieces aligned
  2003. aligned4: ;
  2004. CMP EAX, 4 ;
  2005. JL singlepieces ; len < 2- > EXIT TO singlepieces
  2006. MOVAPS XMM0, [EBX] ;
  2007. ADD EBX, 16 ;
  2008. MOVAPS XMM1, [ECX] ;
  2009. ADD ECX, 16 ;
  2010. MULPS XMM0, XMM1 ;
  2011. MOVAPS [EDX], XMM0 ;
  2012. ADD EDX, 16 ;
  2013. SUB EAX, 4 ;
  2014. JMP aligned4 ;
  2015. ; LOOP FOR 8 unaligned pieces(14 pieces not better!)
  2016. unaligned: ;
  2017. unaligned16: ;
  2018. CMP EAX, 16 ;
  2019. JL unaligned4 ; len < 4- > EXIT TO singlepieces
  2020. MOVUPS XMM0, [EBX] ;
  2021. MOVUPS XMM1, [EBX+16] ;
  2022. MOVUPS XMM2, [EBX+32] ;
  2023. MOVUPS XMM3, [EBX+48] ;
  2024. ADD EBX, 64 ;
  2025. MOVUPS XMM4, [ECX] ;
  2026. MOVUPS XMM5, [ECX+16] ;
  2027. MOVUPS XMM6, [ECX+32] ;
  2028. MOVUPS XMM7, [ECX+48] ;
  2029. ADD ECX, 64 ;
  2030. MULPS XMM0, XMM4 ;
  2031. MULPS XMM1, XMM5 ;
  2032. MULPS XMM2, XMM6 ;
  2033. MULPS XMM3, XMM7 ;
  2034. MOVUPS [EDX], XMM0 ;
  2035. MOVUPS [EDX+16], XMM1 ;
  2036. MOVUPS [EDX+32], XMM2 ;
  2037. MOVUPS [EDX+48], XMM3 ;
  2038. ADD EDX, 64 ;
  2039. SUB EAX, 16 ;
  2040. JMP unaligned16 ;
  2041. ; LOOP FOR 2 pieces aligned
  2042. unaligned4: ;
  2043. CMP EAX, 4 ;
  2044. JL singlepieces ; len < 2- > EXIT TO singlepieces
  2045. MOVUPS XMM0, [EBX] ;
  2046. ADD EBX, 16 ;
  2047. MOVUPS XMM1, [ECX] ;
  2048. ADD ECX, 16 ;
  2049. MULPS XMM0, XMM1 ;
  2050. MOVUPS [EDX], XMM0 ;
  2051. ADD EDX, 16 ;
  2052. SUB EAX, 4 ;
  2053. JMP unaligned4 ;
  2054. ; one piece left OR non-contiguous data
  2055. single:
  2056. singlepieces: ;
  2057. CMP EAX, 0 ;
  2058. JLE endL ; len <= 0- > EXIT
  2059. MOVSS XMM0, [EBX]
  2060. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  2061. MOVSS XMM1, [ECX]
  2062. ADD ECX, [EBP+rinc] ; INC(ladr, incl)
  2063. MULSS XMM0, XMM1 ;
  2064. MOVSS [EDX], XMM0
  2065. ADD EDX, [EBP+dinc] ; INC(radr, incr)
  2066. DEC EAX ; DEC(len)
  2067. JMP singlepieces ;
  2068. endL:
  2069. END EMulARARLoopSSE;
  2070. PROCEDURE SPAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  2071. CODE {SYSTEM.i386, SYSTEM.FPU}
  2072. MOV EAX, [EBP+len] ; eax := len
  2073. MOV EBX, [EBP+ladr] ; ebx := ladr
  2074. MOV ECX, [EBP+radr] ; ecx := radr
  2075. MOV EDX, [EBP+dadr] ; edx := dadr
  2076. FLD QWORD [EDX] ; S.GET(dadr, x)
  2077. start:
  2078. CMP EAX, 0 ; WHILE len > 0 DO
  2079. JLE endL
  2080. FLD QWORD [EBX] ; S.GET(ladr, x)
  2081. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  2082. FLD QWORD [ECX] ; S.GET(ladr, y)
  2083. FMULP ; x := x*y
  2084. ADD ECX, [EBP+rinc] ; INC(radr, incr)
  2085. FADDP ; z := z+x
  2086. DEC EAX ; DEC(len)
  2087. JMP start ;
  2088. endL:
  2089. FSTP QWORD [EDX] ; S.PUT(dadr, x)
  2090. FWAIT ;
  2091. END SPAXAXLoopA;
  2092. PROCEDURE SPARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  2093. CODE {SYSTEM.i386, SYSTEM.FPU}
  2094. MOV EAX, [EBP+len] ; eax := len
  2095. MOV EBX, [EBP+ladr] ; ebx := ladr
  2096. MOV ECX, [EBP+radr] ; ecx := radr
  2097. MOV EDX, [EBP+dadr] ; edx := dadr
  2098. FLD DWORD [EDX] ; S.GET(dadr, x)
  2099. start:
  2100. CMP EAX, 0 ; WHILE len > 0 DO
  2101. JLE endL
  2102. FLD DWORD [EBX] ; S.GET(ladr, x)
  2103. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  2104. FLD DWORD [ECX] ; S.GET(ladr, y)
  2105. FMULP ; x := x*y
  2106. ADD ECX, [EBP+rinc] ; INC(radr, incr)
  2107. FADDP ; z := z+x
  2108. DEC EAX ; DEC(len)
  2109. JMP start ;
  2110. endL:
  2111. FSTP DWORD [EDX] ; S.PUT(dadr, x)
  2112. FWAIT ;
  2113. END SPARARLoopA;
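(* The SP*Loop* procedures accumulate a scalar (dot) product into the destination:
   d := d + sum over i of l[i]*r[i]. Oberon sketch (hypothetical helper, illustration only;
   assumes S = SYSTEM):

	PROCEDURE SPARARSketch( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
	VAR x, y, z: REAL;
	BEGIN
		S.GET( dadr, z );
		WHILE len > 0 DO
			S.GET( ladr, x ); S.GET( radr, y ); z := z + x*y;
			INC( ladr, linc ); INC( radr, rinc ); DEC( len )
		END;
		S.PUT( dadr, z )
	END SPARARSketch;

   Note that the SSE variants below accumulate two resp. four partial sums and combine them at
   the end (horizontaladd), so rounding may differ slightly from the sequential FPU order. *)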
  2114. (* sse version of scalar product *)
  2115. PROCEDURE SPAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  2116. CODE {SYSTEM.i386, SYSTEM.SSE2}
  2117. ; register initialization
2118. MOV EAX, [EBP+len] ; EAX reserved FOR length
  2119. CMP EAX, 0 ;
  2120. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  2121. MOV EBX, [EBP+ladr] ; EBX reserved FOR ladr
  2122. MOV ECX, [EBP+radr] ; ECX reserved FOR radr
  2123. MOV EDX, [EBP+dadr] ; EDX reserved FOR dadr
  2124. XORPD XMM0, XMM0 ;
  2125. MOVSD XMM0, [EDX] ; destination- > low bytes OF xmm0
2126. CMP [EBP+linc], 8 ; check left FOR continuity
2127. JNE single ; not continuous- > simplest method
2128. CMP [EBP+rinc], 8 ; check right FOR continuity
  2129. JNE single ; not continuous- > simplest method
  2130. ; check FOR alignment
  2131. MOV ESI, EBX ;
  2132. AND ESI, 7 ; ladr MOD 8
2133. CMP ESI, 0 ; = 0- > 64 Bit alignment
  2134. JNE unaligned ; not 64 bit aligned
  2135. MOV ESI, ECX ;
  2136. AND ESI, 7 ; radr MOD 8
  2137. CMP ESI, 0 ; = 0- > 64 Bit alignment
  2138. JNE unaligned ; not 64 bit aligned
  2139. MOV ESI, EBX ;
  2140. AND ESI, 8 ; 16 byte alignment
  2141. MOV EDI, ECX ;
  2142. AND EDI, 8 ; 16 byte alignment
  2143. CMP ESI, EDI ;
  2144. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  2145. CMP ESI, 8 ;
  2146. JNE aligned ; ladr and dadr already 128 bit aligned
2147. ; one single element processing TO achieve 128 bit alignment
  2148. MOVSD XMM1, [EBX] ;
  2149. MOVSD XMM2, [ECX] ;
  2150. MULSD XMM1, XMM2 ;
  2151. ADDSD XMM0, XMM1 ;
  2152. ADD EBX, 8 ; now EBX IS 16 byte aligned
2153. ADD ECX, 8 ; now ECX IS 16 byte aligned
  2154. DEC EAX ; one element has been processed
  2155. ; LOOP FOR 4 pieces aligned
  2156. aligned:
  2157. aligned6:
  2158. CMP EAX, 6 ;
2159. JL aligned2 ; len < 6- > continue with pairs
  2160. MOVAPD XMM1, [EBX] ;
  2161. MOVAPD XMM2, [EBX+16] ;
  2162. MOVAPD XMM3, [EBX+32] ;
  2163. MOVAPD XMM4, [ECX] ;
  2164. MOVAPD XMM5, [ECX+16] ;
  2165. MOVAPD XMM6, [ECX+32] ;
  2166. MULPD XMM1, XMM4 ;
  2167. ADDPD XMM0, XMM1 ;
  2168. MULPD XMM2, XMM5 ;
  2169. ADDPD XMM0, XMM2 ;
  2170. MULPD XMM3, XMM6 ;
  2171. ADDPD XMM0, XMM3 ;
  2172. ADD EBX, 48 ;
  2173. ADD ECX, 48 ;
  2174. SUB EAX, 6 ;
  2175. JMP aligned6 ;
  2176. ; LOOP FOR 2 pieces aligned
  2177. aligned2:
  2178. CMP EAX, 2 ;
2179. JL horizontaladd ; len < 2- > horizontal add
  2180. MOVAPD XMM1, [EBX] ;
  2181. MOVAPD XMM2, [ECX] ;
  2182. MULPD XMM1, XMM2 ;
  2183. ADDPD XMM0, XMM1 ;
  2184. ADD EBX, 16 ;
  2185. ADD ECX, 16 ;
  2186. SUB EAX, 2 ;
  2187. JMP aligned2 ;
  2188. unaligned:
  2189. unaligned6:
  2190. CMP EAX, 6 ;
2191. JL unaligned2 ; len < 6- > continue with pairs
  2192. MOVUPD XMM1, [EBX] ;
  2193. MOVUPD XMM2, [EBX+16] ;
  2194. MOVUPD XMM3, [EBX+32] ;
  2195. MOVUPD XMM4, [ECX] ;
  2196. MOVUPD XMM5, [ECX+16] ;
  2197. MOVUPD XMM6, [ECX+32] ;
  2198. MULPD XMM1, XMM4 ;
  2199. ADDPD XMM0, XMM1 ;
  2200. MULPD XMM2, XMM5 ;
  2201. ADDPD XMM0, XMM2 ;
  2202. MULPD XMM3, XMM6 ;
  2203. ADDPD XMM0, XMM3 ;
  2204. ADD EBX, 48 ;
  2205. ADD ECX, 48 ;
  2206. SUB EAX, 6 ;
  2207. JMP unaligned6 ;
  2208. ; LOOP FOR 2 pieces aligned
  2209. unaligned2:
  2210. CMP EAX, 2 ;
2211. JL horizontaladd ; len < 2- > horizontal add
  2212. MOVUPD XMM1, [EBX] ;
  2213. MOVUPD XMM2, [ECX] ;
  2214. MULPD XMM1, XMM2 ;
  2215. ADDPD XMM0, XMM1 ;
  2216. ADD EBX, 16 ;
  2217. ADD ECX, 16 ;
  2218. SUB EAX, 2 ;
  2219. JMP unaligned2 ;
  2220. horizontaladd: ;
  2221. MOVAPD XMM1, XMM0 ;
2222. SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
  2223. ADDPD XMM0, XMM1 ;
  2224. JMP singlepieces ;
  2225. single:
  2226. singlepieces: ;
  2227. CMP EAX, 0 ;
  2228. JLE store ; len <= 0- > EXIT
  2229. MOVSD XMM1, [EBX]
  2230. MOVSD XMM2, [ECX]
  2231. MULSD XMM1, XMM2
  2232. ADDSD XMM0, XMM1
  2233. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  2234. ADD ECX, [EBP+rinc] ; INC(radr, incr)
  2235. DEC EAX ; DEC(len)
  2236. JMP singlepieces ;
  2237. store:
  2238. MOVSD [EDX], XMM0 ;
  2239. endL:
  2240. END SPAXAXLoopSSE;
  2241. (* sse version of scalar product *)
  2242. PROCEDURE SPARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  2243. CODE {SYSTEM.i386, SYSTEM.SSE}
  2244. ; register initialization
2245. MOV EAX, [EBP+len] ; EAX reserved FOR length
  2246. CMP EAX, 0 ;
  2247. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  2248. MOV EBX, [EBP+ladr] ; EBX reserved FOR ladr
  2249. MOV ECX, [EBP+radr] ; ECX reserved FOR radr
  2250. MOV EDX, [EBP+dadr] ; EDX reserved FOR dadr
  2251. XORPS XMM0, XMM0 ;
  2252. MOVSS XMM0, [EDX] ; destination- > low bytes OF xmm0
2253. CMP [EBP+linc], 4 ; check left FOR continuity
2254. JNE single ; not continuous- > simplest method
2255. CMP [EBP+rinc], 4 ; check right FOR continuity
  2256. JNE single ; not continuous- > simplest method
  2257. ; check FOR alignment
  2258. MOV ESI, EBX ;
  2259. AND ESI, 3 ; ladr MOD 4
2260. CMP ESI, 0 ; = 0- > 32 Bit alignment
  2261. JNE unaligned ; not 32 bit aligned
  2262. MOV ESI, ECX ;
  2263. AND ESI, 3 ; radr MOD 4
  2264. CMP ESI, 0 ; = 0- > 32 Bit alignment
  2265. JNE unaligned ; not 32 bit aligned
  2266. MOV ESI, EBX ;
  2267. AND ESI, 8+4 ; 16 byte alignment
  2268. MOV EDI, ECX ;
  2269. AND EDI, 8+4 ; 16 byte alignment
  2270. CMP ESI, EDI ;
  2271. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  2272. CMP ESI, 0 ;
  2273. JE aligned ; already aligned
  2274. align:
2275. ; one single element processing UNTIL 128 bit alignment achieved
  2276. MOVSS XMM1, [EBX] ;
  2277. MOVSS XMM2, [ECX] ;
  2278. MULSS XMM1, XMM2 ;
  2279. ADDSS XMM0, XMM1 ;
  2280. ADD EBX, 4 ;
  2281. ADD ECX, 4 ;
  2282. DEC EAX ; one element has been processed ;
  2283. CMP EAX, 0 ; all elements already processed?
  2284. JLE single ;
  2285. MOV ESI, EBX ;
  2286. AND ESI, 8+4 ;
  2287. CMP ESI, 0 ;
  2288. JNE align ;
  2289. aligned:
  2290. aligned12:
  2291. CMP EAX, 12 ;
2292. JL aligned4 ; len < 12- > continue with quadruples
  2293. MOVAPS XMM1, [EBX] ;
  2294. MOVAPS XMM2, [EBX+16] ;
  2295. MOVAPS XMM3, [EBX+32] ;
  2296. MOVAPS XMM4, [ECX] ;
  2297. MOVAPS XMM5, [ECX+16] ;
  2298. MOVAPS XMM6, [ECX+32] ;
  2299. MULPS XMM1, XMM4 ;
  2300. ADDPS XMM0, XMM1 ;
  2301. MULPS XMM2, XMM5 ;
  2302. ADDPS XMM0, XMM2 ;
  2303. MULPS XMM3, XMM6 ;
  2304. ADDPS XMM0, XMM3 ;
  2305. ADD EBX, 48 ;
  2306. ADD ECX, 48 ;
  2307. SUB EAX, 12 ;
  2308. JMP aligned12 ;
  2309. ; LOOP FOR 2 pieces aligned
  2310. aligned4:
  2311. CMP EAX, 4 ;
  2312. JL horizontaladd ; ; len < 4- > EXIT TO singlepieces
  2313. MOVAPS XMM1, [EBX] ;
  2314. MOVAPS XMM2, [ECX] ;
  2315. MULPS XMM1, XMM2 ;
  2316. ADDPS XMM0, XMM1 ;
  2317. ADD EBX, 16 ;
  2318. ADD ECX, 16 ;
  2319. SUB EAX, 4 ;
  2320. JMP aligned4 ;
  2321. unaligned:
  2322. unaligned12:
  2323. CMP EAX, 12 ;
2324. JL unaligned4 ; len < 12- > continue with quadruples
  2325. MOVUPS XMM1, [EBX] ;
  2326. MOVUPS XMM2, [EBX+16] ;
  2327. MOVUPS XMM3, [EBX+32] ;
  2328. MOVUPS XMM4, [ECX] ;
  2329. MOVUPS XMM5, [ECX+16] ;
  2330. MOVUPS XMM6, [ECX+32] ;
  2331. MULPS XMM1, XMM4 ;
  2332. ADDPS XMM0, XMM1 ;
  2333. MULPS XMM2, XMM5 ;
  2334. ADDPS XMM0, XMM2 ;
  2335. MULPS XMM3, XMM6 ;
  2336. ADDPS XMM0, XMM3 ;
  2337. ADD EBX, 48 ;
  2338. ADD ECX, 48 ;
  2339. SUB EAX, 12 ;
  2340. JMP unaligned12 ;
  2341. ; LOOP FOR 2 pieces aligned
  2342. unaligned4:
  2343. CMP EAX, 4 ;
  2344. JL horizontaladd ; ; len < 4- > EXIT TO singlepieces
  2345. MOVUPS XMM1, [EBX] ;
  2346. MOVUPS XMM2, [ECX] ;
  2347. MULPS XMM1, XMM2 ;
  2348. ADDPS XMM0, XMM1 ;
  2349. ADD EBX, 16 ;
  2350. ADD ECX, 16 ;
  2351. SUB EAX, 4 ;
  2352. JMP unaligned4 ;
  2353. horizontaladd: ;
  2354. MOVAPS XMM1, XMM0 ;
  2355. ; 1*0 (* dest 0 -> dest 0 *) +4*1 (* dest 1 -> dest 1 *) +16*0 (* src 0 -> dest 2 *) +64*1 (* src 1 -> dest 3 *)
  2356. SHUFPS XMM1, XMM1, 1*0 +4*1 +16*0 +64*1
  2357. ADDPS XMM1, XMM0 ;
  2358. MOVAPS XMM0, XMM1
  2359. SHUFPS XMM0, XMM0, 16*3 ; src 3- > dest 2
  2360. ADDPS XMM0, XMM1 ;
  2361. SHUFPS XMM0, XMM0, 1*2 ; dest 2- > dest 0
  2362. JMP singlepieces ;
  2363. single:
  2364. singlepieces: ;
  2365. CMP EAX, 0 ;
  2366. JLE store ; len <= 0- > EXIT
  2367. MOVSS XMM1, [EBX]
  2368. MOVSS XMM2, [ECX]
  2369. MULSS XMM1, XMM2
  2370. ADDSS XMM0, XMM1
  2371. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  2372. ADD ECX, [EBP+rinc] ; INC(radr, incr)
  2373. DEC EAX ; DEC(len)
  2374. JMP singlepieces ;
  2375. store:
  2376. MOVSS [EDX], XMM0 ;
  2377. endL:
  2378. END SPARARLoopSSE;
  2379. PROCEDURE MulAXSXLoopA( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2380. CODE {SYSTEM.i386, SYSTEM.FPU}
  2381. MOV EAX, [EBP+len] ; eax := len
  2382. MOV EBX, [EBP+ladr] ; ebx := ladr
  2383. MOV ECX, [EBP+radr] ; ecx := radr
  2384. MOV EDX, [EBP+dadr] ; edx := dadr
  2385. start:
  2386. CMP EAX, 0 ; WHILE len > 0 DO
  2387. JLE endL
  2388. FLD QWORD [EBX] ; S.GET(ladr, x)
  2389. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  2390. FLD QWORD [ECX] ; S.GET(ladr, y)
  2391. FMULP ; x := x*y
  2392. FSTP QWORD [EDX]
  2393. ADD EDX, [EBP+dinc] ; INC(radr, incr)
  2394. DEC EAX ; DEC(len)
  2395. JMP start ;
  2396. endL:
  2397. FWAIT ;
  2398. END MulAXSXLoopA;
  2399. PROCEDURE MulARSRLoopA( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2400. CODE {SYSTEM.i386, SYSTEM.FPU}
  2401. MOV EAX, [EBP+len] ; eax := len
  2402. MOV EBX, [EBP+ladr] ; ebx := ladr
  2403. MOV ECX, [EBP+radr] ; ecx := radr
  2404. MOV EDX, [EBP+dadr] ; edx := dadr
  2405. start:
  2406. CMP EAX, 0 ; WHILE len > 0 DO
  2407. JLE endL
  2408. FLD DWORD [EBX] ; S.GET(ladr, x)
  2409. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  2410. FLD DWORD [ECX] ; S.GET(ladr, y)
  2411. FMULP ; x := x*y
  2412. FSTP DWORD [EDX]
  2413. ADD EDX, [EBP+dinc] ; INC(radr, incr)
  2414. DEC EAX ; DEC(len)
  2415. JMP start ;
  2416. endL:
  2417. FWAIT ;
  2418. END MulARSRLoopA;
  2419. PROCEDURE IncMulAXSXLoopA( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2420. CODE {SYSTEM.i386, SYSTEM.FPU}
  2421. MOV EAX, [EBP+len] ; eax := len
  2422. MOV EBX, [EBP+ladr] ; ebx := ladr
  2423. MOV ECX, [EBP+radr] ; ecx := radr
  2424. MOV EDX, [EBP+dadr] ; edx := dadr
  2425. start:
  2426. CMP EAX, 0 ; WHILE len > 0 DO
  2427. JLE endL
  2428. FLD QWORD [EBX] ; S.GET(ladr, x)
  2429. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  2430. FLD QWORD [ECX] ; S.GET(ladr, y)
  2431. FMULP ; x := x*y
2432. FLD QWORD [EDX] ; S.GET(dadr, z)
  2433. FADDP ;
  2434. FSTP QWORD [EDX]
  2435. ADD EDX, [EBP+dinc] ; INC(radr, incr)
  2436. DEC EAX ; DEC(len)
  2437. JMP start ;
  2438. endL:
  2439. FWAIT ;
  2440. END IncMulAXSXLoopA;
  2441. PROCEDURE IncMulARSRLoopA( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2442. CODE {SYSTEM.i386, SYSTEM.FPU}
  2443. MOV EAX, [EBP+len] ; eax := len
  2444. MOV EBX, [EBP+ladr] ; ebx := ladr
  2445. MOV ECX, [EBP+radr] ; ecx := radr
  2446. MOV EDX, [EBP+dadr] ; edx := dadr
  2447. start:
  2448. CMP EAX, 0 ; WHILE len > 0 DO
  2449. JLE endL
  2450. FLD DWORD [EBX] ; S.GET(ladr, x)
  2451. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  2452. FLD DWORD [ECX] ; S.GET(ladr, y)
  2453. FMULP ; x := x*y
2454. FLD DWORD [EDX] ; S.GET(dadr, z)
  2455. FADDP ;
  2456. FSTP DWORD [EDX]
  2457. ADD EDX, [EBP+dinc] ; INC(radr, incr)
  2458. DEC EAX ; DEC(len)
  2459. JMP start ;
  2460. endL:
  2461. FWAIT ;
  2462. END IncMulARSRLoopA;
  2463. PROCEDURE MulAXSXLoopSSE( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
2464. (* multiplies an array by a scalar (d[i] := l[i]*s); checks for alignment; possible further optimization: use all 8 XMM registers in parallel *)
2465. (*
2466. 1.) check for same 128 bit alignment of ladr and dadr (ladr MOD 16 = dadr MOD 16)
  2467. 2.) process starting unaligned data ( using single instructions)
  2468. 3.) process aligned data
  2469. 4.) process remaining unaligned data (using single instructions)
  2470. *)
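(* Worked example for step 1: with 8 byte elements, ladr MOD 16 = 0 and dadr MOD 16 = 8 can
   never be brought to a common 16 byte boundary, so the unaligned (MOVUPD) path below is used;
   if both addresses are congruent 8 MOD 16, one scalar element is processed first and the rest
   runs on the aligned (MOVAPD) path. *)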
  2471. CODE {SYSTEM.i386, SYSTEM.SSE2}
  2472. ; register initialization
2473. MOV EAX, [EBP+len] ; EAX reserved FOR length
  2474. CMP EAX, 0 ;
  2475. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  2476. MOV EBX, [EBP+ladr] ; EBX reserved FOR ladr
  2477. MOV EDX, [EBP+dadr] ; EDX reserved FOR dadr
  2478. MOV ECX, [EBP+radr] ;
  2479. MOVSD XMM0, [ECX] ;
  2480. SHUFPD XMM0, XMM0, 0 ; high bits := low bits
  2481. ; check IF data are contiguous IN memory
2482. CMP [EBP+linc], 8 ; check left FOR continuity
  2483. JNE single ; not continuous- > simplest method
  2484. CMP [EBP+dinc], 8 ; check dest FOR continuity
  2485. JNE single ; not continuous- > simplest method
  2486. ; check FOR alignment
  2487. MOV ECX, EBX ;
  2488. AND ECX, 7 ; ladr MOD 8
  2489. CMP ECX, 0 ; ECX = 0- > 64 Bit alignment
  2490. JNE unaligned ; not 64 bit aligned
  2491. MOV ECX, EDX ;
  2492. AND ECX, 7 ; dadr MOD 8
  2493. CMP ECX, 0 ; ECX = 0- > 64 Bit alignment
  2494. JNE unaligned ; not 64 bit aligned
  2495. MOV ESI, EBX ;
  2496. AND ESI, 8 ; 16 byte alignment
  2497. MOV EDI, EDX ;
  2498. AND EDI, 8 ; 16 byte alignment
  2499. CMP ESI, EDI ;
  2500. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  2501. CMP ESI, 8 ;
  2502. JNE aligned ; ladr and dadr already 128 bit aligned
2503. ; one single element processing TO achieve 128 bit alignment
  2504. MOVSD XMM1, [EBX] ;
  2505. MULSD XMM1, XMM0 ;
  2506. MOVSD [EDX], XMM1 ;
  2507. ADD EBX, 8 ; now EBX IS 16 byte aligned
  2508. ADD EDX, 8 ; now EDX IS 16 byte aligned ;
  2509. DEC EAX ; one element has been processed
  2510. ; LOOP FOR 8 pieces aligned(no better performance with 14 pieces!)
  2511. aligned:
  2512. aligned8:
  2513. CMP EAX, 8 ;
2514. JL aligned2 ; len < 8- > continue with pairs
  2515. MOVAPD XMM1, [EBX] ;
  2516. MOVAPD XMM2, [EBX+16] ;
  2517. MOVAPD XMM3, [EBX+32] ;
  2518. MOVAPD XMM4, [EBX+48] ;
  2519. ADD EBX, 64 ;
  2520. MULPD XMM1, XMM0 ;
  2521. MULPD XMM2, XMM0 ;
  2522. MULPD XMM3, XMM0 ;
  2523. MULPD XMM4, XMM0 ;
  2524. MOVAPD [EDX], XMM1 ;
  2525. MOVAPD [EDX+16], XMM2 ;
  2526. MOVAPD [EDX+32], XMM3 ;
  2527. MOVAPD [EDX+48], XMM4 ;
  2528. ADD EDX, 64 ;
  2529. SUB EAX, 8 ;
  2530. JMP aligned8 ;
  2531. ; LOOP FOR 2 pieces aligned
  2532. aligned2: ;
  2533. CMP EAX, 2 ;
  2534. JL singlepieces ; len < 2- > EXIT TO singlepieces
  2535. MOVAPD XMM1, [EBX] ;
  2536. ADD EBX, 16 ;
  2537. MULPD XMM1, XMM0 ;
  2538. MOVAPD [EDX], XMM1 ;
  2539. ADD EDX, 16 ;
  2540. SUB EAX, 2 ;
  2541. JMP aligned2 ;
  2542. ; LOOP FOR 8 unaligned pieces(14 pieces not better!)
  2543. unaligned: ;
  2544. unaligned8: ;
  2545. CMP EAX, 8 ;
2546. JL unaligned2 ; len < 8- > continue with pairs
  2547. MOVUPD XMM1, [EBX] ;
  2548. MOVUPD XMM2, [EBX+16] ;
  2549. MOVUPD XMM3, [EBX+32] ;
  2550. MOVUPD XMM4, [EBX+48] ;
  2551. ADD EBX, 64
  2552. MULPD XMM1, XMM0 ;
  2553. MULPD XMM2, XMM0 ;
  2554. MULPD XMM3, XMM0 ;
  2555. MULPD XMM4, XMM0 ;
  2556. MOVUPD [EDX], XMM1 ;
  2557. MOVUPD [EDX+16], XMM2 ;
  2558. MOVUPD [EDX+32], XMM3 ;
  2559. MOVUPD [EDX+48], XMM4 ;
  2560. ADD EDX, 64 ;
  2561. SUB EAX, 8 ;
  2562. JMP unaligned8 ;
  2563. ; LOOP FOR 2 pieces unaligned
  2564. unaligned2: ;
  2565. CMP EAX, 2 ;
  2566. JL singlepieces ; len < 2- > EXIT
  2567. MOVUPD XMM1, [EBX] ;
  2568. ADD EBX, 16 ;
  2569. MULPD XMM1, XMM0 ;
  2570. MOVUPD [EDX], XMM1 ;
  2571. ADD EDX, 16 ;
  2572. SUB EAX, 2 ;
  2573. JMP unaligned2 ;
  2574. ; one piece left OR non-contiguous data
  2575. single:
  2576. singlepieces: ;
  2577. CMP EAX, 0 ;
  2578. JLE endL ; len <= 0- > EXIT
  2579. MOVSD XMM1, [EBX]
  2580. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  2581. MULSD XMM1, XMM0
  2582. MOVSD [EDX], XMM1
  2583. ADD EDX, [EBP+dinc] ; INC(radr, incr)
  2584. DEC EAX ; DEC(len)
  2585. JMP singlepieces ;
  2586. endL:
  2587. END MulAXSXLoopSSE;
  2588. PROCEDURE MulARSRLoopSSE( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
2589. (* multiplies an array by a scalar (d[i] := l[i]*s); checks for alignment; possible further optimization: use all 8 XMM registers in parallel *)
2590. (*
2591. 1.) check for same 128 bit alignment of ladr and dadr (ladr MOD 16 = dadr MOD 16)
  2592. 2.) process starting unaligned data ( using single instructions)
  2593. 3.) process aligned data
  2594. 4.) process remaining unaligned data (using single instructions)
  2595. *)
  2596. CODE {SYSTEM.i386, SYSTEM.SSE}
  2597. ; register initialization
2598. MOV EAX, [EBP+len] ; EAX reserved FOR length
  2599. CMP EAX, 0 ;
  2600. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  2601. MOV EBX, [EBP+ladr] ; EBX reserved FOR ladr
  2602. MOV EDX, [EBP+dadr] ; EDX reserved FOR dadr
  2603. MOV ECX, [EBP+radr] ;
  2604. MOVSS XMM0, [ECX] ;
2605. SHUFPS XMM0, XMM0, 0 ; all positions now carry the same value
  2606. ; check IF data are contiguous IN memory
2607. CMP [EBP+linc], 4 ; check left FOR continuity
  2608. JNE single ; not continuous- > simplest method
  2609. CMP [EBP+dinc], 4 ; check dest FOR continuity
  2610. JNE single ; not continuous- > simplest method
  2611. ; check FOR alignment
  2612. MOV ECX, EBX ;
  2613. AND ECX, 3 ; ladr MOD 4
  2614. CMP ECX, 0 ; ECX = 0- > 32 Bit alignment
  2615. JNE unaligned ; not 32 bit aligned
  2616. MOV ECX, EDX ;
  2617. AND ECX, 3 ; dadr MOD 4
  2618. CMP ECX, 0 ; ECX = 0- > 32 Bit alignment
2619. JNE unaligned ; not 32 bit aligned
  2620. MOV ESI, EBX ;
  2621. AND ESI, 8+4 ; 16 byte alignment
  2622. MOV EDI, EDX ;
  2623. AND EDI, 8+4 ; 16 byte alignment
  2624. CMP ESI, EDI ;
  2625. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  2626. CMP ESI, 0 ;
  2627. JE aligned ; already aligned
  2628. align:
2629. ; one single element processing UNTIL 128 bit alignment achieved
  2630. MOVSS XMM1, [EBX] ;
  2631. MULSS XMM1, XMM0 ;
  2632. MOVSS [EDX], XMM1 ;
  2633. ADD EBX, 4 ;
  2634. ADD EDX, 4 ;
  2635. DEC EAX ; one element has been processed ;
  2636. CMP EAX, 0 ; all elements already processed?
  2637. JLE single
  2638. MOV ESI, EBX ;
  2639. AND ESI, 8+4 ;
  2640. CMP ESI, 0 ;
  2641. JNE align ;
  2642. aligned:
  2643. aligned16:
  2644. CMP EAX, 16 ;
  2645. JL aligned4 ; len < 4- > EXIT TO singlepieces
  2646. MOVAPS XMM1, [EBX] ;
  2647. MOVAPS XMM2, [EBX+16] ;
  2648. MOVAPS XMM3, [EBX+32] ;
  2649. MOVAPS XMM4, [EBX+48] ;
  2650. ADD EBX, 64 ;
  2651. MULPS XMM1, XMM0 ;
  2652. MULPS XMM2, XMM0 ;
  2653. MULPS XMM3, XMM0 ;
  2654. MULPS XMM4, XMM0 ;
  2655. MOVAPS [EDX], XMM1 ;
  2656. MOVAPS [EDX+16], XMM2 ;
  2657. MOVAPS [EDX+32], XMM3 ;
  2658. MOVAPS [EDX+48], XMM4 ;
  2659. ADD EDX, 64 ;
  2660. SUB EAX, 16 ;
  2661. JMP aligned16 ;
  2662. ; LOOP FOR 2 pieces aligned
  2663. aligned4: ;
  2664. CMP EAX, 4 ;
  2665. JL singlepieces ; len < 2- > EXIT TO singlepieces
  2666. MOVAPS XMM1, [EBX] ;
  2667. ADD EBX, 16 ;
  2668. MULPS XMM1, XMM0 ;
  2669. MOVAPS [EDX], XMM1 ;
  2670. ADD EDX, 16 ;
  2671. SUB EAX, 4 ;
  2672. JMP aligned4 ;
  2673. ; LOOP FOR 16 unaligned pieces(20 pieces not better!)
  2674. unaligned: ;
  2675. unaligned16: ;
  2676. CMP EAX, 16 ;
2677. JL unaligned4 ; len < 16- > continue with quadruples
  2678. MOVUPS XMM1, [EBX] ;
  2679. MOVUPS XMM2, [EBX+16] ;
  2680. MOVUPS XMM3, [EBX+32] ;
  2681. MOVUPS XMM4, [EBX+48] ;
  2682. ADD EBX, 64
  2683. MULPS XMM1, XMM0 ;
  2684. MULPS XMM2, XMM0 ;
  2685. MULPS XMM3, XMM0 ;
  2686. MULPS XMM4, XMM0 ;
  2687. MOVUPS [EDX], XMM1 ;
  2688. MOVUPS [EDX+16], XMM2 ;
  2689. MOVUPS [EDX+32], XMM3 ;
  2690. MOVUPS [EDX+48], XMM4 ;
  2691. ADD EDX, 64 ;
  2692. SUB EAX, 16 ;
  2693. JMP unaligned16 ;
  2694. ; LOOP FOR 2 pieces unaligned
  2695. unaligned4: ;
  2696. CMP EAX, 4 ;
  2697. JL singlepieces ; len < 2- > EXIT
  2698. MOVUPS XMM1, [EBX] ;
  2699. ADD EBX, 16 ;
  2700. MULPS XMM1, XMM0 ;
  2701. MOVUPS [EDX], XMM1 ;
  2702. ADD EDX, 16 ;
  2703. SUB EAX, 4 ;
  2704. JMP unaligned4 ;
  2705. ; one piece left OR non-contiguous data
  2706. single:
  2707. singlepieces: ;
  2708. CMP EAX, 0 ;
  2709. JLE endL ; len <= 0- > EXIT
  2710. MOVSS XMM1, [EBX]
  2711. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  2712. MULSS XMM1, XMM0
  2713. MOVSS [EDX], XMM1
  2714. ADD EDX, [EBP+dinc] ; INC(radr, incr)
  2715. DEC EAX ; DEC(len)
  2716. JMP singlepieces ;
  2717. endL:
  2718. END MulARSRLoopSSE;
  2719. PROCEDURE IncMulAXSXLoopSSE( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
2720. (* multiplies an array by a scalar and adds the result to the destination (d[i] := d[i] + l[i]*s); checks for alignment; possible further optimization: use all 8 XMM registers in parallel *)
2721. (*
2722. 1.) check for same 128 bit alignment of ladr and dadr (ladr MOD 16 = dadr MOD 16)
  2723. 2.) process starting unaligned data ( using single instructions)
  2724. 3.) process aligned data
  2725. 4.) process remaining unaligned data (using single instructions)
  2726. *)
  2727. CODE {SYSTEM.i386, SYSTEM.SSE2}
  2728. ; register initialization
2729. MOV EAX, [EBP+len] ; EAX reserved FOR length
  2730. CMP EAX, 0 ;
  2731. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  2732. MOV EBX, [EBP+ladr] ; EBX reserved FOR ladr
  2733. MOV EDX, [EBP+dadr] ; EDX reserved FOR dadr
  2734. MOV ECX, [EBP+radr] ;
  2735. MOVSD XMM0, [ECX] ;
  2736. SHUFPD XMM0, XMM0, 0 ; high bits := low bits
  2737. ; check IF data are contiguous IN memory
2738. CMP [EBP+linc], 8 ; check left FOR continuity
  2739. JNE single ; not continuous- > simplest method
  2740. CMP [EBP+dinc], 8 ; check dest FOR continuity
  2741. JNE single ; not continuous- > simplest method
  2742. ; check FOR alignment
  2743. MOV ECX, EBX ;
  2744. AND ECX, 7 ; ladr MOD 8
  2745. CMP ECX, 0 ; ECX = 0- > 64 Bit alignment
  2746. JNE unaligned ; not 64 bit aligned
  2747. MOV ECX, EDX ;
  2748. AND ECX, 7 ; dadr MOD 8
  2749. CMP ECX, 0 ; ECX = 0- > 64 Bit alignment
  2750. JNE unaligned ; not 64 bit aligned
  2751. MOV ESI, EBX ;
  2752. AND ESI, 8 ; 16 byte alignment
  2753. MOV EDI, EDX ;
  2754. AND EDI, 8 ; 16 byte alignment
  2755. CMP ESI, EDI ;
  2756. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  2757. CMP ESI, 8 ;
  2758. JNE aligned ; ladr and dadr already 128 bit aligned
2759. ; one single element processing TO achieve 128 bit alignment
  2760. MOVSD XMM1, [EBX] ;
  2761. MULSD XMM1, XMM0 ;
  2762. MOVSD XMM2, [EDX] ;
  2763. ADDSD XMM1, XMM2 ;
  2764. MOVSD [EDX], XMM1 ;
  2765. ADD EBX, 8 ; now EBX IS 16 byte aligned
  2766. ADD EDX, 8 ; now EDX IS 16 byte aligned ;
  2767. DEC EAX ; one element has been processed
  2768. ; LOOP FOR 8 pieces aligned(no better performance with 14 pieces!)
  2769. aligned:
  2770. aligned8:
  2771. CMP EAX, 8 ;
2772. JL aligned2 ; len < 8 -> continue WITH aligned2
  2773. MOVAPD XMM1, [EBX] ;
  2774. MOVAPD XMM2, [EBX+16] ;
  2775. MOVAPD XMM3, [EBX+32] ;
  2776. MOVAPD XMM4, [EBX+48] ;
  2777. ADD EBX, 64 ;
  2778. MULPD XMM1, XMM0 ;
  2779. MULPD XMM2, XMM0 ;
  2780. MULPD XMM3, XMM0 ;
  2781. MULPD XMM4, XMM0 ;
  2782. MOVAPD XMM5, [EDX] ;
  2783. ADDPD XMM1, XMM5
  2784. MOVAPD [EDX], XMM1 ;
  2785. MOVAPD XMM6, [EDX+16] ;
  2786. ADDPD XMM2, XMM6
  2787. MOVAPD [EDX+16], XMM2 ;
  2788. MOVAPD XMM7, [EDX+32] ;
  2789. ADDPD XMM3, XMM7
  2790. MOVAPD [EDX+32], XMM3 ;
  2791. MOVAPD XMM5, [EDX+48] ;
  2792. ADDPD XMM4, XMM5
  2793. MOVAPD [EDX+48], XMM4 ;
  2794. ADD EDX, 64 ;
  2795. SUB EAX, 8 ;
  2796. JMP aligned8 ;
  2797. ; LOOP FOR 2 pieces aligned
  2798. aligned2: ;
  2799. CMP EAX, 2 ;
  2800. JL singlepieces ; len < 2- > EXIT TO singlepieces
  2801. MOVAPD XMM1, [EBX] ;
  2802. ADD EBX, 16 ;
  2803. MULPD XMM1, XMM0 ;
  2804. MOVAPD XMM2, [EDX] ;
  2805. ADDPD XMM1, XMM2
  2806. MOVAPD [EDX], XMM1 ;
  2807. ADD EDX, 16 ;
  2808. SUB EAX, 2 ;
  2809. JMP aligned2 ;
  2810. ; LOOP FOR 8 unaligned pieces(14 pieces not better!)
  2811. unaligned: ;
  2812. unaligned8: ;
  2813. CMP EAX, 8 ;
2814. JL unaligned2 ; len < 8 -> continue WITH unaligned2
  2815. MOVUPD XMM1, [EBX] ;
  2816. MOVUPD XMM2, [EBX+16] ;
  2817. MOVUPD XMM3, [EBX+32] ;
  2818. MOVUPD XMM4, [EBX+48] ;
  2819. ADD EBX, 64
  2820. MULPD XMM1, XMM0 ;
  2821. MULPD XMM2, XMM0 ;
  2822. MULPD XMM3, XMM0 ;
  2823. MULPD XMM4, XMM0 ;
  2824. MOVUPD XMM5, [EDX] ;
  2825. ADDPD XMM1, XMM5
  2826. MOVUPD [EDX], XMM1 ;
  2827. MOVUPD XMM6, [EDX+16] ;
  2828. ADDPD XMM2, XMM6
  2829. MOVUPD [EDX+16], XMM2 ;
  2830. MOVUPD XMM7, [EDX+32] ;
  2831. ADDPD XMM3, XMM7
  2832. MOVUPD [EDX+32], XMM3 ;
  2833. MOVUPD XMM5, [EDX+48] ;
  2834. ADDPD XMM4, XMM5
  2835. MOVUPD [EDX+48], XMM4 ;
  2836. ADD EDX, 64 ;
  2837. SUB EAX, 8 ;
  2838. JMP unaligned8 ;
  2839. ; LOOP FOR 2 pieces unaligned
  2840. unaligned2: ;
  2841. CMP EAX, 2 ;
  2842. JL singlepieces ; len < 2- > EXIT
  2843. MOVUPD XMM1, [EBX] ;
  2844. ADD EBX, 16 ;
  2845. MULPD XMM1, XMM0 ;
  2846. MOVUPD XMM2, [EDX] ;
  2847. ADDPD XMM1, XMM2
  2848. MOVUPD [EDX], XMM1 ;
  2849. ADD EDX, 16 ;
  2850. SUB EAX, 2 ;
  2851. JMP unaligned2 ;
  2852. ; one piece left OR non-contiguous data
  2853. single:
  2854. singlepieces: ;
  2855. CMP EAX, 0 ;
  2856. JLE endL ; len <= 0- > EXIT
  2857. MOVSD XMM1, [EBX]
2858. ADD EBX, [EBP+linc] ; INC(ladr, linc)
2859. MULSD XMM1, XMM0
2860. MOVSD XMM2, [EDX] ;
2861. ADDSD XMM1, XMM2
2862. MOVSD [EDX], XMM1
2863. ADD EDX, [EBP+dinc] ; INC(dadr, dinc)
  2864. DEC EAX ; DEC(len)
  2865. JMP singlepieces ;
  2866. endL:
  2867. END IncMulAXSXLoopSSE;
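(* Comment-only reference sketch of the semantics of IncMulAXSXLoopSSE above: a multiply-accumulate
dadr[i] := dadr[i] + ladr[i] * scalar over len LONGREAL elements, where the scalar is the LONGREAL
at radr and linc, dinc are byte increments. Name and helper are illustrative only.
PROCEDURE IncMulAXSXLoopRef( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
VAR l, d, r: LONGREAL;
BEGIN
	SYSTEM.GET( radr, r ); (* scalar factor, splat into XMM0 by the SSE2 version *)
	WHILE len > 0 DO
		SYSTEM.GET( ladr, l ); SYSTEM.GET( dadr, d );
		SYSTEM.PUT( dadr, d + l * r );
		INC( ladr, linc ); INC( dadr, dinc );
		DEC( len );
	END;
END IncMulAXSXLoopRef;
*)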
  2868. PROCEDURE IncMulARSRLoopSSE( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2869. (* simple version, does not yet check for alignment, no parallel executions yet: use full 8 registers! *)
  2870. (*
  2871. 1.) check for same alignment of ladr and dadr (ladr MOD 128 = dadr MOD 128)
  2872. 2.) process starting unaligned data ( using single instructions)
  2873. 3.) process aligned data
  2874. 4.) process remaining unaligned data (using single instructions)
  2875. *)
  2876. CODE {SYSTEM.i386, SYSTEM.SSE}
  2877. ; register initialization
2878. MOV EAX, [EBP+len] ; EAX reserved FOR length
  2879. CMP EAX, 0 ;
  2880. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  2881. MOV EBX, [EBP+ladr] ; EBX reserved FOR ladr
  2882. MOV EDX, [EBP+dadr] ; EDX reserved FOR dadr
  2883. MOV ECX, [EBP+radr] ;
  2884. MOVSS XMM0, [ECX] ;
2885. SHUFPS XMM0, XMM0, 0 ; all positions now carry the same value
  2886. ; check IF data are contiguous IN memory
2887. CMP [EBP+linc], 4 ; check left FOR continuity
  2888. JNE single ; not continuous- > simplest method
  2889. CMP [EBP+dinc], 4 ; check dest FOR continuity
  2890. JNE single ; not continuous- > simplest method
  2891. ; check FOR alignment
  2892. MOV ECX, EBX ;
  2893. AND ECX, 3 ; ladr MOD 4
  2894. CMP ECX, 0 ; ECX = 0- > 32 Bit alignment
  2895. JNE unaligned ; not 32 bit aligned
  2896. MOV ECX, EDX ;
  2897. AND ECX, 3 ; dadr MOD 4
  2898. CMP ECX, 0 ; ECX = 0- > 32 Bit alignment
2899. JNE unaligned ; not 32 bit aligned
  2900. MOV ESI, EBX ;
  2901. AND ESI, 8+4 ; 16 byte alignment
  2902. MOV EDI, EDX ;
  2903. AND EDI, 8+4 ; 16 byte alignment
  2904. CMP ESI, EDI ;
  2905. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  2906. CMP ESI, 0 ;
  2907. JE aligned ; already aligned
  2908. align:
2909. ; one single element processing UNTIL 128 bit alignment achieved
  2910. MOVSS XMM1, [EBX] ;
  2911. MULSS XMM1, XMM0 ;
  2912. MOVSS XMM2, [EDX] ;
  2913. ADDSS XMM1, XMM2 ;
  2914. MOVSS [EDX], XMM1 ;
  2915. ADD EBX, 4 ;
  2916. ADD EDX, 4 ;
  2917. DEC EAX ; one element has been processed ;
  2918. CMP EAX, 0 ; all elements already processed?
  2919. JLE single
  2920. MOV ESI, EBX ;
  2921. AND ESI, 8+4 ;
  2922. CMP ESI, 0 ;
  2923. JNE align ;
  2924. aligned:
  2925. aligned16:
  2926. CMP EAX, 16 ;
2927. JL aligned4 ; len < 16 -> continue WITH aligned4
  2928. MOVAPS XMM1, [EBX] ;
  2929. MOVAPS XMM2, [EBX+16] ;
  2930. MOVAPS XMM3, [EBX+32] ;
  2931. MOVAPS XMM4, [EBX+48] ;
  2932. ADD EBX, 64 ;
  2933. MULPS XMM1, XMM0 ;
  2934. MULPS XMM2, XMM0 ;
  2935. MULPS XMM3, XMM0 ;
  2936. MULPS XMM4, XMM0 ;
  2937. MOVAPS XMM5, [EDX] ;
  2938. ADDPS XMM1, XMM5 ;
  2939. MOVAPS [EDX], XMM1 ;
  2940. MOVAPS XMM6, [EDX+16] ;
  2941. ADDPS XMM2, XMM6 ;
  2942. MOVAPS [EDX+16], XMM2 ;
  2943. MOVAPS XMM7, [EDX+32] ;
  2944. ADDPS XMM3, XMM7 ;
  2945. MOVAPS [EDX+32], XMM3 ;
  2946. MOVAPS XMM5, [EDX+48] ;
  2947. ADDPS XMM4, XMM5 ;
  2948. MOVAPS [EDX+48], XMM4 ;
  2949. ADD EDX, 64 ;
  2950. SUB EAX, 16 ;
  2951. JMP aligned16 ;
2952. ; LOOP FOR 4 pieces aligned
2953. aligned4: ;
2954. CMP EAX, 4 ;
2955. JL singlepieces ; len < 4 -> EXIT TO singlepieces
  2956. MOVAPS XMM1, [EBX] ;
  2957. ADD EBX, 16 ;
  2958. MULPS XMM1, XMM0 ;
  2959. MOVAPS XMM2, [EDX] ;
  2960. ADDPS XMM1, XMM2 ;
  2961. MOVAPS [EDX], XMM1 ;
  2962. ADD EDX, 16 ;
  2963. SUB EAX, 4 ;
  2964. JMP aligned4 ;
  2965. ; LOOP FOR 16 unaligned pieces(20 pieces not better!)
  2966. unaligned: ;
  2967. unaligned16: ;
  2968. CMP EAX, 16 ;
2969. JL unaligned4 ; len < 16 -> continue WITH unaligned4
  2970. MOVUPS XMM1, [EBX] ;
  2971. MOVUPS XMM2, [EBX+16] ;
  2972. MOVUPS XMM3, [EBX+32] ;
  2973. MOVUPS XMM4, [EBX+48] ;
  2974. ADD EBX, 64
  2975. MULPS XMM1, XMM0 ;
  2976. MULPS XMM2, XMM0 ;
  2977. MULPS XMM3, XMM0 ;
  2978. MULPS XMM4, XMM0 ;
  2979. MOVUPS XMM5, [EDX] ;
  2980. ADDPS XMM1, XMM5 ;
  2981. MOVUPS [EDX], XMM1 ;
  2982. MOVUPS XMM6, [EDX+16] ;
  2983. ADDPS XMM2, XMM6 ;
  2984. MOVUPS [EDX+16], XMM2 ;
  2985. MOVUPS XMM7, [EDX+32] ;
  2986. ADDPS XMM3, XMM7 ;
  2987. MOVUPS [EDX+32], XMM3 ;
  2988. MOVUPS XMM5, [EDX+48] ;
  2989. ADDPS XMM4, XMM5 ;
  2990. MOVUPS [EDX+48], XMM4 ;
  2991. ADD EDX, 64 ;
  2992. SUB EAX, 16 ;
  2993. JMP unaligned16 ;
2994. ; LOOP FOR 4 pieces unaligned
2995. unaligned4: ;
2996. CMP EAX, 4 ;
2997. JL singlepieces ; len < 4 -> EXIT TO singlepieces
  2998. MOVUPS XMM1, [EBX] ;
  2999. ADD EBX, 16 ;
  3000. MULPS XMM1, XMM0 ;
  3001. MOVUPS XMM2, [EDX] ;
  3002. ADDPS XMM1, XMM2 ;
  3003. MOVUPS [EDX], XMM1 ;
  3004. ADD EDX, 16 ;
  3005. SUB EAX, 4 ;
  3006. JMP unaligned4 ;
  3007. ; one piece left OR non-contiguous data
  3008. single:
  3009. singlepieces: ;
  3010. CMP EAX, 0 ;
  3011. JLE endL ; len <= 0- > EXIT
  3012. MOVSS XMM1, [EBX]
3013. ADD EBX, [EBP+linc] ; INC(ladr, linc)
3014. MULSS XMM1, XMM0
3015. MOVSS XMM2, [EDX] ;
3016. ADDSS XMM1, XMM2 ;
3017. MOVSS [EDX], XMM1
3018. ADD EDX, [EBP+dinc] ; INC(dadr, dinc)
  3019. DEC EAX ; DEC(len)
  3020. JMP singlepieces ;
  3021. endL:
  3022. END IncMulARSRLoopSSE;
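(* Sketch of the alignment decision used by the REAL variants above (MulARSRLoopSSE,
IncMulARSRLoopSSE), assuming 4 byte elements: MOVAPS processing is only possible when ladr and
dadr are 4 byte aligned and share the same offset within a 16 byte line; the number of scalar
peel iterations is then the distance of ladr to the next 16 byte boundary. PeelCount is a
hypothetical helper, result -1 meaning "take the unaligned or single-element path".
PROCEDURE PeelCount( ladr, dadr: ADDRESS ): SIZE;
BEGIN
	IF (ladr MOD 4 # 0) OR (dadr MOD 4 # 0) OR (ladr MOD 16 # dadr MOD 16) THEN
		RETURN -1
	ELSE
		RETURN ((16 - ladr MOD 16) MOD 16) DIV 4 (* 0..3 single elements to process first *)
	END;
END PeelCount;
*)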
  3023. (*
  3024. PROCEDURE AlignedSPXSSE5( ladr, radr, dadr, stride, incd, len: LONGINT );
  3025. CODE {SYSTEM.i386, SYSTEM.SSE2}
  3026. ; ; register initialization
  3027. MOV EDX, [EBP+dadr] ; EDX reserved for dadr
  3028. MOV EBX, [EBP+ladr] ; EBX reserved for ladr
  3029. MOV ESI, [EBP+radr] ; ESI reserved for radr
  3030. MOV EAX, [EBP+len] ; EAX reserverd for length
  3031. MOV ECX, [EBP+stride] ; ECX reserved for stride
  3032. XORPD XMM2, XMM2 ;
  3033. XORPD XMM3, XMM3 ;
  3034. XORPD XMM4, XMM4 ;
  3035. XORPD XMM5, XMM5 ;
  3036. XORPD XMM6, XMM6 ;
  3037. XOR EDI, EDI ;
  3038. aligned4:
  3039. CMP EAX, 4 ;
  3040. JL aligned2 ; ; len < 4- > exit to singlepieces
  3041. MOV ESI, [EBP+radr] ;
  3042. ADD ESI, EDI ;
  3043. MOVAPD XMM7, [EBX] ;
  3044. MOVAPD XMM0, [ESI] ;
  3045. ADD ESI, ECX ;
  3046. MOVAPD XMM1, [ESI] ;
  3047. MULPD XMM0, XMM7 ;
  3048. ADDPD XMM2, XMM0 ;
  3049. ADD ESI, ECX ;
  3050. MOVAPD XMM0, [ESI] ;
  3051. MULPD XMM1, XMM7 ;
  3052. ADDPD XMM3, XMM1 ;
  3053. ADD ESI, ECX ;
  3054. MOVAPD XMM1, [ESI] ;
  3055. MULPD XMM0, XMM7 ;
  3056. ADDPD XMM4, XMM0 ;
  3057. ADD ESI, ECX ;
  3058. MOVAPD XMM0, [ESI] ;
  3059. MULPD XMM1, XMM7 ;
  3060. ADDPD XMM5, XMM1 ;
  3061. MULPD XMM0, XMM7 ;
  3062. ADDPD XMM6, XMM0 ;
  3063. ADD EBX, 16 ;
  3064. ADD EDI, 16 ;
  3065. MOV ESI, [EBP+radr] ;
  3066. ADD ESI, EDI ;
  3067. MOVAPD XMM7, [EBX] ;
  3068. MOVAPD XMM0, [ESI] ;
  3069. ADD ESI, ECX ;
  3070. MOVAPD XMM1, [ESI] ;
  3071. MULPD XMM0, XMM7 ;
  3072. ADDPD XMM2, XMM0 ;
  3073. ADD ESI, ECX ;
  3074. MOVAPD XMM0, [ESI] ;
  3075. MULPD XMM1, XMM7 ;
  3076. ADDPD XMM3, XMM1 ;
  3077. ADD ESI, ECX ;
  3078. MOVAPD XMM1, [ESI] ;
  3079. MULPD XMM0, XMM7 ;
  3080. ADDPD XMM4, XMM0 ;
  3081. ADD ESI, ECX ;
  3082. MOVAPD XMM0, [ESI] ;
  3083. MULPD XMM1, XMM7 ;
  3084. ADDPD XMM5, XMM1 ;
  3085. MULPD XMM0, XMM7 ;
  3086. ADDPD XMM6, XMM0 ;
  3087. ADD EBX, 16 ;
  3088. ADD EDI, 16 ;
  3089. SUB EAX, 4 ;
  3090. JMP aligned4 ;
  3091. aligned2:
  3092. CMP EAX, 2 ;
  3093. JL horizontaladd ; ; len < 4- > exit to singlepieces
  3094. MOV ESI, [EBP+radr] ;
  3095. ADD ESI, EDI ;
  3096. MOVAPD XMM7, [EBX] ;
  3097. MOVAPD XMM0, [ESI] ;
  3098. ADD ESI, ECX ;
  3099. MOVAPD XMM1, [ESI] ;
  3100. MULPD XMM0, XMM7 ;
  3101. ADDPD XMM2, XMM0 ;
  3102. ADD ESI, ECX ;
  3103. MOVAPD XMM0, [ESI] ;
  3104. MULPD XMM1, XMM7 ;
  3105. ADDPD XMM3, XMM1 ;
  3106. ADD ESI, ECX ;
  3107. MOVAPD XMM1, [ESI] ;
  3108. MULPD XMM0, XMM7 ;
  3109. ADDPD XMM4, XMM0 ;
  3110. ADD ESI, ECX ;
  3111. MOVAPD XMM0, [ESI] ;
  3112. MULPD XMM1, XMM7 ;
  3113. ADDPD XMM5, XMM1 ;
  3114. MULPD XMM0, XMM7 ;
  3115. ADDPD XMM6, XMM0 ;
  3116. ADD EBX, 16 ;
  3117. ADD EDI, 16 ;
  3118. SUB EAX, 2 ;
  3119. JMP aligned2 ;
  3120. horizontaladd: ;
  3121. MOVAPD XMM1, XMM2 ;
  3122. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  3123. ADDPD XMM2, XMM1 ;
  3124. MOVAPD XMM1, XMM3 ;
  3125. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  3126. ADDPD XMM3, XMM1 ;
  3127. MOVAPD XMM1, XMM4 ;
  3128. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  3129. ADDPD XMM4, XMM1 ;
  3130. MOVAPD XMM1, XMM5 ;
  3131. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  3132. ADDPD XMM5, XMM1 ;
  3133. MOVAPD XMM1, XMM6 ;
  3134. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  3135. ADDPD XMM6, XMM1 ;
  3136. singlepieces: ;
  3137. CMP EAX, 0 ;
  3138. JLE store ; len <= 0- > exit
  3139. MOV ESI, [EBP+radr] ;
  3140. MOVSD XMM7, [EBX] ;
  3141. MOVSD XMM0, [ESI+EDI] ;
  3142. ADD ESI, ECX ;
  3143. MOVSD XMM1, [ESI+EDI] ;
  3144. MULSD XMM0, XMM7 ;
  3145. ADDSD XMM2, XMM0 ;
  3146. ADD ESI, ECX ;
  3147. MOVSD XMM0, [ESI+EDI] ;
  3148. MULSD XMM1, XMM7 ;
  3149. ADDSD XMM3, XMM1 ;
  3150. ADD ESI, ECX ;
  3151. MOVSD XMM1, [ESI+EDI] ;
  3152. MULSD XMM0, XMM7 ;
  3153. ADDSD XMM4, XMM0 ;
  3154. ADD ESI, ECX ;
  3155. MOVSD XMM1, [ESI+EDI] ;
  3156. MULSD XMM0, XMM7 ;
  3157. ADDSD XMM4, XMM0 ;
  3158. ADD ESI, ECX ;
  3159. MOVSD XMM0, [ESI+EDI] ;
  3160. MULSD XMM1, XMM7 ;
  3161. ADDSD XMM5, XMM1 ;
  3162. MULSD XMM0, XMM7 ;
  3163. ADDSD XMM6, XMM0 ;
  3164. ADD EBX, 4 (* INC(ladr,incl) *)
  3165. ADD EDI, 4 (* INC(radr,incr) *)
  3166. DEC EAX ; DEC(len)
  3167. JMP singlepieces ;
  3168. store:
  3169. MOVSD [EDX], XMM2 ;
  3170. ADD EDX, [EBP+incd] ;
  3171. MOVSD [EDX], XMM3 ;
  3172. ADD EDX, [EBP+incd] ;
  3173. MOVSD [EDX], XMM4 ;
  3174. ADD EDX, [EBP+incd] ;
  3175. MOVSD [EDX], XMM5 ;
  3176. ADD EDX, [EBP+incd] ;
  3177. MOVSD [EDX], XMM6 ;
  3178. end:
  3179. END AlignedSPXSSE5;
  3180. *)
  3181. (* sse version of scalar product *)
  3182. PROCEDURE AlignedSPXSSE( ladr, radr, dadr: ADDRESS; len: SIZE;
  3183. add: BOOLEAN );
  3184. CODE {SYSTEM.i386, SYSTEM.SSE2}
  3185. ; register initialization
3186. MOV EAX, [EBP+len] ; EAX reserved FOR length
  3187. MOV EBX, [EBP+ladr] ; EBX reserved FOR ladr
  3188. MOV ECX, [EBP+radr] ; ECX reserved FOR radr
  3189. MOV EDX, [EBP+dadr] ; EDX reserved FOR dadr
  3190. XORPD XMM0, XMM0 ;
  3191. CMP [EBP+add], 0 ; add?
  3192. JE aligned8 ; no add
  3193. MOVSD XMM0, [EDX] ;
  3194. aligned8:
  3195. CMP EAX, 8 ;
3196. JL aligned2 ; len < 8 -> continue WITH aligned2
  3197. MOVAPD XMM1, [EBX] ;
  3198. MOVAPD XMM2, [EBX+16] ;
  3199. MOVAPD XMM3, [EBX+32] ;
  3200. MOVAPD XMM4, [ECX] ;
  3201. MOVAPD XMM5, [ECX+16] ;
  3202. MOVAPD XMM6, [ECX+32] ;
  3203. MULPD XMM1, XMM4 ;
  3204. ADDPD XMM0, XMM1 ;
  3205. MULPD XMM2, XMM5 ;
  3206. ADDPD XMM0, XMM2 ;
  3207. MULPD XMM3, XMM6 ;
  3208. ADDPD XMM0, XMM3 ;
  3209. MOVAPD XMM7, [EBX+48] ;
  3210. MOVAPD XMM1, [ECX+48] ;
  3211. MULPD XMM1, XMM7 ;
  3212. ADDPD XMM0, XMM1 ;
  3213. ADD EBX, 64 ;
  3214. ADD ECX, 64 ;
  3215. SUB EAX, 8 ;
  3216. JMP aligned8 ;
3217. ; LOOP FOR 4 pieces aligned
3218. aligned4:
3219. CMP EAX, 4 ;
3220. JL aligned2 ; len < 4 -> continue WITH aligned2
  3221. MOVAPD XMM1, [EBX] ;
  3222. MOVAPD XMM2, [ECX] ;
  3223. MOVAPD XMM3, [EBX+16] ;
  3224. MOVAPD XMM4, [ECX+16] ;
  3225. MULPD XMM1, XMM2 ;
  3226. ADDPD XMM0, XMM1 ;
  3227. MULPD XMM3, XMM4 ;
  3228. ADDPD XMM0, XMM3 ;
  3229. ADD EBX, 32 ;
  3230. ADD ECX, 32 ;
  3231. SUB EAX, 4 ;
  3232. JMP aligned4 ;
  3233. aligned2:
  3234. CMP EAX, 2 ;
3235. JL horizontaladd ; len < 2 -> continue WITH horizontaladd
  3236. MOVAPD XMM1, [EBX] ;
  3237. MOVAPD XMM2, [ECX] ;
  3238. MULPD XMM1, XMM2 ;
  3239. ADDPD XMM0, XMM1 ;
  3240. ADD EBX, 16 ;
  3241. ADD ECX, 16 ;
  3242. SUB EAX, 2 ;
  3243. JMP aligned2 ;
  3244. horizontaladd: ;
  3245. MOVAPD XMM1, XMM0 ;
3246. SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
  3247. ADDPD XMM0, XMM1 ;
  3248. singlepieces: ;
  3249. CMP EAX, 0 ;
  3250. JLE store ; len <= 0- > EXIT
  3251. MOVSD XMM1, [EBX]
  3252. MOVSD XMM2, [ECX]
  3253. MULSD XMM1, XMM2
  3254. ADDSD XMM0, XMM1
  3255. ADD EBX, 8 ; INC(ladr, incl)
  3256. ADD ECX, 8 ; INC(radr, incr)
  3257. DEC EAX ; DEC(len)
  3258. JMP singlepieces ;
  3259. store:
  3260. MOVSD [EDX], XMM0 ;
  3261. endL:
  3262. END AlignedSPXSSE;
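(* Scalar reference for AlignedSPXSSE above: dot product of two contiguous LONGREAL vectors,
with optional accumulation onto the value already stored at dadr (parameter add). Kept as a
comment; the name SPXRef and the use of SYSTEM.GET/PUT are illustrative only.
PROCEDURE SPXRef( ladr, radr, dadr: ADDRESS; len: SIZE; add: BOOLEAN );
VAR sum, l, r: LONGREAL;
BEGIN
	IF add THEN SYSTEM.GET( dadr, sum ) ELSE sum := 0 END;
	WHILE len > 0 DO
		SYSTEM.GET( ladr, l ); SYSTEM.GET( radr, r );
		sum := sum + l * r;
		INC( ladr, 8 ); INC( radr, 8 ); DEC( len );
	END;
	SYSTEM.PUT( dadr, sum );
END SPXRef;
*)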
  3263. (*
  3264. PROCEDURE AlignedSPRSSE5( ladr, radr, dadr, stride, incd, len: LONGINT );
  3265. CODE {SYSTEM.i386, SYSTEM.SSE}
  3266. ; register initialization
  3267. MOV EDX, [EBP+dadr] ; EDX reserved for dadr
  3268. MOV EBX, [EBP+ladr] ; EBX reserved for ladr
  3269. MOV ESI, [EBP+radr] ; ECX reserved for radr
  3270. MOV EAX, [EBP+len] ; EAX reserverd for length
  3271. MOV ECX, [EBP+stride] ;
  3272. XORPS XMM2, XMM2 ;
  3273. XORPS XMM3, XMM3 ;
  3274. XORPS XMM4, XMM4 ;
  3275. XORPS XMM5, XMM5 ;
  3276. XORPS XMM6, XMM6 ;
  3277. XOR EDI, EDI ;
  3278. aligned8:
  3279. CMP EAX, 8 ;
  3280. JL aligned4 ; ; len < 4- > exit to singlepieces
  3281. PREFETCH0 24[EBX] ;
  3282. ; PREFETCH0[ESI] ;
  3283. MOV ESI, [EBP+radr] ;
  3284. ADD ESI, EDI ;
  3285. MOVAPS XMM7, [EBX] ;
  3286. MOVAPS XMM0, [ESI] ;
  3287. ADD ESI, ECX ;
  3288. MOVAPS XMM1, [ESI] ;
  3289. MULPS XMM0, XMM7 ;
  3290. ADDPS XMM2, XMM0 ;
  3291. ADD ESI, ECX ;
  3292. MOVAPS XMM0, [ESI] ;
  3293. MULPS XMM1, XMM7 ;
  3294. ADDPS XMM3, XMM1 ;
  3295. ADD ESI, ECX ;
  3296. MOVAPS XMM1, [ESI] ;
  3297. MULPS XMM0, XMM7 ;
  3298. ADDPS XMM4, XMM0 ;
  3299. ADD ESI, ECX ;
  3300. MOVAPS XMM0, [ESI] ;
  3301. MULPS XMM1, XMM7 ;
  3302. ADDPS XMM5, XMM1 ;
  3303. MULPS XMM0, XMM7 ;
  3304. ADDPS XMM6, XMM0 ;
  3305. ADD EBX, 16 ;
  3306. ADD EDI, 16 ;
  3307. MOV ESI, [EBP+radr] ;
  3308. ADD ESI, EDI ;
  3309. MOVAPS XMM7, [EBX] ;
  3310. MOVAPS XMM0, [ESI] ;
  3311. ADD ESI, ECX ;
  3312. MOVAPS XMM1, [ESI] ;
  3313. MULPS XMM0, XMM7 ;
  3314. ADDPS XMM2, XMM0 ;
  3315. ADD ESI, ECX ;
  3316. MOVAPS XMM0, [ESI] ;
  3317. MULPS XMM1, XMM7 ;
  3318. ADDPS XMM3, XMM1 ;
  3319. ADD ESI, ECX ;
  3320. MOVAPS XMM1, [ESI] ;
  3321. MULPS XMM0, XMM7 ;
  3322. ADDPS XMM4, XMM0 ;
  3323. ADD ESI, ECX ;
  3324. MOVAPS XMM0, [ESI] ;
  3325. MULPS XMM1, XMM7 ;
  3326. ADDPS XMM5, XMM1 ;
  3327. MULPS XMM0, XMM7 ;
  3328. ADDPS XMM6, XMM0 ;
  3329. ADD EBX, 16 ;
  3330. ADD EDI, 16 ;
  3331. SUB EAX, 8 ;
  3332. JMP aligned8 ;
  3333. aligned4:
  3334. CMP EAX, 4 ;
  3335. JL horizontaladd ; ; len < 4- > exit to singlepieces
  3336. MOV ESI, [EBP+radr] ;
  3337. ADD ESI, EDI ;
  3338. MOVAPS XMM7, [EBX] ;
  3339. MOVAPS XMM0, [ESI] ;
  3340. ADD ESI, ECX ;
  3341. MOVAPS XMM1, [ESI] ;
  3342. MULPS XMM0, XMM7 ;
  3343. ADDPS XMM2, XMM0 ;
  3344. ADD ESI, ECX ;
  3345. MOVAPS XMM0, [ESI] ;
  3346. MULPS XMM1, XMM7 ;
  3347. ADDPS XMM3, XMM1 ;
  3348. ADD ESI, ECX ;
  3349. MOVAPS XMM1, [ESI] ;
  3350. MULPS XMM0, XMM7 ;
  3351. ADDPS XMM4, XMM0 ;
  3352. ADD ESI, ECX ;
  3353. MOVAPS XMM0, [ESI] ;
  3354. MULPS XMM1, XMM7 ;
  3355. ADDPS XMM5, XMM1 ;
  3356. MULPS XMM0, XMM7 ;
  3357. ADDPS XMM6, XMM0 ;
  3358. ADD EBX, 16 ;
  3359. ADD EDI, 16 ;
  3360. SUB EAX, 4 ;
  3361. JMP aligned4 ;
  3362. horizontaladd: ;
  3363. MOVLHPS XMM1, XMM2 ;
  3364. ADDPS XMM1, XMM2 ;
  3365. SHUFPS XMM2, XMM1, 48 ;
  3366. ADDPS XMM2, XMM1 ;
  3367. MOVHLPS XMM2, XMM2 ;
  3368. MOVLHPS XMM1, XMM3 ;
  3369. ADDPS XMM1, XMM3 ;
  3370. SHUFPS XMM3, XMM1, 48 ;
  3371. ADDPS XMM3, XMM1 ;
  3372. MOVHLPS XMM3, XMM3 ;
  3373. MOVLHPS XMM1, XMM4 ;
  3374. ADDPS XMM1, XMM4 ;
  3375. SHUFPS XMM4, XMM1, 48 ;
  3376. ADDPS XMM4, XMM1 ;
  3377. MOVHLPS XMM4, XMM4 ;
  3378. MOVLHPS XMM1, XMM5 ;
  3379. ADDPS XMM1, XMM5 ;
  3380. SHUFPS XMM5, XMM1, 48 ;
  3381. ADDPS XMM5, XMM1 ;
  3382. MOVHLPS XMM5, XMM5 ;
  3383. MOVLHPS XMM1, XMM6 ;
  3384. ADDPS XMM1, XMM6 ;
  3385. SHUFPS XMM6, XMM1, 48 ;
  3386. ADDPS XMM6, XMM1 ;
  3387. MOVHLPS XMM6, XMM6 ;
  3388. singlepieces: ;
  3389. CMP EAX, 0 ;
  3390. JLE store ; len <= 0- > exit
  3391. MOV ESI, [EBP+radr] ;
  3392. MOVSS XMM7, [EBX] ;
  3393. MOVSS XMM0, [ESI+EDI] ;
  3394. ADD ESI, ECX ;
  3395. MOVSS XMM1, [ESI+EDI] ;
  3396. MULSS XMM0, XMM7 ;
  3397. ADDSS XMM2, XMM0 ;
  3398. ADD ESI, ECX ;
  3399. MOVSS XMM0, [ESI+EDI] ;
  3400. MULSS XMM1, XMM7 ;
  3401. ADDSS XMM3, XMM1 ;
  3402. ADD ESI, ECX ;
  3403. MOVSS XMM1, [ESI+EDI] ;
  3404. MULSS XMM0, XMM7 ;
  3405. ADDSS XMM4, XMM0 ;
  3406. ADD ESI, ECX ;
  3407. MOVSS XMM0, [ESI+EDI] ;
  3408. MULSS XMM1, XMM7 ;
  3409. ADDSS XMM5, XMM1 ;
  3410. MULSS XMM0, XMM7 ;
  3411. ADDSS XMM6, XMM0 ;
  3412. ADD EBX, 4 (* INC(ladr,incl) *)
  3413. ADD EDI, 4 (* INC(radr,incr) *)
  3414. DEC EAX ; DEC(len)
  3415. JMP singlepieces ;
  3416. store:
  3417. MOVSS [EDX], XMM2 ;
  3418. ADD EDX, [EBP+incd] ;
  3419. MOVSS [EDX], XMM3 ;
  3420. ADD EDX, [EBP+incd] ;
  3421. MOVSS [EDX], XMM4 ;
  3422. ADD EDX, [EBP+incd] ;
  3423. MOVSS [EDX], XMM5 ;
  3424. ADD EDX, [EBP+incd] ;
  3425. MOVSS [EDX], XMM6 ;
  3426. end:
  3427. END AlignedSPRSSE5;
  3428. *)
  3429. PROCEDURE AlignedSPRSSE( ladr, radr, dadr: ADDRESS; len: SIZE;
  3430. add: BOOLEAN );
  3431. CODE {SYSTEM.i386, SYSTEM.SSE}
  3432. ; register initialization
  3433. MOV EDX, [EBP+dadr] ; EDX reserved FOR dadr
  3434. MOV EBX, [EBP+ladr] ; EBX reserved FOR ladr
  3435. MOV ECX, [EBP+radr] ; ECX reserved FOR radr
3436. MOV EAX, [EBP+len] ; EAX reserved FOR length
  3437. XORPS XMM0, XMM0 ;
  3438. CMP [EBP+add], 0 ; add?
  3439. JE aligned16 ; no add
  3440. MOVSS XMM0, [EDX] ;
  3441. aligned16:
  3442. CMP EAX, 16 ;
3443. JL aligned8 ; len < 16 -> continue WITH aligned8
  3444. MOVAPS XMM1, [EBX] ;
  3445. MOVAPS XMM4, [ECX] ;
  3446. MOVAPS XMM2, [EBX+16] ;
  3447. MOVAPS XMM5, [ECX+16] ;
  3448. MULPS XMM1, XMM4 ;
  3449. ADDPS XMM0, XMM1 ;
  3450. MOVAPS XMM3, [EBX+32] ;
  3451. MOVAPS XMM6, [ECX+32] ;
  3452. MULPS XMM2, XMM5 ;
  3453. ADDPS XMM0, XMM2 ;
  3454. MOVAPS XMM7, [EBX+48] ;
  3455. MOVAPS XMM1, [ECX+48] ;
  3456. MULPS XMM3, XMM6 ;
  3457. ADDPS XMM0, XMM3 ;
  3458. MULPS XMM1, XMM7 ;
  3459. ADDPS XMM0, XMM1 ;
  3460. ADD EBX, 64 ;
  3461. ADD ECX, 64 ;
  3462. SUB EAX, 16 ;
  3463. JMP aligned16 ;
  3464. ; LOOP FOR 8 pieces aligned
  3465. aligned8:
  3466. CMP EAX, 8 ;
3467. JL aligned4 ; len < 8 -> continue WITH aligned4
  3468. MOVAPS XMM1, [EBX] ;
  3469. MOVAPS XMM4, [ECX] ;
  3470. MOVAPS XMM2, [EBX+16] ;
  3471. MOVAPS XMM5, [ECX+16] ;
  3472. MULPS XMM1, XMM4 ;
  3473. ADDPS XMM0, XMM1 ;
  3474. MULPS XMM2, XMM5 ;
  3475. ADDPS XMM0, XMM2 ;
  3476. ADD EBX, 32 ;
  3477. ADD ECX, 32 ;
  3478. SUB EAX, 8 ;
  3479. JMP aligned8 ;
  3480. aligned4:
  3481. CMP EAX, 4 ;
3482. JL horizontaladd ; len < 4 -> continue WITH horizontaladd
  3483. MOVAPS XMM1, [EBX] ;
  3484. MOVAPS XMM2, [ECX] ;
  3485. MULPS XMM1, XMM2 ;
  3486. ADDPS XMM0, XMM1 ;
  3487. ADD EBX, 16 ;
  3488. ADD ECX, 16 ;
  3489. SUB EAX, 4 ;
  3490. JMP aligned4 ;
  3491. horizontaladd: ;
  3492. MOVAPS XMM1, XMM0 ;
  3493. ; 1*0 (* dest 0 -> dest 0 *) + 4*1 (* dest 1 -> dest 1 *) +16*0 (* src 0 -> dest 2 *) +64*1 (* src 1 -> dest 3 *) ;
  3494. SHUFPS XMM1, XMM1, 1*0 +4*1 +16*0 +64*1
  3495. ADDPS XMM1, XMM0 ;
  3496. MOVAPS XMM0, XMM1
  3497. SHUFPS XMM0, XMM0, 16*3 ; src 3- > dest 2
  3498. ADDPS XMM0, XMM1 ;
  3499. SHUFPS XMM0, XMM0, 1*2 ; dest 2- > dest 0
  3500. singlepieces: ;
  3501. CMP EAX, 0 ;
  3502. JLE store ; len <= 0- > EXIT
  3503. MOVSS XMM1, [EBX]
  3504. MOVSS XMM2, [ECX]
  3505. MULSS XMM1, XMM2
  3506. ADDSS XMM0, XMM1
  3507. ADD EBX, 4 ; INC(ladr, incl)
  3508. ADD ECX, 4 ; INC(radr, incr)
  3509. DEC EAX ; DEC(len)
  3510. JMP singlepieces ;
  3511. store:
  3512. MOVSS [EDX], XMM0 ;
  3513. endL:
  3514. END AlignedSPRSSE;
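(* Worked trace of the horizontaladd sequence in AlignedSPRSSE above, for an accumulator
XMM0 = (s0, s1, s2, s3):
after SHUFPS XMM1, XMM1, 1*0+4*1+16*0+64*1 (= 68) and ADDPS XMM1, XMM0:
	XMM1 = (2*s0, 2*s1, s0+s2, s1+s3);
after SHUFPS XMM0, XMM0, 16*3 (= 48) and ADDPS XMM0, XMM1:
	element 2 of XMM0 = s0+s1+s2+s3;
the final SHUFPS XMM0, XMM0, 1*2 copies that element into position 0, from where MOVSS stores it. *)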
  3515. (*
  3516. (* sse version of scalar product *)
  3517. PROCEDURE AlignedSPRSSE( ladr, radr, dadr, rows, stride, dinc, len: LONGINT );
  3518. CODE {SYSTEM.i386, SYSTEM.SSE2}
  3519. ; register initialization
  3520. MOV EDI, [EBP+radr] ; radr start
  3521. MOV EDX, [EBP+dadr] ; EDX reserved for dadr
  3522. MOV ESI, [EBP+rows] ; outer loop counter
  3523. outerloop:
  3524. CMP ESI, 0 ;
  3525. JLE end ;
  3526. MOV EBX, [EBP+ladr] ; EBX reserved for ladr
  3527. MOV ECX, EDI ; ECX reserved for radr
  3528. MOV EAX, [EBP+len] ; EAX reserverd for length
  3529. XORPS XMM0, XMM0 ;
  3530. aligned16:
  3531. CMP EAX, 16 ;
  3532. JL aligned8 ; len < 4- > exit to singlepieces
  3533. MOVAPS XMM1, [EBX] ;
  3534. MOVAPS XMM2, [EBX+16] ;
  3535. MOVAPS XMM3, [EBX+32] ;
  3536. MOVAPS XMM4, [ECX] ;
  3537. MOVAPS XMM5, [ECX+16] ;
  3538. MOVAPS XMM6, [ECX+32] ;
  3539. MULPS XMM1, XMM4 ;
  3540. ADDPS XMM0, XMM1 ;
  3541. MULPS XMM2, XMM5 ;
  3542. ADDPS XMM0, XMM2 ;
  3543. MULPS XMM3, XMM6 ;
  3544. ADDPS XMM0, XMM3 ;
  3545. MOVAPS XMM7, [EBX+48] ;
  3546. MOVAPS XMM1, [ECX+48] ;
  3547. MULPS XMM1, XMM7 ;
  3548. ADDPS XMM0, XMM1 ;
  3549. ADD EBX, 64 ;
  3550. ADD ECX, 64 ;
  3551. SUB EAX, 16 ;
  3552. JMP aligned16 ;
  3553. ; loop for 8 pieces aligned
  3554. aligned8:
  3555. CMP EAX, 8 ;
  3556. JL aligned4 ; ; len < 4- > exit to singlepieces
  3557. MOVAPS XMM1, [EBX] ;
  3558. MOVAPS XMM2, [EBX+16] ;
  3559. MOVAPS XMM4, [ECX] ;
  3560. MOVAPS XMM5, [ECX+16] ;
  3561. MULPS XMM1, XMM4 ;
  3562. ADDPS XMM0, XMM1 ;
  3563. MULPS XMM2, XMM5 ;
  3564. ADDPS XMM0, XMM2 ;
  3565. ADD EBX, 32 ;
  3566. ADD ECX, 32 ;
  3567. SUB EAX, 8 ;
  3568. JMP aligned8 ;
  3569. aligned4:
  3570. CMP EAX, 4 ;
  3571. JL horizontaladd ; ; len < 4- > exit to singlepieces
  3572. MOVAPS XMM1, [EBX] ;
  3573. MOVAPS XMM2, [ECX] ;
  3574. MULPS XMM1, XMM2 ;
  3575. ADDPS XMM0, XMM1 ;
  3576. ADD EBX, 16 ;
  3577. ADD ECX, 16 ;
  3578. SUB EAX, 4 ;
  3579. JMP aligned4 ;
  3580. horizontaladd: ;
  3581. MOVAPS XMM1, XMM0 ;
  3582. SHUFPS XMM1, XMM1, 1*0 (* dest 0 -> dest 0 *) +4*1 (* dest 1 -> dest 1 *) +16*0 (* src 0 -> dest 2 *) +64*1 (* src 1 -> dest 3 *) ;
  3583. ADDPS XMM1, XMM0 ;
  3584. MOVAPS XMM0, XMM1
  3585. SHUFPS XMM0, XMM0, 16*3 ; (* src 3-> dest 2 *)
  3586. ADDPS XMM0, XMM1 ;
  3587. SHUFPS XMM0, XMM0, 1*2 ; (* dest 2 -> dest 0 *)
  3588. singlepieces: ;
  3589. CMP EAX, 0 ;
  3590. JLE store ; len <= 0- > exit
  3591. MOVSS XMM1, [EBX]
  3592. MOVSS XMM2, [ECX]
  3593. MULSS XMM1, XMM2
  3594. ADDSS XMM0, XMM1
  3595. ADD EBX, 4 (* INC(ladr,incl) *)
  3596. ADD ECX, 4 (* INC(radr,incr) *)
  3597. DEC EAX ; DEC(len)
  3598. JMP singlepieces ;
  3599. store:
  3600. MOVSS [EDX], XMM0 ;
  3601. ADD EDX, [EBP+dinc] ;
  3602. ADD EDI, [EBP+stride] ;
  3603. DEC ESI ;
  3604. JMP outerloop ;
  3605. end:
  3606. END AlignedSPRSSE;
  3607. *)
  3608. PROCEDURE Copy4( ladr, dadr: ADDRESS; linc, dinc, len: SIZE);
  3609. CODE {SYSTEM.i386}
3610. MOV ESI, [EBP+ladr] ; ESI := ladr
3611. MOV EDI, [EBP+dadr] ; EDI := dadr
3612. MOV ECX, [EBP+len] ; ECX := len
  3613. MOV EAX, [EBP+linc] ;
  3614. CMP EAX, 4 ;
  3615. JNE loopL ;
  3616. MOV EAX, [EBP+dinc] ;
  3617. CMP EAX, 4 ;
  3618. JNE loopL ;
  3619. fastmove:
  3620. CLD ; incremental
3621. REP ;
3622. MOVSD ; copy ECX doublewords (4 bytes each) at once
  3623. JMP endL ;
  3624. loopL:
  3625. CMP ECX, 0 ;
  3626. JLE endL ; WHILE ECX > 0 DO
3627. MOV EAX, [ESI] ; EAX := SYSTEM.GET32(ESI)
3628. MOV [EDI], EAX ; SYSTEM.PUT32(EDI, EAX)
3629. ADD ESI, [EBP+linc] ; INC(ESI, linc)
3630. ADD EDI, [EBP+dinc] ; INC(EDI, dinc)
  3631. DEC ECX ; DEC(ECX)
  3632. JMP loopL
  3633. endL:
  3634. END Copy4;
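(* Scalar reference for Copy4 above (Copy8 is the same with 8 byte elements): copy len elements
from ladr to dadr using byte increments linc and dinc; the assembly switches to a single
REP MOVSD when both increments equal the element size. Name and helper are illustrative only;
LONGINT is used as a 4 byte container on this target.
PROCEDURE Copy4Ref( ladr, dadr: ADDRESS; linc, dinc, len: SIZE );
VAR t: LONGINT;
BEGIN
	WHILE len > 0 DO
		SYSTEM.GET( ladr, t ); SYSTEM.PUT( dadr, t );
		INC( ladr, linc ); INC( dadr, dinc ); DEC( len );
	END;
END Copy4Ref;
*)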
  3635. PROCEDURE Copy8( ladr, dadr: ADDRESS; linc, dinc, len: SIZE );
  3636. CODE {SYSTEM.i386}
3637. MOV ESI, [EBP+ladr] ; ESI := ladr
3638. MOV EDI, [EBP+dadr] ; EDI := dadr
3639. MOV ECX, [EBP+len] ; ECX := len
  3640. MOV EAX, [EBP+linc] ;
  3641. CMP EAX, 8 ;
  3642. JNE loopL ;
  3643. MOV EAX, [EBP+dinc] ;
  3644. CMP EAX, 8 ;
  3645. JNE loopL ;
  3646. fastmove:
3647. SHL ECX, 1 ; two doublewords per 8 byte element
3648. CLD ; incremental
3649. REP ;
3650. MOVSD ; copy 2*len doublewords at once
  3651. JMP endL ;
  3652. loopL:
  3653. CMP ECX, 0 ;
3654. JLE endL ; WHILE ECX > 0 DO
3655. MOV EAX, [ESI] ; EAX := SYSTEM.GET32(ESI)
3656. MOV [EDI], EAX ; SYSTEM.PUT32(EDI, EAX)
3657. MOV EAX, [ESI+4] ; EAX := SYSTEM.GET32(ESI+4)
3658. MOV [EDI+4], EAX ; SYSTEM.PUT32(EDI+4, EAX)
3659. ADD ESI, [EBP+linc] ; INC(ESI, linc)
3660. ADD EDI, [EBP+dinc] ; INC(EDI, dinc)
3661. DEC ECX ; DEC(ECX)
  3662. JMP loopL
  3663. endL:
  3664. END Copy8;
  3665. PROCEDURE Transpose4A( ladr, dadr: ADDRESS; lstride, linc, dstride, dinc, rows, cols: SIZE );
  3666. CODE {SYSTEM.i386}
  3667. startrows:
  3668. MOV EAX, [EBP+rows] ;
  3669. startouter:
  3670. CMP EAX, 0 ;
  3671. JLE endL ;
  3672. MOV ESI, [EBP+ladr] ;
  3673. MOV EDI, [EBP+dadr] ;
  3674. MOV EBX, [EBP+linc] ;
  3675. MOV ECX, [EBP+dstride] ;
  3676. MOV EAX, [EBP+cols] ;
  3677. startinner:
  3678. CMP EAX, 0 ;
  3679. JLE endinner ;
  3680. MOV EDX, [ESI] ;
  3681. MOV [EDI], EDX ;
  3682. ADD ESI, EBX ;
  3683. ADD EDI, ECX ;
  3684. DEC EAX ;
  3685. JMP startinner ;
  3686. endinner:
  3687. MOV ESI, [EBP+ladr] ;
  3688. ADD ESI, [EBP+lstride] ;
  3689. MOV [EBP+ladr], ESI
  3690. MOV EDI, [EBP+dadr] ;
  3691. ADD EDI, [EBP+dinc] ;
  3692. MOV [EBP+dadr], EDI ;
  3693. MOV EAX, [EBP+rows] ;
  3694. DEC EAX ;
  3695. MOV [EBP+rows], EAX ;
  3696. JMP startouter ;
  3697. endL:
  3698. END Transpose4A;
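(* Reference semantics of Transpose4A above: element (r, c) of the source, located at
ladr + r*lstride + c*linc, is copied (4 bytes) to dadr + r*dinc + c*dstride, so that source
rows become destination columns. Illustrative sketch only, not used by the module.
PROCEDURE Transpose4ARef( ladr, dadr: ADDRESS; lstride, linc, dstride, dinc, rows, cols: SIZE );
VAR r, c: SIZE; t: LONGINT;
BEGIN
	r := 0;
	WHILE r < rows DO
		c := 0;
		WHILE c < cols DO
			SYSTEM.GET( ladr + r*lstride + c*linc, t );
			SYSTEM.PUT( dadr + r*dinc + c*dstride, t );
			INC( c );
		END;
		INC( r );
	END;
END Transpose4ARef;
*)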
  3699. PROCEDURE Transpose4( ladr, dadr: ADDRESS; lstride, linc, dstride, dinc, rows, cols: SIZE );
  3700. VAR l, d, c: SIZE; BlockSize: SIZE;
  3701. BEGIN
3702. BlockSize :=
3703. MIN( L2BlockSize DIV lstride, L2BlockSize DIV dstride ); (* assumption: dstride intended as second operand *)
  3704. BlockSize := MIN( BlockSize, L2BlockSize DIV linc );
  3705. BlockSize := MIN( BlockSize, L2BlockSize DIV dinc );
  3706. BlockSize := MAX( 8, BlockSize );
  3707. (* KernelLog.String("Blocksize = "); KernelLog.Int(BlockSize,10); KernelLog.Ln; *)
  3708. WHILE (rows >= BlockSize) DO
  3709. c := cols; l := ladr; d := dadr;
  3710. WHILE (c >= BlockSize) DO
  3711. Transpose4A( l, d, lstride, linc, dstride, dinc, BlockSize,
  3712. BlockSize );
  3713. DEC( c, BlockSize ); INC( l, BlockSize * linc );
  3714. INC( d, BlockSize * dstride );
  3715. END;
  3716. IF c > 0 THEN
  3717. Transpose4A( l, d, lstride, linc, dstride, dinc, BlockSize, c );
  3718. END;
  3719. DEC( rows, BlockSize ); INC( ladr, BlockSize * lstride );
  3720. INC( dadr, BlockSize * dinc );
  3721. END;
  3722. IF (rows > 0) THEN
  3723. c := cols; l := ladr; d := dadr;
  3724. WHILE (c >= BlockSize) DO
  3725. Transpose4A( l, d, lstride, linc, dstride, dinc, rows,
  3726. BlockSize );
  3727. DEC( c, BlockSize ); INC( l, BlockSize * linc );
  3728. INC( d, BlockSize * dstride );
  3729. END;
  3730. IF c > 0 THEN
  3731. Transpose4A( l, d, lstride, linc, dstride, dinc, rows, c );
  3732. END;
  3733. END;
  3734. END Transpose4;
  3735. PROCEDURE Transpose8( ladr, dadr: ADDRESS; lstride, linc, dstride, dinc, rows, cols: SIZE );
  3736. VAR l, d, c: SIZE; BlockSize: SIZE;
  3737. BEGIN
3738. BlockSize :=
3739. MIN( L2BlockSize DIV lstride, L2BlockSize DIV dstride ); (* assumption: dstride intended as second operand *)
  3740. BlockSize := MIN( BlockSize, L2BlockSize DIV linc );
  3741. BlockSize := MIN( BlockSize, L2BlockSize DIV dinc );
  3742. BlockSize := MAX( 8, BlockSize );
  3743. (* KernelLog.String("Blocksize = "); KernelLog.Int(BlockSize,10); KernelLog.Ln; *)
  3744. WHILE (rows >= BlockSize) DO
  3745. c := cols; l := ladr; d := dadr;
  3746. WHILE (c >= BlockSize) DO
  3747. Transpose8A( l, d, lstride, linc, dstride, dinc, BlockSize,
  3748. BlockSize );
  3749. DEC( c, BlockSize ); INC( l, BlockSize * linc );
  3750. INC( d, BlockSize * dstride );
  3751. END;
  3752. IF c > 0 THEN
  3753. Transpose8A( l, d, lstride, linc, dstride, dinc, BlockSize, c );
  3754. END;
  3755. DEC( rows, BlockSize ); INC( ladr, lstride * BlockSize );
  3756. INC( dadr, dinc * BlockSize );
  3757. END;
  3758. IF (rows > 0) THEN
  3759. c := cols; l := ladr; d := dadr;
  3760. WHILE (c >= BlockSize) DO
  3761. Transpose8A( l, d, lstride, linc, dstride, dinc, rows,
  3762. BlockSize );
  3763. DEC( c, BlockSize ); INC( l, BlockSize * linc );
  3764. INC( d, BlockSize * dstride );
  3765. END;
  3766. IF c > 0 THEN
  3767. Transpose8A( l, d, lstride, linc, dstride, dinc, rows, c );
  3768. END;
  3769. END;
  3770. END Transpose8;
  3771. PROCEDURE Transpose8A( ladr, dadr: ADDRESS; lstride, linc, dstride, dinc, rows, cols: SIZE );
  3772. CODE {SYSTEM.i386}
  3773. startrows:
  3774. MOV EAX, [EBP+rows] ;
  3775. startouter:
  3776. CMP EAX, 0 ;
  3777. JLE endL ;
  3778. MOV ESI, [EBP+ladr] ;
  3779. MOV EDI, [EBP+dadr] ;
  3780. MOV EBX, [EBP+linc] ;
  3781. MOV ECX, [EBP+dstride] ;
  3782. MOV EAX, [EBP+cols] ;
  3783. startinner:
  3784. CMP EAX, 0 ;
  3785. JLE endinner ;
  3786. MOV EDX, [ESI] ;
  3787. MOV [EDI], EDX ;
  3788. MOV EDX, [ESI+4] ;
  3789. MOV [EDI+4], EDX ;
  3790. ADD ESI, EBX ;
  3791. ADD EDI, ECX ;
  3792. DEC EAX ;
  3793. JMP startinner ;
  3794. endinner:
  3795. MOV ESI, [EBP+ladr] ;
  3796. ADD ESI, [EBP+lstride] ;
  3797. MOV [EBP+ladr], ESI
  3798. MOV EDI, [EBP+dadr] ;
  3799. ADD EDI, [EBP+dinc] ;
  3800. MOV [EBP+dadr], EDI ;
  3801. MOV EAX, [EBP+rows] ;
  3802. DEC EAX ;
  3803. MOV [EBP+rows], EAX ;
  3804. JMP startouter ;
  3805. endL:
  3806. END Transpose8A;
  3807. PROCEDURE SSEMul24BlockR( VAR CbFirst: SIZE;
  3808. StrideA, StrideB, StrideC, Ca, Ra, Cb, Rb: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3809. add: BOOLEAN );
  3810. CODE {SYSTEM.i386, SYSTEM.SSE}
  3811. MatrixOfResultsSetup:
  3812. MOV ECX, 0 ; counter FOR rows IN A-Ra
  3813. RowOfResultsLoop:
  3814. MOV EBX, 0 ; counter FOR columns IN B-Cb
  3815. DotProductSetup:
  3816. MOV ESI, [EBP+matrixA] ; matrixA
  3817. MOV EDI, [EBP+matrixB] ; matrixB
  3818. LEA EDI, [EDI+EBX*4] ; current position IN matrixB
  3819. XORPS XMM2, XMM2
  3820. XORPS XMM3, XMM3
  3821. XORPS XMM4, XMM4
  3822. XORPS XMM5, XMM5
  3823. XORPS XMM6, XMM6
  3824. XORPS XMM7, XMM7
  3825. MOV EAX, 0 ;
  3826. MOV AL, [EBP+add] ;
  3827. CMP AL, 0 ; add?
  3828. JE DotProductLoop ;
  3829. MOV EAX, [EBP+matrixC] ; matrixC
  3830. LEA EAX, [EAX+EBX*4] ; adjust POINTER horizontally TO correct batch OF 24
  3831. MOVUPS XMM2, [EAX]
  3832. MOVUPS XMM3, [EAX+16]
  3833. MOVUPS XMM4, [EAX+32]
  3834. MOVUPS XMM5, [EAX+48]
  3835. MOVUPS XMM6, [EAX+64]
  3836. MOVUPS XMM7, [EAX+80]
  3837. MOV EAX, 0
  3838. DotProductLoop:
3839. MOV EDX, [ESI+EAX*4] ; load the bit pattern OF the current element OF matrixA
3840. SHL EDX, 1 ; shift out the sign bit, so +0.0 and -0.0 both become 0
3841. CMP EDX, 0
3842. JE SparseEntryEscape ; skip the multiplications FOR zero entries
  3843. MOVSS XMM0, [ESI+EAX*4]
  3844. SHUFPS XMM0, XMM0, 0H
  3845. MOVUPS XMM1, [EDI]
  3846. MULPS XMM1, XMM0
  3847. ADDPS XMM2, XMM1
  3848. MOVUPS XMM1, [EDI+16]
  3849. MULPS XMM1, XMM0
  3850. ADDPS XMM3, XMM1
  3851. MOVUPS XMM1, [EDI+32]
  3852. MULPS XMM1, XMM0
  3853. ADDPS XMM4, XMM1
  3854. MOVUPS XMM1, [EDI+48]
  3855. MULPS XMM1, XMM0
  3856. ADDPS XMM5, XMM1
  3857. MOVUPS XMM1, [EDI+64]
  3858. MULPS XMM1, XMM0
  3859. ADDPS XMM6, XMM1
  3860. MOVUPS XMM1, [EDI+80]
  3861. MULPS XMM1, XMM0
  3862. ADDPS XMM7, XMM1
  3863. SparseEntryEscape:
  3864. ADD EDI, [EBP+StrideB] ; StrideB
  3865. INC EAX
  3866. CMP EAX, [EBP+Ca] ; Ca, could also compare TO Rb since they must be equal
  3867. JL DotProductLoop
3868. ; end DotProductLoop
  3869. MOV EAX, [EBP+matrixC] ; matrixC
  3870. LEA EAX, [EAX+EBX*4] ; adjust POINTER horizontally TO correct batch OF 24
  3871. MOVUPS [EAX], XMM2
  3872. MOVUPS [EAX+16], XMM3
  3873. MOVUPS [EAX+32], XMM4
  3874. MOVUPS [EAX+48], XMM5
  3875. MOVUPS [EAX+64], XMM6
  3876. MOVUPS [EAX+80], XMM7
  3877. ADD EBX, 24 ; move over TO next batch OF 24
  3878. MOV EDX, EBX
  3879. ADD EDX, 24
  3880. CMP EDX, [EBP+Cb] ; Cb, check TO see IF row IS complete
  3881. JLE DotProductSetup
  3882. ; endL RowOfResultsLoop
  3883. MOV EAX, [EBP+matrixA] ; matrixA
  3884. ADD EAX, [EBP+StrideA] ; StrideA
  3885. MOV [EBP+matrixA], EAX ; matrixA
  3886. MOV EAX, [EBP+matrixC] ; matrixC
  3887. ADD EAX, [EBP+StrideC] ; StrideC
  3888. MOV [EBP+matrixC], EAX ; matrixC
  3889. INC ECX
  3890. CMP ECX, [EBP+Ra] ; Ra
  3891. JL RowOfResultsLoop
  3892. Done:
  3893. MOV EAX, [EBP+CbFirst] ; CbFirst
  3894. MOV [EAX], EBX ;
  3895. END SSEMul24BlockR;
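(* Scalar reference of the update performed by SSEMul24BlockR above, ignoring the batching into
groups of 24 columns, the CbFirst result and the zero-entry skip: for every row i of A and every
column j of B, C[i, j] (+):= sum over k of A[i, k] * B[k, j]. Strides are byte strides of one
matrix row; element type is REAL. Names are illustrative only.
PROCEDURE Mul24Ref( StrideA, StrideB, StrideC, Ca, Ra, Cb: SIZE; matrixA, matrixB, matrixC: ADDRESS; add: BOOLEAN );
VAR i, j, k: SIZE; a, b, c: REAL;
BEGIN
	FOR i := 0 TO Ra - 1 DO
		FOR j := 0 TO Cb - 1 DO
			IF add THEN SYSTEM.GET( matrixC + i*StrideC + j*4, c ) ELSE c := 0 END;
			FOR k := 0 TO Ca - 1 DO
				SYSTEM.GET( matrixA + i*StrideA + k*4, a );
				SYSTEM.GET( matrixB + k*StrideB + j*4, b );
				c := c + a * b;
			END;
			SYSTEM.PUT( matrixC + i*StrideC + j*4, c );
		END;
	END;
END Mul24Ref;
*)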
3896. (*! might be better to make a 10Block operation and utilize 2 registers for temporary calculations, see article about Emmerald*)
  3897. PROCEDURE SSEMul12BlockX( VAR CbFirst: SIZE;
  3898. StrideA, StrideB, StrideC, Ca, Ra, Cb, Rb: SIZE; matrixA, matrixB, matrixC :ADDRESS;
  3899. add: BOOLEAN );
  3900. CODE {SYSTEM.i386, SYSTEM.SSE2}
  3901. MatrixOfResultsSetup:
  3902. MOV ECX, 0 ; counter FOR rows IN A-Ra
  3903. RowOfResultsLoop:
  3904. MOV EBX, 0 ; counter FOR columns IN B-Cb
  3905. DotProductSetup:
  3906. MOV ESI, [EBP+matrixA] ; matrixA
  3907. MOV EDI, [EBP+matrixB] ; matrixB
  3908. LEA EDI, [EDI+EBX*8]
  3909. XORPD XMM2, XMM2
  3910. XORPD XMM3, XMM3
  3911. XORPD XMM4, XMM4
  3912. XORPD XMM5, XMM5
  3913. XORPD XMM6, XMM6
  3914. XORPD XMM7, XMM7
  3915. MOV EAX, 0 ;
  3916. MOV AL, [EBP+add] ;
  3917. CMP AL, 0 ; add?
  3918. JE DotProductLoop ;
  3919. MOV EAX, [EBP+matrixC] ; matrixC
  3920. LEA EAX, [EAX+EBX*8] ; adjust POINTER horizontally TO correct batch OF 12
  3921. MOVUPD XMM2, [EAX]
  3922. MOVUPD XMM3, [EAX+16]
  3923. MOVUPD XMM4, [EAX+32]
  3924. MOVUPD XMM5, [EAX+48]
  3925. MOVUPD XMM6, [EAX+64]
  3926. MOVUPD XMM7, [EAX+80]
  3927. MOV EAX, 0
  3928. DotProductLoop:
  3929. ; MOV EDX, [ESI+EAX*8]
  3930. ; SHL EDX, 1
  3931. ; CMP EDX, 0
  3932. ; JE SparseEntryEscape
  3933. MOVSD XMM0, [ESI+EAX*8]
  3934. SHUFPD XMM0, XMM0, 0H
  3935. MOVUPD XMM1, [EDI]
  3936. MULPD XMM1, XMM0
  3937. ADDPD XMM2, XMM1
  3938. MOVUPD XMM1, [EDI+16]
  3939. MULPD XMM1, XMM0
  3940. ADDPD XMM3, XMM1
  3941. MOVUPD XMM1, [EDI+32]
  3942. MULPD XMM1, XMM0
  3943. ADDPD XMM4, XMM1
  3944. MOVUPD XMM1, [EDI+48]
  3945. MULPD XMM1, XMM0
  3946. ADDPD XMM5, XMM1
  3947. MOVUPD XMM1, [EDI+64]
  3948. MULPD XMM1, XMM0
  3949. ADDPD XMM6, XMM1
  3950. MOVUPD XMM1, [EDI+80]
  3951. MULPD XMM1, XMM0
  3952. ADDPD XMM7, XMM1
  3953. SparseEntryEscape:
  3954. ADD EDI, [EBP+StrideB] ; StrideB
  3955. INC EAX
  3956. CMP EAX, [EBP+Ca] ; Ca, could also compare TO Rb since they must be equal
  3957. JL DotProductLoop ; endL DopProductLoop
  3958. MOV EAX , [EBP+matrixC] ; matrixC
3959. LEA EAX, [EAX+EBX*8] ; adjust POINTER horizontally TO correct batch OF 12
  3960. MOVUPD [EAX], XMM2
  3961. MOVUPD [EAX+16], XMM3
  3962. MOVUPD [EAX+32], XMM4
  3963. MOVUPD [EAX+48], XMM5
  3964. MOVUPD [EAX+64], XMM6
  3965. MOVUPD [EAX+80], XMM7
  3966. ADD EBX, 12 ; move over TO next batch OF 12
  3967. MOV EDX, EBX
  3968. ADD EDX, 12
  3969. CMP EDX, [EBP+Cb] ; Cb, check TO see IF row IS complete
  3970. JLE DotProductSetup ; end RowOfResultsLoop
  3971. MOV EAX , [EBP+matrixA] ; matrixA
  3972. ADD EAX, [EBP+StrideA] ; StrideA
  3973. MOV [EBP+matrixA], EAX ; matrixA
  3974. MOV EAX, [EBP+matrixC] ; matrixC
  3975. ADD EAX, [EBP+StrideC] ; StrideC
  3976. MOV [EBP+matrixC], EAX ; matrixC
  3977. INC ECX
  3978. CMP ECX, [EBP+Ra] ; Ra
  3979. JL RowOfResultsLoop
  3980. Done:
  3981. MOV EAX, [EBP+CbFirst] ; CbFirst
  3982. MOV [EAX], EBX ;
  3983. END SSEMul12BlockX;
  3984. PROCEDURE SSEMul16BlockR( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3985. add: BOOLEAN );
  3986. CODE {SYSTEM.i386, SYSTEM.SSE}
  3987. MOV ECX, 0 ; counter FOR rows IN A-Ra
  3988. DotProductSetup:
  3989. MOV ESI, [EBP+matrixA] ; matrixA
  3990. MOV EDI, [EBP+matrixB] ; matrixB
  3991. MOV EDX, [EBP+CbFrom] ; CbFrom
  3992. LEA EDI, [EDI+EDX*4]
  3993. XORPS XMM2, XMM2
  3994. XORPS XMM3, XMM3
  3995. XORPS XMM4, XMM4
  3996. XORPS XMM5, XMM5
  3997. MOV EAX, 0 ;
  3998. MOV AL, [EBP+add] ;
  3999. CMP AL, 0 ; add?
  4000. JE DotProductLoop ;
  4001. MOV EAX, [EBP+matrixC] ; matrixC
  4002. LEA EAX, [EAX+EDX*4] ; adjust POINTER horizontally
  4003. MOVUPS XMM2, [EAX]
  4004. MOVUPS XMM3, [EAX+16]
  4005. MOVUPS XMM4, [EAX+32]
  4006. MOVUPS XMM5, [EAX+48]
  4007. MOV EAX, 0
  4008. DotProductLoop:
  4009. MOV EDX, [ESI+EAX*4]
  4010. SHL EDX, 1
  4011. CMP EDX, 0
  4012. JE SparseEntryEscape
  4013. MOVSS XMM0, [ESI+EAX*4]
  4014. SHUFPS XMM0, XMM0, 0H
  4015. MOVUPS XMM1, [EDI]
  4016. MULPS XMM1, XMM0
  4017. ADDPS XMM2, XMM1
  4018. MOVUPS XMM1, [EDI+16]
  4019. MULPS XMM1, XMM0
  4020. ADDPS XMM3, XMM1
  4021. MOVUPS XMM1, [EDI+32]
  4022. MULPS XMM1, XMM0
  4023. ADDPS XMM4, XMM1
  4024. MOVUPS XMM1, [EDI+48]
  4025. MULPS XMM1, XMM0
  4026. ADDPS XMM5, XMM1
  4027. SparseEntryEscape:
  4028. ADD EDI, [EBP+StrideB] ; StrideB
  4029. INC EAX
  4030. CMP EAX, [EBP+Ca] ; Ca
  4031. JL DotProductLoop ; end DotProductLoop
  4032. MOV EAX , [EBP+matrixC] ; matrixC
4033. MOV EDX, [EBP+CbFrom] ; CbFrom
4034. LEA EAX, [EAX+EDX*4] ; adjust POINTER horizontally TO correct batch OF 16
  4035. MOVUPS [EAX], XMM2
  4036. MOVUPS [EAX+16], XMM3
  4037. MOVUPS [EAX+32], XMM4
  4038. MOVUPS [EAX+48], XMM5
  4039. MOV EAX, [EBP+matrixA] ; matrixA
  4040. ADD EAX, [EBP+StrideA] ; StrideA
  4041. MOV [EBP+matrixA], EAX ; matrixA
  4042. MOV EAX, [EBP+matrixC] ; matrixC
  4043. ADD EAX, [EBP+StrideC] ; StrideC
  4044. MOV [EBP+matrixC], EAX ; matrixC
  4045. INC ECX
  4046. CMP ECX, [EBP+Ra] ; Ra
  4047. JL DotProductSetup ;
  4048. END SSEMul16BlockR;
  4049. PROCEDURE SSEMul8BlockX( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  4050. add: BOOLEAN );
  4051. CODE {SYSTEM.i386, SYSTEM.SSE2}
  4052. MOV ECX, 0 ; counter FOR rows IN A-Ra
  4053. DotProductSetup:
  4054. MOV ESI, [EBP+matrixA] ; matrixA
  4055. MOV EDI, [EBP+matrixB] ; matrixB
  4056. MOV EDX, [EBP+CbFrom] ; CbFrom
  4057. LEA EDI, [EDI+EDX*8]
  4058. XORPD XMM2, XMM2
  4059. XORPD XMM3, XMM3
  4060. XORPD XMM4, XMM4
  4061. XORPD XMM5, XMM5
  4062. MOV EAX, 0 ;
  4063. MOV AL, [EBP+add] ;
  4064. CMP AL, 0 ; add?
  4065. JE DotProductLoop ;
  4066. MOV EAX, [EBP+matrixC] ; matrixC
4067. LEA EAX, [EAX+EDX*8] ; adjust POINTER horizontally TO correct batch OF 8
  4068. MOVUPD XMM2, [EAX]
  4069. MOVUPD XMM3, [EAX+16]
  4070. MOVUPD XMM4, [EAX+32]
  4071. MOVUPD XMM5, [EAX+48]
  4072. MOV EAX, 0
  4073. DotProductLoop:
  4074. ; MOV EDX, [ESI+EAX*8]
  4075. ; SHL EDX, 1
  4076. ; CMP EDX, 0
  4077. ; JE SparseEntryEscape
  4078. MOVSD XMM0, [ESI+EAX*8]
  4079. SHUFPD XMM0, XMM0, 0H
  4080. MOVUPD XMM1, [EDI]
  4081. MULPD XMM1, XMM0
  4082. ADDPD XMM2, XMM1
  4083. MOVUPD XMM1, [EDI+16]
  4084. MULPD XMM1, XMM0
  4085. ADDPD XMM3, XMM1
  4086. MOVUPD XMM1, [EDI+32]
  4087. MULPD XMM1, XMM0
  4088. ADDPD XMM4, XMM1
  4089. MOVUPD XMM1, [EDI+48]
  4090. MULPD XMM1, XMM0
  4091. ADDPD XMM5, XMM1
  4092. SparseEntryEscape:
  4093. ADD EDI, [EBP+StrideB] ; StrideB
  4094. INC EAX
  4095. CMP EAX, [EBP+Ca] ; Ca
  4096. JL DotProductLoop ; end DotProductLoop
  4097. MOV EAX , [EBP+matrixC] ; matrixC
4098. MOV EDX, [EBP+CbFrom] ; CbFrom
4099. LEA EAX, [EAX+EDX*8] ; adjust POINTER horizontally TO correct batch OF 8
  4100. MOVUPD [EAX], XMM2
  4101. MOVUPD [EAX+16], XMM3
  4102. MOVUPD [EAX+32], XMM4
  4103. MOVUPD [EAX+48], XMM5
  4104. MOV EAX, [EBP+matrixA] ; matrixA
  4105. ADD EAX, [EBP+StrideA] ; StrideA
  4106. MOV [EBP+matrixA], EAX ; matrixA
  4107. MOV EAX, [EBP+matrixC] ; matrixC
  4108. ADD EAX, [EBP+StrideC] ; StrideC
  4109. MOV [EBP+matrixC], EAX ; matrixC
  4110. INC ECX
  4111. CMP ECX, [EBP+Ra] ; Ra
  4112. JL DotProductSetup ;
  4113. END SSEMul8BlockX;
  4114. PROCEDURE SSEMul8BlockR( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  4115. add: BOOLEAN );
  4116. CODE {SYSTEM.i386, SYSTEM.SSE}
  4117. MOV ECX, 0 ; counter FOR rows IN A-Ra
  4118. DotProductSetup:
  4119. MOV ESI, [EBP+matrixA] ; matrixA
  4120. MOV EDI, [EBP+matrixB] ; matrixB
  4121. MOV EDX, [EBP+CbFrom] ; CbFrom
  4122. LEA EDI, [EDI+EDX*4]
  4123. XORPS XMM2, XMM2
  4124. XORPS XMM3, XMM3
  4125. MOV EAX, 0 ;
  4126. MOV AL, [EBP+add] ;
  4127. CMP AL, 0 ; add?
  4128. JE DotProductLoop ;
  4129. MOV EAX, [EBP+matrixC] ; matrixC
  4130. LEA EAX, [EAX+EDX*4] ; adjust POINTER horizontally TO correct batch OF 8
  4131. MOVUPS XMM2, [EAX]
  4132. MOVUPS XMM3, [EAX+16]
  4133. MOV EAX, 0
  4134. DotProductLoop:
  4135. MOV EDX, [ESI+EAX*4]
  4136. SHL EDX, 1
  4137. CMP EDX, 0
  4138. JE SparseEntryEscape
  4139. MOVSS XMM0, [ESI+EAX*4]
  4140. SHUFPS XMM0, XMM0, 0H
  4141. MOVUPS XMM1, [EDI]
  4142. MULPS XMM1, XMM0
  4143. ADDPS XMM2, XMM1
  4144. MOVUPS XMM1, [EDI+16]
  4145. MULPS XMM1, XMM0
  4146. ADDPS XMM3, XMM1
  4147. SparseEntryEscape:
  4148. ADD EDI, [EBP+StrideB] ; StrideB
  4149. INC EAX
  4150. CMP EAX, [EBP+Ca] ; Ca
  4151. JL DotProductLoop ; end DotProductLoop
  4152. MOV EAX , [EBP+matrixC] ; matrixC
  4153. MOV EDX, [EBP+CbFrom] ; CbFrom
  4154. LEA EAX, [EAX+EDX*4] ; adjust POINTER horizontally TO correct batch OF 8
  4155. MOVUPS [EAX], XMM2
  4156. MOVUPS [EAX+16], XMM3
  4157. MOV EAX, [EBP+matrixA] ; matrixA
  4158. ADD EAX, [EBP+StrideA] ; StrideA
  4159. MOV [EBP+matrixA], EAX ; matrixA
  4160. MOV EAX, [EBP+matrixC] ; matrixC
  4161. ADD EAX, [EBP+StrideC] ; StrideC
  4162. MOV [EBP+matrixC], EAX ; matrixC
  4163. INC ECX
  4164. CMP ECX, [EBP+Ra] ; Ra
  4165. JL DotProductSetup ;
  4166. END SSEMul8BlockR;
  4167. PROCEDURE SSEMul4BlockX( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  4168. add: BOOLEAN );
  4169. CODE {SYSTEM.i386, SYSTEM.SSE2}
  4170. MOV ECX, 0 ; counter FOR rows IN A-Ra
  4171. DotProductSetup:
  4172. MOV EAX, 0 ; cols IN A
  4173. MOV ESI, [EBP+matrixA] ; matrixA
  4174. MOV EDI, [EBP+matrixB] ; matrixB
  4175. MOV EDX, [EBP+CbFrom] ; CbFrom
  4176. LEA EDI, [EDI+EDX*8]
  4177. XORPS XMM2, XMM2
  4178. XORPS XMM3, XMM3
  4179. MOV EAX, 0 ;
  4180. MOV AL, [EBP+add] ;
  4181. CMP AL, 0 ; add?
  4182. JE DotProductLoop ;
  4183. MOV EAX, [EBP+matrixC] ; matrixC
4184. LEA EAX, [EAX+EDX*8] ; adjust POINTER horizontally TO correct batch OF 4
  4185. MOVUPD XMM2, [EAX]
  4186. MOVUPD XMM3, [EAX+16]
  4187. MOV EAX, 0
  4188. DotProductLoop:
  4189. ; MOV EDX, [ESI+EAX*8]
  4190. ; SHL EDX, 1
  4191. ; CMP EDX, 0
  4192. ; JE SparseEntryEscape
  4193. MOVSD XMM0, [ESI+EAX*8]
  4194. SHUFPD XMM0, XMM0, 0H
  4195. MOVUPD XMM1, [EDI]
  4196. MULPD XMM1, XMM0
  4197. ADDPD XMM2, XMM1
  4198. MOVUPD XMM1, [EDI+16]
  4199. MULPD XMM1, XMM0
  4200. ADDPD XMM3, XMM1
  4201. SparseEntryEscape:
  4202. ADD EDI, [EBP+StrideB] ; StrideB
  4203. INC EAX
  4204. CMP EAX, [EBP+Ca] ; Ca
  4205. JL DotProductLoop ; end DotProductLoop
  4206. MOV EAX , [EBP+matrixC] ; matrixC
  4207. MOV EDX, [EBP+CbFrom] ; CbFrom
4208. LEA EAX, [EAX+EDX*8] ; adjust POINTER horizontally TO correct batch OF 4
  4209. MOVUPD [EAX], XMM2
  4210. MOVUPD [EAX+16], XMM3
  4211. MOV EAX, [EBP+matrixA] ; matrixA
  4212. ADD EAX, [EBP+StrideA] ; StrideA
  4213. MOV [EBP+matrixA], EAX ; matrixA
  4214. MOV EAX, [EBP+matrixC] ; matrixC
  4215. ADD EAX, [EBP+StrideC] ; StrideC
  4216. MOV [EBP+matrixC], EAX ; matrixC
  4217. INC ECX
  4218. CMP ECX, [EBP+Ra] ; Ra
  4219. JL DotProductSetup ;
  4220. END SSEMul4BlockX;
  4221. PROCEDURE SSEMul4BlockR( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  4222. add: BOOLEAN );
  4223. CODE {SYSTEM.i386, SYSTEM.SSE}
  4224. MOV ECX, 0 ; counter FOR rows IN A-Ra
  4225. DotProductSetup:
  4226. MOV EAX, 0 ; cols IN A
  4227. MOV ESI, [EBP+matrixA] ; matrixA
  4228. MOV EDI, [EBP+matrixB] ; matrixB
  4229. MOV EDX, [EBP+CbFrom] ; CbFrom
  4230. LEA EDI, [EDI+EDX*4]
  4231. XORPS XMM2, XMM2
  4232. MOV EAX, 0 ;
  4233. MOV AL, [EBP+add] ;
  4234. CMP AL, 0 ; add?
  4235. JE DotProductLoop ;
  4236. MOV EAX, [EBP+matrixC] ; matrixC
  4237. LEA EAX, [EAX+EDX*4] ; adjust POINTER horizontally TO correct batch OF 4
  4238. MOVUPS XMM2, [EAX]
  4239. MOV EAX, 0
  4240. DotProductLoop:
  4241. MOV EDX, [ESI+EAX*4]
  4242. SHL EDX, 1
  4243. CMP EDX, 0
  4244. JE SparseEntryEscape
  4245. MOVSS XMM0, [ESI+EAX*4]
  4246. SHUFPS XMM0, XMM0, 0H
  4247. MOVUPS XMM1, [EDI]
  4248. MULPS XMM1, XMM0
  4249. ADDPS XMM2, XMM1
  4250. SparseEntryEscape:
  4251. ADD EDI, [EBP+StrideB] ; StrideB
  4252. INC EAX
  4253. CMP EAX, [EBP+Ca] ; Ca
  4254. JL DotProductLoop ; end DopProductLoop
  4255. MOV EAX, [EBP+matrixC] ; matrixC
  4256. MOV EDX, [EBP+CbFrom] ; CbFrom
  4257. LEA EAX, [EAX+EDX*4] ; adjust POINTER horizontally TO correct batch OF 4
  4258. MOVUPS [EAX], XMM2
  4259. MOV EAX, [EBP+matrixA] ; matrixA
  4260. ADD EAX, [EBP+StrideA] ; StrideA
  4261. MOV [EBP+matrixA], EAX ; matrixA
  4262. MOV EAX, [EBP+matrixC] ; matrixC
  4263. ADD EAX, [EBP+StrideC] ; StrideC
  4264. MOV [EBP+matrixC], EAX ; matrixC
  4265. INC ECX
  4266. CMP ECX, [EBP+Ra] ; Ra
  4267. JL DotProductSetup ;
  4268. END SSEMul4BlockR;
  4269. PROCEDURE SSEMul2BlockX( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  4270. add: BOOLEAN );
  4271. CODE {SYSTEM.i386, SYSTEM.SSE2}
  4272. MOV ECX, 0 ; counter FOR rows IN A-Ra
  4273. DotProductSetup:
  4274. MOV EAX, 0 ; cols IN A
  4275. MOV ESI, [EBP+matrixA] ; matrixA
  4276. MOV EDI, [EBP+matrixB] ; matrixB
  4277. MOV EDX, [EBP+CbFrom] ; CbFrom
  4278. LEA EDI, [EDI+EDX*8]
  4279. XORPD XMM2, XMM2
  4280. MOV EAX, 0 ;
  4281. MOV AL, [EBP+add] ;
  4282. CMP AL, 0 ; add?
  4283. JE DotProductLoop ;
  4284. MOV EAX, [EBP+matrixC] ; matrixC
4285. LEA EAX, [EAX+EDX*8] ; adjust POINTER horizontally TO correct batch OF 2
  4286. MOVUPD XMM2, [EAX]
  4287. MOV EAX, 0
  4288. DotProductLoop:
  4289. ; MOV EDX, [ESI+EAX*4] ;
  4290. ; SHL EDX, 1 ;
  4291. ; CMP EDX, 0
  4292. ; JE SparseEntryEscape
  4293. MOVSD XMM0, [ESI+EAX*8]
  4294. SHUFPD XMM0, XMM0, 0H
  4295. MOVUPD XMM1, [EDI]
  4296. MULPD XMM1, XMM0
  4297. ADDPD XMM2, XMM1
  4298. SparseEntryEscape:
  4299. ADD EDI, [EBP+StrideB] ; StrideB
  4300. INC EAX
  4301. CMP EAX, [EBP+Ca] ; Ca
  4302. JL DotProductLoop ; end DotProductLoop
  4303. MOV EAX , [EBP+matrixC] ; matrixC
  4304. MOV EDX, [EBP+CbFrom] ; CbFrom
4305. LEA EAX, [EAX+EDX*8] ; adjust POINTER horizontally TO correct batch OF 2
  4306. MOVUPD [EAX], XMM2
  4307. MOV EAX, [EBP+matrixA] ; matrixA
  4308. ADD EAX, [EBP+StrideA] ; StrideA
  4309. MOV [EBP+matrixA], EAX ; matrixA
  4310. MOV EAX, [EBP+matrixC] ; matrixC
  4311. ADD EAX, [EBP+StrideC] ; StrideC
  4312. MOV [EBP+matrixC], EAX ; matrixC
  4313. INC ECX
  4314. CMP ECX, [EBP+Ra] ; Ra
  4315. JL DotProductSetup ;
  4316. END SSEMul2BlockX;
  4317. (****** blocking matrix multiplication with copy of data ******)
  4318. PROCEDURE MagicBlockR( M, N, K: SIZE;
  4319. VAR L2BlockM, L2BlockN, L2BlockK: SIZE );
  4320. BEGIN
  4321. K := (K DIV L0BlockKR) * L0BlockKR;
  4322. N := (N DIV L1BlockN) * L1BlockN;
  4323. IF M = 0 THEN M := 1 END;
  4324. IF N = 0 THEN N := 1 END;
  4325. IF K = 0 THEN K := 1 END;
  4326. L2BlockK :=
  4327. K DIV ((K + L1MaxBlockKR - 1) DIV L1MaxBlockKR);
  4328. (* Round up to next multiple of 16 *)
  4329. L2BlockK := L2BlockK + (-L2BlockK) MOD 16;
  4330. L2BlockN :=
  4331. L2BlockSize DIV SIZEOF( REAL ) DIV
  4332. (L2BlockK * (L2BARatio + 1));
  4333. IF L2BlockN > N THEN L2BlockN := N
  4334. ELSIF L2BlockN < 1 THEN L2BlockN := 1;
  4335. END;
  4336. L2BlockM :=
  4337. (L2BlockSize DIV SIZEOF( REAL ) - L2BlockN * L2BlockK) DIV
  4338. L2BlockK;
4339. (* Clamp L2BlockM to the range [1, M]; L2BlockN is rounded up to a multiple of L1BlockN below *)
  4340. IF L2BlockM > M THEN L2BlockM := M
  4341. ELSIF L2BlockM < 1 THEN L2BlockM := 1
  4342. END;
  4343. L2BlockN := L2BlockN + (-L2BlockN) MOD L1BlockN;
  4344. END MagicBlockR;
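(* Worked example of the rounding used in MagicBlockR and MagicBlockX: x + (-x) MOD m is the
smallest multiple of m that is >= x, because Oberon's MOD always yields a result in [0, m).
For instance 100 + (-100) MOD 16 = 100 + 12 = 112. The same identity rounds L2BlockN up to a
multiple of L1BlockN. *)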
  4345. PROCEDURE MagicBlockX( M, N, K: SIZE;
  4346. VAR L2BlockM, L2BlockN, L2BlockK:SIZE );
  4347. BEGIN
  4348. K := (K DIV L0BlockKX) * L0BlockKX;
  4349. N := (N DIV L1BlockN) * L1BlockN;
  4350. IF M = 0 THEN M := 1 END;
  4351. IF N = 0 THEN N := 1 END;
  4352. IF K = 0 THEN K := 1 END;
  4353. L2BlockK :=
  4354. K DIV ((K + L1MaxBlockKX - 1) DIV L1MaxBlockKX);
  4355. (* Round up to next multiple of 16 *)
  4356. L2BlockK := L2BlockK + (-L2BlockK) MOD 16;
  4357. L2BlockN :=
  4358. L2BlockSize DIV SIZEOF( LONGREAL ) DIV
  4359. (L2BlockK * (L2BARatio + 1));
  4360. IF L2BlockN > N THEN L2BlockN := N END;
  4361. L2BlockM :=
  4362. (L2BlockSize DIV SIZEOF( LONGREAL ) - L2BlockN * L2BlockK) DIV
  4363. L2BlockK;
4364. (* Clamp L2BlockM to the range [1, M]; L2BlockN is rounded up to a multiple of L1BlockN below *)
  4365. IF L2BlockM > M THEN L2BlockM := M
  4366. ELSIF L2BlockM < 1 THEN L2BlockM := 1
  4367. END;
  4368. L2BlockN := L2BlockN + (-L2BlockN) MOD L1BlockN;
  4369. END MagicBlockX;
  4370. (*
  4371. PROCEDURE L1Block1X( adrA, adrB, adrC, K: LONGINT ); (* condition: K MOD 4 = 0 *)
  4372. VAR reg: ARRAY 8 OF ARRAY 2 OF LONGREAL;
  4373. PROCEDURE null( i: LONGINT );
  4374. BEGIN
  4375. reg[i, 0] := 0; reg[i, 1] := 0;
  4376. END null;
  4377. PROCEDURE get1( adr, i: LONGINT );
  4378. BEGIN
  4379. SYSTEM.GET( adr, reg[i, 0] );
  4380. IF debug THEN KernelLog.String( "get 1: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 ); KernelLog.Ln; END;
  4381. END get1;
  4382. PROCEDURE get2( adr, i: LONGINT );
  4383. BEGIN
  4384. SYSTEM.GET( adr, reg[i, 0] ); SYSTEM.GET( adr + 8, reg[i, 1] );
  4385. IF debug THEN
  4386. KernelLog.String( "get 2: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 );
  4387. KernelLog.Int( ENTIER( reg[i, 1] + 0.5 ), 5 ); KernelLog.Ln;
  4388. END;
  4389. END get2;
  4390. PROCEDURE mul2( i, j: LONGINT );
  4391. BEGIN
  4392. reg[i, 0] := reg[i, 0] * reg[j, 0]; reg[i, 1] := reg[i, 1] * reg[j, 1];
  4393. END mul2;
  4394. PROCEDURE add2( i, j: LONGINT );
  4395. BEGIN
  4396. reg[i, 0] := reg[i, 0] + reg[j, 0]; reg[i, 1] := reg[i, 1] + reg[j, 1];
  4397. END add2;
  4398. PROCEDURE put1( adr, i: LONGINT );
  4399. BEGIN
  4400. SYSTEM.PUT( adr, reg[i, 0] );
  4401. END put1;
  4402. PROCEDURE horadd( i: LONGINT );
  4403. BEGIN
  4404. reg[i, 0] := reg[i, 0] + reg[i, 1];
  4405. END horadd;
  4406. BEGIN
  4407. IF debug THEN KernelLog.String( "L1Block1" ); KernelLog.Ln; END;
  4408. null( 2 ); get1( adrC, 2 );
  4409. WHILE (K > 0) DO (* padding guaranteed *)
  4410. get2( adrA, 7 ); get2( adrB, 0 ); mul2( 0, 7 ); add2( 2, 0 ); INC( adrB, 16 );
  4411. INC( adrA, 16 ); DEC( K, 2 );
  4412. END;
  4413. horadd( 2 ); put1( adrC, 2 );
  4414. END L1Block1X;
  4415. PROCEDURE L1Block5X( adrA, adrB, adrC, IncC, K: LONGINT ); (* condition: K MOD 4 = 0 *)
  4416. VAR reg: ARRAY 8 OF ARRAY 2 OF LONGREAL;
  4417. PROCEDURE null( i: LONGINT );
  4418. BEGIN
  4419. reg[i, 0] := 0; reg[i, 1] := 0;
  4420. END null;
  4421. PROCEDURE get1( adr, i: LONGINT );
  4422. BEGIN
  4423. SYSTEM.GET( adr, reg[i, 0] );
  4424. IF debug THEN KernelLog.String( "get 1: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 ); KernelLog.Ln; END;
  4425. END get1;
  4426. PROCEDURE get2( adr, i: LONGINT );
  4427. BEGIN
  4428. SYSTEM.GET( adr, reg[i, 0] ); SYSTEM.GET( adr + 8, reg[i, 1] );
  4429. IF debug THEN
  4430. KernelLog.String( "get 2: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 );
  4431. KernelLog.Int( ENTIER( reg[i, 1] + 0.5 ), 5 ); KernelLog.Ln;
  4432. END;
  4433. END get2;
  4434. PROCEDURE mul2( i, j: LONGINT );
  4435. BEGIN
  4436. reg[i, 0] := reg[i, 0] * reg[j, 0]; reg[i, 1] := reg[i, 1] * reg[j, 1];
  4437. END mul2;
  4438. PROCEDURE add2( i, j: LONGINT );
  4439. BEGIN
  4440. reg[i, 0] := reg[i, 0] + reg[j, 0]; reg[i, 1] := reg[i, 1] + reg[j, 1];
  4441. END add2;
  4442. PROCEDURE put1( adr, i: LONGINT );
  4443. BEGIN
  4444. SYSTEM.PUT( adr, reg[i, 0] );
  4445. END put1;
  4446. PROCEDURE horadd( i: LONGINT );
  4447. BEGIN
  4448. reg[i, 0] := reg[i, 0] + reg[i, 1];
  4449. END horadd;
  4450. BEGIN
  4451. IF debug THEN KernelLog.String( "L1Block5" ); KernelLog.Ln; END;
  4452. null( 2 ); null( 3 ); null( 4 ); null( 5 ); null( 6 ); get1( adrC, 2 );
  4453. get1( adrC + IncC, 3 ); get1( adrC + 2 * IncC, 4 ); get1( adrC + 3 * IncC, 5 );
  4454. get1( adrC + 4 * IncC, 6 );
  4455. WHILE (K > 0) DO (* padding guaranteed *)
  4456. get2( adrA, 7 ); get2( adrB, 0 ); mul2( 0, 7 ); add2( 2, 0 );
  4457. get2( adrB + 16, 0 ); mul2( 0, 7 ); add2( 3, 0 ); get2( adrB + 32, 0 );
  4458. mul2( 0, 7 ); add2( 4, 0 ); get2( adrB + 48, 0 ); mul2( 0, 7 );
  4459. add2( 5, 0 ); get2( adrB + 64, 0 ); mul2( 0, 7 ); add2( 6, 0 ); INC( adrB, 80 );
  4460. INC( adrA, 16 ); DEC( K, 2 );
  4461. END;
  4462. horadd( 2 ); horadd( 3 ); horadd( 4 ); horadd( 5 ); horadd( 6 );
  4463. put1( adrC, 2 ); put1( adrC + IncC, 3 ); put1( adrC + 2 * IncC, 4 );
  4464. put1( adrC + 3 * IncC, 5 ); put1( adrC + 4 * IncC, 6 );
  4465. END L1Block5X;
  4466. PROCEDURE L1Block1R( adrA, adrB, adrC, K: LONGINT ); (* condition: K MOD 4 = 0 *)
  4467. VAR reg: ARRAY 8 OF ARRAY 4 OF REAL;
  4468. PROCEDURE null( i: LONGINT );
  4469. BEGIN
  4470. reg[i, 0] := 0; reg[i, 1] := 0; reg[i, 2] := 0; reg[i, 3] := 0;
  4471. END null;
  4472. PROCEDURE get1( adr, i: LONGINT );
  4473. BEGIN
  4474. SYSTEM.GET( adr, reg[i, 0] );
  4475. IF debug THEN KernelLog.String( "get 1: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 ); KernelLog.Ln; END;
  4476. END get1;
  4477. PROCEDURE get4( adr, i: LONGINT );
  4478. BEGIN
  4479. SYSTEM.GET( adr, reg[i, 0] ); SYSTEM.GET( adr + 4, reg[i, 1] );
  4480. SYSTEM.GET( adr + 8, reg[i, 2] ); SYSTEM.GET( adr + 12, reg[i, 3] );
  4481. IF debug THEN
  4482. KernelLog.String( "get 4: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 );
  4483. KernelLog.Int( ENTIER( reg[i, 1] + 0.5 ), 5 ); KernelLog.Int( ENTIER( reg[i, 2] + 0.5 ), 5 );
  4484. KernelLog.Int( ENTIER( reg[i, 3] + 0.5 ), 5 ); KernelLog.Ln;
  4485. END;
  4486. END get4;
  4487. PROCEDURE mul4( i, j: LONGINT );
  4488. BEGIN
  4489. reg[i, 0] := reg[i, 0] * reg[j, 0]; reg[i, 1] := reg[i, 1] * reg[j, 1];
  4490. reg[i, 2] := reg[i, 2] * reg[j, 2]; reg[i, 3] := reg[i, 3] * reg[j, 3];
  4491. END mul4;
  4492. PROCEDURE add4( i, j: LONGINT );
  4493. BEGIN
  4494. reg[i, 0] := reg[i, 0] + reg[j, 0]; reg[i, 1] := reg[i, 1] + reg[j, 1];
  4495. reg[i, 2] := reg[i, 2] + reg[j, 2]; reg[i, 3] := reg[i, 3] + reg[j, 3];
  4496. END add4;
  4497. PROCEDURE put1( adr, i: LONGINT );
  4498. BEGIN
  4499. SYSTEM.PUT( adr, reg[i, 0] );
  4500. END put1;
  4501. PROCEDURE horadd( i: LONGINT );
  4502. BEGIN
  4503. reg[i, 0] := reg[i, 0] + reg[i, 1] + reg[i, 2] + reg[i, 3];
  4504. END horadd;
  4505. BEGIN
  4506. IF debug THEN KernelLog.String( "L1Block1" ); KernelLog.Ln; END;
  4507. null( 2 ); get1( adrC, 2 );
  4508. WHILE (K > 0) DO (* padding guaranteed *)
  4509. get4( adrA, 7 ); get4( adrB, 0 ); mul4( 0, 7 ); add4( 2, 0 ); INC( adrB, 16 );
  4510. INC( adrA, 16 ); DEC( K, 4 );
  4511. END;
  4512. horadd( 2 ); put1( adrC, 2 );
  4513. END L1Block1R;
  4514. PROCEDURE L1Block5R( adrA, adrB, adrC, IncC, K: LONGINT ); (* condition: K MOD 4 = 0 *)
  4515. VAR reg: ARRAY 8 OF ARRAY 4 OF REAL;
  4516. PROCEDURE null( i: LONGINT );
  4517. BEGIN
  4518. reg[i, 0] := 0; reg[i, 1] := 0; reg[i, 2] := 0; reg[i, 3] := 0;
  4519. END null;
  4520. PROCEDURE get1( adr, i: LONGINT );
  4521. BEGIN
  4522. SYSTEM.GET( adr, reg[i, 0] );
  4523. IF debug THEN KernelLog.String( "get 1: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 ); KernelLog.Ln; END;
  4524. END get1;
  4525. PROCEDURE get4( adr, i: LONGINT );
  4526. BEGIN
  4527. SYSTEM.GET( adr, reg[i, 0] ); SYSTEM.GET( adr + 4, reg[i, 1] );
  4528. SYSTEM.GET( adr + 8, reg[i, 2] ); SYSTEM.GET( adr + 12, reg[i, 3] );
  4529. IF debug THEN
  4530. KernelLog.String( "get 4: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 );
  4531. KernelLog.Int( ENTIER( reg[i, 1] + 0.5 ), 5 ); KernelLog.Int( ENTIER( reg[i, 2] + 0.5 ), 5 );
  4532. KernelLog.Int( ENTIER( reg[i, 3] + 0.5 ), 5 ); KernelLog.Ln;
  4533. END;
  4534. END get4;
  4535. PROCEDURE mul4( i, j: LONGINT );
  4536. BEGIN
  4537. reg[i, 0] := reg[i, 0] * reg[j, 0]; reg[i, 1] := reg[i, 1] * reg[j, 1];
  4538. reg[i, 2] := reg[i, 2] * reg[j, 2]; reg[i, 3] := reg[i, 3] * reg[j, 3];
  4539. END mul4;
  4540. PROCEDURE add4( i, j: LONGINT );
  4541. BEGIN
  4542. reg[i, 0] := reg[i, 0] + reg[j, 0]; reg[i, 1] := reg[i, 1] + reg[j, 1];
  4543. reg[i, 2] := reg[i, 2] + reg[j, 2]; reg[i, 3] := reg[i, 3] + reg[j, 3];
  4544. END add4;
  4545. PROCEDURE put1( adr, i: LONGINT );
  4546. BEGIN
  4547. SYSTEM.PUT( adr, reg[i, 0] );
  4548. END put1;
  4549. PROCEDURE horadd( i: LONGINT );
  4550. BEGIN
  4551. reg[i, 0] := reg[i, 0] + reg[i, 1] + reg[i, 2] + reg[i, 3];
  4552. END horadd;
  4553. BEGIN
  4554. IF debug THEN KernelLog.String( "L1Block5" ); KernelLog.Ln; END;
  4555. null( 2 ); null( 3 ); null( 4 ); null( 5 ); null( 6 ); get1( adrC, 2 );
  4556. get1( adrC + IncC, 3 ); get1( adrC + 2 * IncC, 4 ); get1( adrC + 3 * IncC, 5 );
  4557. get1( adrC + 4 * IncC, 6 );
  4558. WHILE (K > 0) DO (* padding guaranteed *)
  4559. get4( adrA, 7 ); get4( adrB, 0 ); mul4( 0, 7 ); add4( 2, 0 );
  4560. get4( adrB + 16, 0 ); mul4( 0, 7 ); add4( 3, 0 ); get4( adrB + 32, 0 );
  4561. mul4( 0, 7 ); add4( 4, 0 ); get4( adrB + 48, 0 ); mul4( 0, 7 );
  4562. add4( 5, 0 ); get4( adrB + 64, 0 ); mul4( 0, 7 ); add4( 6, 0 ); INC( adrB, 80 );
  4563. INC( adrA, 16 ); DEC( K, 4 );
  4564. END;
  4565. horadd( 2 ); horadd( 3 ); horadd( 4 ); horadd( 5 ); horadd( 6 );
  4566. put1( adrC, 2 ); put1( adrC + IncC, 3 ); put1( adrC + 2 * IncC, 4 );
  4567. put1( adrC + 3 * IncC, 5 ); put1( adrC + 4 * IncC, 6 );
  4568. END L1Block5R;
  4569. *)
  4570. PROCEDURE DispCR( adrM: ADDRESS;
  4571. inc, stride, M, N: SIZE );
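(* debug helper: prints the M x N REAL matrix at adrM (element increment inc, row stride stride), rounding each value to the nearest integer *)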
  4572. VAR i, j: SIZE; adr: ADDRESS; val: REAL;
  4573. BEGIN
  4574. FOR i := 0 TO M - 1 DO
  4575. adr := adrM + i * stride;
  4576. FOR j := 0 TO N - 1 DO
  4577. SYSTEM.GET( adr, val );
  4578. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); INC( adr, inc );
  4579. END;
  4580. KernelLog.Ln;
  4581. END;
  4582. END DispCR;
  4583. PROCEDURE DispCX( adrM: ADDRESS;
  4584. inc, stride, M, N: SIZE );
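(* debug helper: LONGREAL counterpart of DispCR above *)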
  4585. VAR i, j: SIZE; adr: ADDRESS; val: LONGREAL;
  4586. BEGIN
  4587. FOR i := 0 TO M - 1 DO
  4588. adr := adrM + i * stride;
  4589. FOR j := 0 TO N - 1 DO
  4590. SYSTEM.GET( adr, val );
  4591. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); INC( adr, inc );
  4592. END;
  4593. KernelLog.Ln;
  4594. END;
  4595. END DispCX;
  4596. PROCEDURE L3BlockX( matrixA, matrixB, matrixC: ADDRESS;
  4597. M, N, K, incC, strideC, L2BlockM, L2BlockN, L2BlockK: SIZE );
4598. (*
4599.              K                      N                        N
4600.         *********             *************              ***********
4601.    M    *********        K    *************      ->      ***********    M
4602.         *********             *************              ***********
4603.         *********             *************              ***********
4604.             A           *           B          ->             C
4605. *)
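(* blocking strategy, as a summary of the code below: the outer loops cut C into L2BlockM x L2BlockN tiles and the K dimension into L2BlockK slices sized for the L2 cache; the local L2Block walks each tile in panels of L1BlockN columns and computes them with the SSE kernels (L1Block5XSSE / L1Block1XSSE) or, without SSE, with the scalar kernel L1Block1XA; trailing loops handle the remainders of M, N and K *)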
  4606. VAR m, n, k, a1, b1: SIZE; adrA, adrB, adrC: ADDRESS;
  4607. KAligned: SIZE;
  4608. CONST Size = SIZEOF( LONGREAL );
  4609. PROCEDURE L2Block( matrixA, matrixB, matrixC: ADDRESS; M, N, K: SIZE );
  4610. (* M,N and K arbitrary ! *)
  4611. VAR adrA, adrB, adrC: ADDRESS; aadrA, aadrB: ADDRESS;
  4612. m, k, KAligned: SIZE;
  4613. BEGIN
  4614. KAligned := Align2( K ) * 8;
  4615. IF debug THEN ASSERT( M > 0 ); ASSERT( K > 0 );
  4616. END;
  4617. adrB := matrixB;
  4618. WHILE (N >= L1BlockN) DO
  4619. IF debug THEN KernelLog.String( "LoopL2N" ); KernelLog.Ln END;
  4620. adrC := matrixC; adrA := matrixA; m := M;
  4621. WHILE (m > 0) DO
  4622. IF debug THEN KernelLog.String( "LoopL2M" ); KernelLog.Ln END;
  4623. IF SSE THEN
  4624. L1Block5XSSE( adrA, adrB, adrC, incC, K ); (* L1 Block *)
  4625. ELSE
  4626. aadrA := adrA; aadrB := adrB; k := K;
  4627. WHILE (k > 0) DO
  4628. L1Block1XA( aadrA, aadrB, adrC, 2 );
  4629. L1Block1XA( aadrA, aadrB + 16, adrC + incC, 2 );
  4630. L1Block1XA( aadrA, aadrB + 32, adrC + 2 * incC,
  4631. 2 );
  4632. L1Block1XA( aadrA, aadrB + 48, adrC + 3 * incC,
  4633. 2 );
  4634. L1Block1XA( aadrA, aadrB + 64, adrC + 4 * incC,
  4635. 2 );
  4636. DEC( k, 2 ); INC( aadrA, 16 );
  4637. INC( aadrB, 16 * L1BlockN );
  4638. END;
  4639. END;
  4640. IF debug THEN
  4641. DispCX( matrixC, incC, strideC, M, N );
  4642. END;
  4643. INC( adrA, KAligned ); INC( adrC, strideC );
  4644. DEC( m );
  4645. END;
  4646. INC( matrixC, L1BlockN * incC );
  4647. INC( adrB, L1BlockN * KAligned ); DEC( N, L1BlockN );
  4648. END;
  4649. WHILE (N > 0) DO
  4650. IF debug THEN KernelLog.String( "LoopL2N rest" ); KernelLog.Ln END;
  4651. adrC := matrixC; adrA := matrixA; m := M;
  4652. WHILE (m > 0) DO
  4653. IF debug THEN KernelLog.String( "LoopL2M" ); KernelLog.Ln END;
  4654. IF SSE THEN
  4655. L1Block1XSSE( adrA, adrB, adrC, K ); (* L1 Block *)
  4656. ELSE L1Block1XA( adrA, adrB, adrC, K );
  4657. END;
  4658. IF debug THEN
  4659. DispCX( matrixC, incC, strideC, M, N );
  4660. END;
  4661. INC( adrA, KAligned ); INC( adrC, strideC );
  4662. DEC( m );
  4663. END;
  4664. INC( matrixC, incC ); INC( adrB, KAligned ); DEC( N );
  4665. END;
  4666. END L2Block;
  4667. BEGIN
  4668. KAligned := Align2( K ) * 8;
  4669. ASSERT( L2BlockK MOD 2 = 0 );
  4670. IF SSE THEN ASSERT( L2BlockN MOD L1BlockN = 0 ); END;
  4671. m := M; n := N; k := K; a1 := matrixA; adrA := matrixA;
  4672. b1 := matrixB; adrB := matrixB; adrC := matrixC;
  4673. WHILE (n >= L2BlockN) DO
  4674. IF debug THEN KernelLog.String( "LoopL3N" ); KernelLog.Ln END;
  4675. a1 := matrixA; adrC := matrixC; m := M;
  4676. WHILE (m >= L2BlockM) DO
  4677. IF debug THEN KernelLog.String( "LoopL3M" ); KernelLog.Ln END;
  4678. adrA := a1; adrB := b1; k := K;
  4679. (* core: do matching level 2 cache Blocks *)
  4680. WHILE (k >= L2BlockK) DO
  4681. IF debug THEN KernelLog.String( "LoopL3K" ); KernelLog.Ln END;
  4682. L2Block( adrA, adrB, adrC, L2BlockM, L2BlockN,
  4683. L2BlockK );
  4684. INC( adrA, L2BlockK * L2BlockM * Size );
  4685. INC( adrB, L2BlockK * L2BlockN * Size ); (* no padding required *)
  4686. DEC( k, L2BlockK );
  4687. END;
  4688. (* core: do rest of k *)
  4689. IF k > 0 THEN
  4690. L2Block( adrA, adrB, adrC, L2BlockM, L2BlockN, k );
  4691. END;
  4692. INC( a1, KAligned * L2BlockM );
  4693. INC( adrC, L2BlockM * strideC ); DEC( m, L2BlockM );
  4694. END;
  4695. IF m > 0 THEN
  4696. (* clean up M *)
  4697. adrA := a1; adrB := b1; k := K;
  4698. WHILE (k >= L2BlockK) DO
  4699. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4700. L2Block( adrA, adrB, adrC, m, L2BlockN, L2BlockK );
  4701. INC( adrA, L2BlockK * Size * m );
  4702. INC( adrB, L2BlockK * L2BlockN * Size );
  4703. DEC( k, L2BlockK );
  4704. END;
  4705. IF debug THEN KernelLog.String( "LoopL3K rest k" ); KernelLog.Ln END;
  4706. IF k > 0 THEN
  4707. L2Block( adrA, adrB, adrC, m, L2BlockN, k );
  4708. END;
  4709. END;
  4710. INC( b1, L2BlockN * KAligned );
  4711. INC( matrixC, L2BlockN * incC ); DEC( n, L2BlockN );
  4712. END;
  4713. IF (n = 0) THEN RETURN
  4714. END;
  4715. a1 := matrixA; adrC := matrixC; m := M;
  4716. WHILE (m >= L2BlockM) DO
  4717. IF debug THEN KernelLog.String( "LoopL3M rest" ); KernelLog.Ln END;
  4718. adrA := a1; adrB := b1; k := K;
  4719. WHILE (k >= L2BlockK) DO
  4720. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4721. L2Block( adrA, adrB, adrC, L2BlockM, n, L2BlockK );
  4722. INC( adrA, L2BlockM * L2BlockK * Size );
  4723. INC( adrB, L2BlockK * n * Size ); DEC( k, L2BlockK );
  4724. END;
  4725. IF k > 0 THEN
  4726. L2Block( adrA, adrB, adrC, L2BlockM, n, k );
  4727. END;
  4728. INC( a1, L2BlockM * KAligned );
  4729. INC( adrC, L2BlockM * strideC ); DEC( m, L2BlockM );
  4730. END;
  4731. IF (m = 0) THEN RETURN
  4732. END;
  4733. adrA := a1; adrB := b1; k := K;
  4734. WHILE (k >= L2BlockK) DO
  4735. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4736. L2Block( adrA, adrB, adrC, m, n, L2BlockK );
  4737. INC( adrA, L2BlockK * m * Size );
  4738. INC( adrB, L2BlockK * n * Size ); DEC( k, L2BlockK );
  4739. END;
  4740. IF k > 0 THEN L2Block( adrA, adrB, adrC, m, n, k );
  4741. END;
  4742. END L3BlockX;
  4743. PROCEDURE L3BlockR( matrixA, matrixB, matrixC: ADDRESS;
  4744. M, N, K, incC, strideC, L2BlockM, L2BlockN, L2BlockK: SIZE );
4745. (*
4746.              K                      N                        N
4747.         *********             *************              ***********
4748.    M    *********        K    *************      ->      ***********    M
4749.         *********             *************              ***********
4750.         *********             *************              ***********
4751.             A           *           B          ->             C
4752. *)
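(* REAL counterpart of L3BlockX: the same blocking scheme with 4-byte elements, K padded to a multiple of four, and the L1Block*R kernels *)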
  4753. VAR m, n, k, a1, b1: SIZE; adrA, adrB, adrC: ADDRESS;
  4754. KAligned: SIZE;
  4755. CONST Size = SIZEOF( REAL );
  4756. PROCEDURE L2Block( matrixA, matrixB, matrixC: ADDRESS; M, N, K: SIZE );
  4757. (* M,N and K arbitrary ! *)
  4758. VAR adrA, adrB, adrC: ADDRESS; aadrA, aadrB: ADDRESS;
  4759. m, KAligned, k: SIZE;
  4760. BEGIN
  4761. KAligned := Align4( K ) * 4;
  4762. IF debug THEN ASSERT( M > 0 ); ASSERT( K > 0 );
  4763. END;
  4764. adrB := matrixB;
  4765. WHILE (N >= L1BlockN) DO
  4766. IF debug THEN KernelLog.String( "LoopL2N" ); KernelLog.Ln END;
  4767. adrC := matrixC; adrA := matrixA; m := M;
  4768. WHILE (m > 0) DO
  4769. IF debug THEN KernelLog.String( "LoopL2M" ); KernelLog.Ln END;
  4770. IF SSE THEN
  4771. L1Block5RSSE( adrA, adrB, adrC, incC, K ); (* L1 Block *)
  4772. ELSE
  4773. aadrA := adrA; aadrB := adrB; k := K;
  4774. WHILE (k > 0) DO
  4775. L1Block1RA( aadrA, aadrB, adrC, 4 );
  4776. L1Block1RA( aadrA, aadrB + 16, adrC + incC, 4 );
  4777. L1Block1RA( aadrA, aadrB + 32, adrC + 2 * incC,
  4778. 4 );
  4779. L1Block1RA( aadrA, aadrB + 48, adrC + 3 * incC,
  4780. 4 );
  4781. L1Block1RA( aadrA, aadrB + 64, adrC + 4 * incC,
  4782. 4 );
  4783. DEC( k, 4 ); INC( aadrA, 16 );
  4784. INC( aadrB, 16 * L1BlockN );
  4785. END;
  4786. END;
  4787. IF debug THEN
  4788. DispCR( matrixC, incC, strideC, M, N );
  4789. END;
  4790. INC( adrA, KAligned ); INC( adrC, strideC );
  4791. DEC( m );
  4792. END;
  4793. INC( matrixC, L1BlockN * incC );
  4794. INC( adrB, L1BlockN * KAligned ); DEC( N, L1BlockN );
  4795. END;
  4796. WHILE (N > 0) DO
  4797. IF debug THEN KernelLog.String( "LoopL2N rest" ); KernelLog.Ln END;
  4798. adrC := matrixC; adrA := matrixA; m := M;
  4799. WHILE (m > 0) DO
  4800. IF debug THEN KernelLog.String( "LoopL2M" ); KernelLog.Ln END;
  4801. IF SSE THEN
  4802. L1Block1RSSE( adrA, adrB, adrC, K ); (* L1 Block *)
  4803. ELSE L1Block1RA( adrA, adrB, adrC, K );
  4804. END;
  4805. IF debug THEN
  4806. DispCR( matrixC, incC, strideC, M, N );
  4807. END;
  4808. INC( adrA, KAligned ); INC( adrC, strideC );
  4809. DEC( m );
  4810. END;
  4811. INC( matrixC, incC ); INC( adrB, KAligned ); DEC( N );
  4812. END;
  4813. END L2Block;
  4814. BEGIN
  4815. KAligned := Align4( K ) * 4;
  4816. ASSERT( L2BlockK MOD 4 = 0 );
  4817. IF SSE THEN ASSERT( L2BlockN MOD L1BlockN = 0 ); END;
  4818. m := M; n := N; k := K; a1 := matrixA; adrA := matrixA;
  4819. b1 := matrixB; adrB := matrixB; adrC := matrixC;
  4820. WHILE (n >= L2BlockN) DO
  4821. IF debug THEN KernelLog.String( "LoopL3N" ); KernelLog.Ln END;
  4822. a1 := matrixA; adrC := matrixC; m := M;
  4823. WHILE (m >= L2BlockM) DO
  4824. IF debug THEN KernelLog.String( "LoopL3M" ); KernelLog.Ln END;
  4825. adrA := a1; adrB := b1; k := K;
  4826. (* core: do matching level 2 cache Blocks *)
  4827. WHILE (k >= L2BlockK) DO
  4828. IF debug THEN KernelLog.String( "LoopL3K" ); KernelLog.Ln END;
  4829. L2Block( adrA, adrB, adrC, L2BlockM, L2BlockN,
  4830. L2BlockK );
  4831. INC( adrA, L2BlockK * L2BlockM * Size );
  4832. INC( adrB, L2BlockK * L2BlockN * Size ); (* no padding required *)
  4833. DEC( k, L2BlockK );
  4834. END;
  4835. (* core: do rest of k *)
  4836. IF k > 0 THEN
  4837. L2Block( adrA, adrB, adrC, L2BlockM, L2BlockN, k );
  4838. END;
  4839. INC( a1, KAligned * L2BlockM );
  4840. INC( adrC, L2BlockM * strideC ); DEC( m, L2BlockM );
  4841. END;
  4842. IF m > 0 THEN
  4843. (* clean up M *)
  4844. adrA := a1; adrB := b1; k := K;
  4845. WHILE (k >= L2BlockK) DO
  4846. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4847. L2Block( adrA, adrB, adrC, m, L2BlockN, L2BlockK );
  4848. INC( adrA, L2BlockK * Size * m );
  4849. INC( adrB, L2BlockK * L2BlockN * Size );
  4850. DEC( k, L2BlockK );
  4851. END;
  4852. IF debug THEN KernelLog.String( "LoopL3K rest k" ); KernelLog.Ln END;
  4853. IF k > 0 THEN
  4854. L2Block( adrA, adrB, adrC, m, L2BlockN, k );
  4855. END;
  4856. END;
  4857. INC( b1, L2BlockN * KAligned );
  4858. INC( matrixC, L2BlockN * incC ); DEC( n, L2BlockN );
  4859. END;
  4860. IF (n = 0) THEN RETURN
  4861. END;
  4862. a1 := matrixA; adrC := matrixC; m := M;
  4863. WHILE (m >= L2BlockM) DO
  4864. IF debug THEN KernelLog.String( "LoopL3M rest" ); KernelLog.Ln END;
  4865. adrA := a1; adrB := b1; k := K;
  4866. WHILE (k >= L2BlockK) DO
  4867. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4868. L2Block( adrA, adrB, adrC, L2BlockM, n, L2BlockK );
  4869. INC( adrA, L2BlockM * L2BlockK * Size );
  4870. INC( adrB, L2BlockK * n * Size ); DEC( k, L2BlockK );
  4871. END;
  4872. IF k > 0 THEN
  4873. L2Block( adrA, adrB, adrC, L2BlockM, n, k );
  4874. END;
  4875. INC( a1, L2BlockM * KAligned );
  4876. INC( adrC, L2BlockM * strideC ); DEC( m, L2BlockM );
  4877. END;
  4878. IF (m = 0) THEN RETURN
  4879. END;
  4880. adrA := a1; adrB := b1; k := K;
  4881. WHILE (k >= L2BlockK) DO
  4882. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4883. L2Block( adrA, adrB, adrC, m, n, L2BlockK );
  4884. INC( adrA, L2BlockK * m * Size );
  4885. INC( adrB, L2BlockK * n * Size ); DEC( k, L2BlockK );
  4886. END;
  4887. IF k > 0 THEN L2Block( adrA, adrB, adrC, m, n, k );
  4888. END;
  4889. END L3BlockR;
  4890. PROCEDURE Align( adr: ADDRESS; align: SIZE ): ADDRESS;
  4891. BEGIN
4892. RETURN adr + (-adr) MOD align; (* round adr up to the next multiple of align, e.g. 16 bytes for 128-bit SSE alignment *)
  4893. END Align;
  4894. PROCEDURE CopyAX( matrixA, dest: ADDRESS;
  4895. IncA, StrideA: SIZE;
  4896. K, M, L2BlockK, L2BlockM: SIZE );
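(* packs matrix A (M rows x K columns of LONGREAL, increment IncA, stride StrideA) into the contiguous buffer dest, tile by tile (L2BlockM rows x L2BlockK columns) in the same order in which L3BlockX reads it back; each copied row chunk is zero-padded to an even number of elements *)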
  4897. VAR m, k: SIZE; adrA: ADDRESS; t: HUGEINT;
  4898. PROCEDURE CopyMK( matrixA: ADDRESS; M, K: SIZE );
  4899. VAR rest: SIZE;
  4900. BEGIN
  4901. IF debug THEN
  4902. KernelLog.String( "CopyMK:" ); KernelLog.Int( M, 10 ); KernelLog.Int( K, 10 );
  4903. KernelLog.Ln;
  4904. END;
  4905. rest := (-K) MOD 2;
  4906. WHILE (M > 0) DO
  4907. MovX( matrixA, dest, IncA, K ); INC( dest, K * 8 );
  4908. IF rest # 0 THEN
  4909. ZeroX( dest, rest ); INC( dest, 8 * rest );
  4910. END;
  4911. INC( matrixA, StrideA ); DEC( M );
  4912. END;
  4913. END CopyMK;
  4914. BEGIN
  4915. Tic( t ); m := M;
  4916. WHILE (m >= L2BlockM) DO
  4917. k := K; adrA := matrixA;
  4918. WHILE (k >= L2BlockK) DO
  4919. CopyMK( adrA, L2BlockM, L2BlockK );
  4920. INC( adrA, L2BlockK * IncA ); DEC( k, L2BlockK );
  4921. END;
  4922. IF k > 0 THEN CopyMK( adrA, L2BlockM, k ); END;
  4923. INC( matrixA, L2BlockM * StrideA ); DEC( m, L2BlockM );
  4924. END;
  4925. adrA := matrixA; k := K;
  4926. WHILE (k >= L2BlockK) DO
  4927. CopyMK( adrA, m, L2BlockK );
  4928. INC( adrA, L2BlockK * IncA ); DEC( k, L2BlockK );
  4929. END;
  4930. IF k > 0 THEN CopyMK( adrA, m, k ); END;
  4931. Toc( t, copyT );
  4932. END CopyAX;
  4933. PROCEDURE CopyAR( matrixA, dest: ADDRESS;
  4934. IncA, StrideA: SIZE;
  4935. K, M, L2BlockK, L2BlockM: SIZE );
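(* REAL counterpart of CopyAX: row chunks are zero-padded to a multiple of four elements *)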
  4936. VAR m, k: SIZE; adrA: ADDRESS; t: HUGEINT;
  4937. PROCEDURE CopyMK( matrixA: ADDRESS; M, K: SIZE );
  4938. VAR rest: SIZE;
  4939. BEGIN
  4940. rest := (-K) MOD 4;
  4941. WHILE (M > 0) DO
  4942. MovR( matrixA, dest, IncA, K ); INC( dest, K * 4 );
  4943. IF rest # 0 THEN
  4944. ZeroR( dest, rest ); INC( dest, 4 * rest );
  4945. END;
  4946. INC( matrixA, StrideA ); DEC( M );
  4947. END;
  4948. END CopyMK;
  4949. BEGIN
  4950. Tic( t ); m := M;
  4951. WHILE (m >= L2BlockM) DO
  4952. k := K; adrA := matrixA;
  4953. WHILE (k >= L2BlockK) DO
  4954. CopyMK( adrA, L2BlockM, L2BlockK );
  4955. INC( adrA, L2BlockK * IncA ); DEC( k, L2BlockK );
  4956. END;
  4957. IF k > 0 THEN CopyMK( adrA, L2BlockM, k ); END;
  4958. INC( matrixA, L2BlockM * StrideA ); DEC( m, L2BlockM );
  4959. END;
  4960. adrA := matrixA; k := K;
  4961. WHILE (k >= L2BlockK) DO
  4962. CopyMK( adrA, m, L2BlockK );
  4963. INC( adrA, L2BlockK * IncA ); DEC( k, L2BlockK );
  4964. END;
  4965. IF k > 0 THEN CopyMK( adrA, m, k ); END;
  4966. Toc( t, copyT );
  4967. END CopyAR;
  4968. PROCEDURE CopyBX( matrixB, dest: ADDRESS;
  4969. IncB, StrideB: SIZE;
  4970. N, K, L2BlockN, L2BlockK: SIZE );
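(* packs matrix B (K rows x N columns of LONGREAL) for L3BlockX: columns are grouped into panels of L1BlockN, and within a panel pairs of consecutive K-elements of each column are interleaved so the SSE kernel can stream them; leftover columns are stored as separate zero-padded rows, and the whole matrix is traversed in L2BlockN x L2BlockK tiles *)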
  4971. VAR n, k: SIZE; adrB: ADDRESS; t: HUGEINT;
  4972. PROCEDURE Copy5x2k( matrixB: ADDRESS; k: SIZE );
  4973. VAR i: SIZE; adrB: ADDRESS; rest: SIZE;
  4974. BEGIN
  4975. rest := (-k) MOD 2;
4976. WHILE (k >= 2) DO (* store 5x2 Block in line *)
  4977. adrB := matrixB;
  4978. FOR i := 1 TO L1BlockN DO
  4979. MovX( adrB, dest, StrideB, 2 ); INC( dest, 16 );
  4980. INC( adrB, IncB );
  4981. END;
  4982. INC( matrixB, 2 * StrideB ); DEC( k, 2 );
  4983. END;
  4984. IF k > 0 THEN
  4985. adrB := matrixB;
  4986. FOR i := 1 TO L1BlockN DO
  4987. MovX( adrB, dest, StrideB, k ); INC( dest, 8 * k );
  4988. IF rest # 0 THEN
  4989. ZeroX( dest, rest ); INC( dest, rest * 8 );
  4990. END;
  4991. INC( adrB, IncB );
  4992. END;
  4993. END;
  4994. END Copy5x2k;
  4995. PROCEDURE Copy1( matrixB: ADDRESS; K, N:SIZE );
  4996. VAR n, rest: SIZE;
  4997. BEGIN
  4998. rest := (-K) MOD 2;
  4999. IF debug THEN
  5000. KernelLog.String( ">>Copy1" ); KernelLog.Int( K, 10 ); KernelLog.Int( N, 10 ); KernelLog.Ln;
  5001. END;
  5002. n := N;
  5003. WHILE (n >= L1BlockN) DO (* store Kx5 Blocks as one line *)
  5004. Copy5x2k( matrixB, K );
  5005. IF debug THEN ASSERT( dest MOD 16 = 0 ); END;
  5006. INC( matrixB, L1BlockN * IncB ); DEC( n, L1BlockN );
  5007. END;
  5008. IF debug THEN KernelLog.String( "Copy1, n=" ); KernelLog.Int( n, 10 ); KernelLog.Ln;
  5009. END;
  5010. WHILE (n > 0) DO (* store remaining rectangle in separate lines *)
  5011. MovX( matrixB, dest, StrideB, K ); INC( dest, K * 8 );
5012. ZeroX( dest, rest ); INC( dest, rest * 8 ); (* LONGREAL padding, hence ZeroX to match the 8-byte element size *)
  5013. INC( matrixB, IncB ); DEC( n );
  5014. END;
  5015. END Copy1;
  5016. BEGIN
  5017. Tic( t ); ASSERT( L2BlockN MOD L1BlockN = 0 );
  5018. ASSERT( L2BlockK MOD 2 = 0 ); n := N;
  5019. WHILE (n >= L2BlockN) DO
  5020. k := K; adrB := matrixB;
  5021. WHILE (k >= L2BlockK) DO
  5022. Copy1( adrB, L2BlockK, L2BlockN );
  5023. INC( adrB, L2BlockK * StrideB ); DEC( k, L2BlockK );
  5024. END;
  5025. IF k > 0 THEN Copy1( adrB, k, L2BlockN ) END;
  5026. INC( matrixB, L2BlockN * IncB ); DEC( n, L2BlockN );
  5027. END;
  5028. IF (n = 0) THEN RETURN
  5029. END;
  5030. k := K; adrB := matrixB;
  5031. WHILE (k >= L2BlockK) DO
  5032. Copy1( adrB, L2BlockK, n );
  5033. INC( adrB, L2BlockK * StrideB ); DEC( k, L2BlockK );
  5034. END;
  5035. Copy1( adrB, k, n ); Toc( t, copyT );
  5036. END CopyBX;
  5037. PROCEDURE CopyBR( matrixB, dest: ADDRESS;
  5038. IncB, StrideB: SIZE;
  5039. N, K, L2BlockN, L2BlockK: SIZE );
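(* REAL counterpart of CopyBX: panels of L1BlockN columns are interleaved in groups of four K-elements (the bulk of the work is done by MovR5); remainders are zero-padded to a multiple of four *)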
  5040. VAR n, k: SIZE; adrB: ADDRESS; t: HUGEINT;
  5041. PROCEDURE Copy5x4k( matrixB: ADDRESS; k: SIZE );
  5042. VAR i: SIZE; adrB: ADDRESS; rest, k4: SIZE;
  5043. BEGIN
  5044. k4 := k - k MOD 4; rest := (-k) MOD 4;
  5045. IF k4 > 0 THEN
  5046. MovR5( matrixB, IncB, StrideB, dest, k4 );
  5047. INC( matrixB, k4 * StrideB ); INC( dest, k4 * 80 DIV 4 );
  5048. DEC( k, k4 );
  5049. END;
  5050. (*
  5051. WHILE (k >= 4) DO (* store 5x4 Block in line *)
  5052. adrB := matrixB;
  5053. FOR i := 1 TO L1BlockN DO
  5054. MovR( adrB, dest, StrideB, 4 ); INC( dest, 16 ); INC( adrB, IncB );
  5055. END;
  5056. INC( matrixB, 4 * StrideB ); DEC( k, 4 );
  5057. END;
  5058. *)
  5059. IF k > 0 THEN
  5060. adrB := matrixB;
  5061. FOR i := 1 TO L1BlockN DO
  5062. MovR( adrB, dest, StrideB, k ); INC( dest, 4 * k );
  5063. IF rest # 0 THEN
  5064. ZeroR( dest, rest ); INC( dest, rest * 4 );
  5065. END;
  5066. INC( adrB, IncB );
  5067. END;
  5068. END;
  5069. END Copy5x4k;
  5070. PROCEDURE Copy1( matrixB: ADDRESS; K, N:SIZE );
  5071. VAR n, rest: SIZE;
  5072. BEGIN
  5073. rest := (-K) MOD 4;
  5074. IF debug THEN
  5075. KernelLog.String( ">>Copy1" ); KernelLog.Int( K, 10 ); KernelLog.Int( N, 10 ); KernelLog.Ln;
  5076. END;
  5077. n := N;
  5078. WHILE (n >= L1BlockN) DO (* store Kx5 Blocks as one line *)
  5079. Copy5x4k( matrixB, K );
  5080. IF debug THEN ASSERT( dest MOD 16 = 0 ); END;
  5081. INC( matrixB, L1BlockN * IncB ); DEC( n, L1BlockN );
  5082. END;
  5083. WHILE (n > 0) DO (* store remaining rectangle in separate lines *)
  5084. MovR( matrixB, dest, StrideB, K ); INC( dest, K * 4 );
  5085. ZeroR( dest, rest ); INC( dest, rest * 4 );
  5086. INC( matrixB, IncB ); DEC( n );
  5087. END;
  5088. END Copy1;
  5089. BEGIN
  5090. Tic( t ); ASSERT( L2BlockN MOD L1BlockN = 0 );
  5091. ASSERT( L2BlockK MOD 4 = 0 ); n := N;
  5092. WHILE (n >= L2BlockN) DO
  5093. k := K; adrB := matrixB;
  5094. WHILE (k >= L2BlockK) DO
  5095. Copy1( adrB, L2BlockK, L2BlockN );
  5096. INC( adrB, L2BlockK * StrideB ); DEC( k, L2BlockK );
  5097. END;
  5098. IF k > 0 THEN Copy1( adrB, k, L2BlockN ) END;
  5099. INC( matrixB, L2BlockN * IncB ); DEC( n, L2BlockN );
  5100. END;
  5101. IF (n = 0) THEN RETURN
  5102. END;
  5103. k := K; adrB := matrixB;
  5104. WHILE (k >= L2BlockK) DO
  5105. Copy1( adrB, L2BlockK, n );
  5106. INC( adrB, L2BlockK * StrideB ); DEC( k, L2BlockK );
  5107. END;
  5108. Copy1( adrB, k, n ); Toc( t, copyT );
  5109. END CopyBR;
  5110. (*
  5111. PROCEDURE FillMR( VAR A: ARRAY [ .. , .. ] OF REAL );
  5112. VAR i, j: LONGINT;
  5113. BEGIN
  5114. FOR i := 0 TO LEN( A, 0 ) - 1 DO
  5115. FOR j := 0 TO LEN( A, 1 ) - 1 DO
  5116. A[i, j] := ran.Dice( 10 );
  5117. IF debug THEN A[i, j] := 10 * i + j; END;
  5118. END;
  5119. END;
  5120. END FillMR;
  5121. PROCEDURE DispMR( VAR A: ARRAY [ .. , .. ] OF REAL );
  5122. VAR i, j: LONGINT;
  5123. BEGIN
  5124. FOR i := 0 TO LEN( A, 0 ) - 1 DO
  5125. FOR j := 0 TO LEN( A, 1 ) - 1 DO KernelLog.Int( ENTIER( A[i, j] + 0.5 ), 5 ); END;
  5126. KernelLog.Ln;
  5127. END;
  5128. END DispMR;
  5129. PROCEDURE FillMX( VAR A: ARRAY [ .. , .. ] OF LONGREAL );
  5130. VAR i, j: LONGINT;
  5131. BEGIN
  5132. FOR i := 0 TO LEN( A, 0 ) - 1 DO
  5133. FOR j := 0 TO LEN( A, 1 ) - 1 DO
  5134. A[i, j] := ran.Dice( 10 );
  5135. IF debug THEN A[i, j] := 10 * i + j; END;
  5136. END;
  5137. END;
  5138. END FillMX;
  5139. PROCEDURE DispMX( VAR A: ARRAY [ .. , .. ] OF LONGREAL );
  5140. VAR i, j: LONGINT;
  5141. BEGIN
  5142. FOR i := 0 TO LEN( A, 0 ) - 1 DO
  5143. FOR j := 0 TO LEN( A, 1 ) - 1 DO KernelLog.Int( ENTIER( A[i, j] + 0.5 ), 5 ); END;
  5144. KernelLog.Ln;
  5145. END;
  5146. END DispMX;
  5147. *)
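(* lightweight timing helpers: Tic records the current value of Machine.GetTimer, Toc adds the elapsed time to an accumulator (copyT, zeroT or compT) and restarts the timer *)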
  5148. PROCEDURE Tic( VAR t: HUGEINT );
  5149. BEGIN
  5150. t := Machine.GetTimer();
  5151. END Tic;
  5152. PROCEDURE Toc( VAR t, addto: HUGEINT );
  5153. BEGIN
  5154. INC( addto, Machine.GetTimer() - t ); t := Machine.GetTimer();
  5155. END Toc;
  5156. PROCEDURE MultiplyX( A, B, C: ADDRESS;
  5157. M, N, K, L2BlockM, L2BlockN, L2BlockK:SIZE;
  5158. IncA, StrideA, IncB, StrideB, IncC, StrideC: SIZE;
  5159. add: BOOLEAN );
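(* blocked LONGREAL matrix multiplication C (+)= A * B: A and B are first packed into an aligned scratch buffer (CopyAX / CopyBX), C is zeroed row by row unless add is set, and the work is either split along M over MultiplyObjectX worker objects (parallel case) or done directly by L3BlockX *)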
  5160. VAR lenA, lenB: SIZE; adrA, adrB, adrC: ADDRESS; m: SIZE;
  5161. M1, M2, i: SIZE; val: LONGREAL; t: HUGEINT;
  5162. inc: SIZE;
  5163. obj: POINTER TO ARRAY OF MultiplyObjectX;
  5164. cache: Cache;
  5165. BEGIN
  5166. NEW(obj,nrProcesses+1);
  5167. lenA := M * Align2( K ) * 8; lenB := N * Align2( K ) * 8;
  5168. cache := cachePool.Acquire( lenA + lenB );
  5169. adrA := cache.adr; adrB := adrA + lenA;
  5170. CopyAX( A, adrA, IncA, StrideA, K, M, L2BlockK, L2BlockM );
  5171. CopyBX( B, adrB, IncB, StrideB, N, K, L2BlockN, L2BlockK );
  5172. Tic( t ); m := M; adrC := C;
  5173. IF ~add THEN
  5174. WHILE (m > 0) DO
  5175. ZeroXI( adrC, IncC, N ); INC( adrC, StrideC ); DEC( m );
  5176. END;
  5177. END;
  5178. Toc( t, zeroT );
  5179. IF debug THEN
  5180. KernelLog.String( "copy of A: " ); KernelLog.Ln;
  5181. FOR i := 0 TO M * Align2( K ) - 1 DO
  5182. SYSTEM.GET( adrA + i * 8, val );
  5183. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); KernelLog.Ln;
  5184. END;
  5185. END;
  5186. IF debug THEN
  5187. KernelLog.String( "copy of B: " ); KernelLog.Ln;
  5188. FOR i := 0 TO N * Align2( K ) - 1 DO
  5189. SYSTEM.GET( adrB + i * 8, val );
  5190. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); KernelLog.Ln;
  5191. END;
  5192. END;
  5193. IF parallel & (M > L2BlockM) THEN
  5194. inc := Align( MAX(M DIV nrProcesses,L2BlockM), L2BlockM ); M1 := 0;
  5195. i := 0;
  5196. WHILE (M1 < M) DO
  5197. M2 := M1 + inc;
  5198. IF M2 > M THEN M2 := M END;
  5199. NEW( obj[i], adrA + M1 * Align2( K ) * 8, adrB,
  5200. C + StrideC * M1, M2 - M1, N, K, IncC, StrideC,
  5201. L2BlockM, L2BlockN, L2BlockK );
  5202. M1 := M2; INC( i );
  5203. END;
  5204. WHILE (i > 0) DO DEC( i ); obj[i].Wait; END;
  5205. ELSE
  5206. L3BlockX( adrA, adrB, C, M, N, K, IncC, StrideC, L2BlockM,
  5207. L2BlockN, L2BlockK );
  5208. END;
  5209. Toc( t, compT ); cachePool.Release( cache );
  5210. END MultiplyX;
  5211. PROCEDURE MultiplyR( A, B, C: ADDRESS;
  5212. M, N, K, L2BlockM, L2BlockN, L2BlockK: SIZE;
  5213. IncA, StrideA, IncB, StrideB, IncC, StrideC: SIZE;
  5214. add: BOOLEAN );
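(* REAL counterpart of MultiplyX, using CopyAR / CopyBR, MultiplyObjectR workers and L3BlockR *)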
  5215. VAR lenA, lenB: SIZE; adrA, adrB, adrC: ADDRESS; m: SIZE;
  5216. M1, M2, i: SIZE; val: REAL; inc: SIZE;
  5217. obj: POINTER TO ARRAY OF MultiplyObjectR;
  5218. t: HUGEINT; cache: Cache;
  5219. BEGIN
  5220. NEW(obj,nrProcesses+1);
  5221. lenA := M * Align4( K ) * 4; lenB := N * Align4( K ) * 4;
  5222. cache := cachePool.Acquire( lenA + lenB );
  5223. adrA := cache.adr; adrB := adrA + lenA;
  5224. CopyAR( A, adrA, IncA, StrideA, K, M, L2BlockK, L2BlockM );
  5225. CopyBR( B, adrB, IncB, StrideB, N, K, L2BlockN, L2BlockK );
  5226. Tic( t ); m := M; adrC := C;
  5227. IF ~add THEN
  5228. WHILE (m > 0) DO
  5229. ZeroRI( adrC, IncC, N ); INC( adrC, StrideC );
  5230. DEC( m );
  5231. END;
  5232. END;
  5233. Toc( t, zeroT );
  5234. IF debug THEN
  5235. KernelLog.String( "copy of A: " ); KernelLog.Ln;
  5236. FOR i := 0 TO M * Align4( K ) - 1 DO
  5237. SYSTEM.GET( adrA + i * 4, val );
  5238. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); KernelLog.Ln;
  5239. END;
  5240. END;
  5241. IF debug THEN
  5242. KernelLog.String( "copy of B: " ); KernelLog.Ln;
  5243. FOR i := 0 TO N * Align4( K ) - 1 DO
  5244. SYSTEM.GET( adrB + i * 4, val );
  5245. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); KernelLog.Ln;
  5246. END;
  5247. END;
  5248. IF parallel & (M > L2BlockM) THEN
  5249. inc := Align( MAX(M DIV nrProcesses,L2BlockM), L2BlockM ); M1 := 0;
  5250. i := 0;
  5251. WHILE (M1 < M) DO
  5252. M2 := M1 + inc;
  5253. IF M2 > M THEN M2 := M END;
  5254. NEW( obj[i], adrA + M1 * Align4( K ) * 4, adrB,
  5255. C + StrideC * M1, M2 - M1, N, K, IncC, StrideC,
  5256. L2BlockM, L2BlockN, L2BlockK );
  5257. M1 := M2; INC( i );
  5258. END;
  5259. WHILE (i > 0) DO DEC( i ); obj[i].Wait; END;
  5260. ELSE
  5261. L3BlockR( adrA, adrB, C, M, N, K, IncC, StrideC, L2BlockM,
  5262. L2BlockN, L2BlockK );
  5263. END;
  5264. Toc( t, compT ); cachePool.Release( cache );
  5265. END MultiplyR;
  5266. (*
  5267. PROCEDURE DoTestX( M, N, K, L2BlockM, L2BlockN, L2BlockK: LONGINT; check: BOOLEAN; iter: LONGINT );
  5268. VAR (* A, B, C, D: ARRAY [ .. , .. ] OF REAL; *)
  5269. A, B, C, D: ARRAY [ .. , .. ] OF LONGREAL;
  5270. p, q: ANY; l1, l2: LONGINT; adrA, adrB, len, lenA, lenB: LONGINT;
  5271. matrixA, matrixB, matrixC: LONGINT; i: LONGINT; val: LONGREAL; atime, time: LONGINT;
  5272. BEGIN
  5273. KernelLog.String( "LONGREAL" ); KernelLog.Ln; KernelLog.String( "M=" ); KernelLog.Int( M, 0 ); KernelLog.Ln;
  5274. KernelLog.String( "N=" ); KernelLog.Int( N, 0 ); KernelLog.Ln; KernelLog.String( "K=" ); KernelLog.Int( K, 0 );
  5275. KernelLog.Ln; KernelLog.String( "L2BlockM=" ); KernelLog.Int( L2BlockM, 0 ); KernelLog.Ln; KernelLog.String( "L2BlockN=" );
  5276. KernelLog.Int( L2BlockN, 0 ); KernelLog.Ln; KernelLog.String( "L2BlockK=" ); KernelLog.Int( L2BlockK, 0 );
  5277. KernelLog.Ln; NEW( A, M, K ); NEW( B, K, N ); NEW( C, M, N ); FillMX( A ); FillMX( B );
  5278. IF debug THEN DispMX( A ); KernelLog.String( "*" ); KernelLog.Ln; DispMX( B );
  5279. END;
  5280. atime := Input.Time(); (* C := 0; *)
  5281. WHILE (iter > 0) DO
  5282. MultiplyX( ADDRESSOF( A[0, 0] ), ADDRESSOF( B[0, 0] ),
  5283. ADDRESSOF( C[0, 0] ), M, N, K, L2BlockM, L2BlockN, L2BlockK,
  5284. (*
  5285. 8,
  5286. LEN( A, 1 ) * 8, 8, LEN( B, 1 ) * 8, 8, LEN( C, 1 ) * 8
  5287. *)
  5288. SYSTEM.INCR( A, 1 ), SYSTEM.INCR( A, 0 ), SYSTEM.INCR( B, 1 ),
  5289. SYSTEM.INCR( B, 0 ), SYSTEM.INCR( C, 1 ), SYSTEM.INCR( C, 0 )
  5290. );
  5291. DEC( iter );
  5292. END;
  5293. atime := Input.Time() - atime; KernelLog.String( "overall time: " ); KernelLog.Int( atime, 10 ); KernelLog.Ln;
  5294. IF debug THEN
  5295. DispMX( A ); KernelLog.String( " * " ); KernelLog.Ln; DispMX( B ); KernelLog.String( " = " );
  5296. KernelLog.Ln; DispMX( C ); KernelLog.String( " ------------" ); KernelLog.Ln;
  5297. END;
  5298. IF check THEN
  5299. (*
  5300. NEW(D,M,N);
  5301. MatMulAXAXNaiive(ADDRESSOF( A[0, 0] ), ADDRESSOF( B[0, 0] ),
  5302. ADDRESSOF( D[0, 0] ), M, N, K, SYSTEM.INCR( A, 1 ), SYSTEM.INCR( A, 0 ), SYSTEM.INCR( B, 1 ),
  5303. SYSTEM.INCR( B, 0 ), SYSTEM.INCR( D, 1 ), SYSTEM.INCR( D, 0 ));
  5304. *)
  5305. D := A * B;
  5306. ASSERT ( ENTIER( D + 0.5 ) = ENTIER( C + 0.5 ) );
  5307. END;
  5308. END DoTestX;
  5309. PROCEDURE DoTestR( M, N, K, L2BlockM, L2BlockN, L2BlockK: LONGINT; check: BOOLEAN; iter: LONGINT );
  5310. VAR (* A, B, C, D: ARRAY [ .. , .. ] OF REAL; *)
  5311. A, B, C, D: ARRAY [ .. , .. ] OF REAL;
  5312. p, q: ANY; l1, l2: LONGINT; adrA, adrB, len, lenA, lenB: LONGINT;
  5313. matrixA, matrixB, matrixC: LONGINT; i: LONGINT; val: REAL; atime, time: LONGINT;
  5314. BEGIN
  5315. KernelLog.String( "REAL" ); KernelLog.Ln; KernelLog.String( "M=" ); KernelLog.Int( M, 0 ); KernelLog.Ln;
  5316. KernelLog.String( "N=" ); KernelLog.Int( N, 0 ); KernelLog.Ln; KernelLog.String( "K=" ); KernelLog.Int( K, 0 );
  5317. KernelLog.Ln; KernelLog.String( "L2BlockM=" ); KernelLog.Int( L2BlockM, 0 ); KernelLog.Ln; KernelLog.String( "L2BlockN=" );
  5318. KernelLog.Int( L2BlockN, 0 ); KernelLog.Ln; KernelLog.String( "L2BlockK=" ); KernelLog.Int( L2BlockK, 0 );
  5319. KernelLog.Ln; NEW( A, M, K ); NEW( B, K, N ); NEW( C, M, N ); FillMR( A ); FillMR( B );
  5320. IF debug THEN DispMR( A ); KernelLog.String( "*" ); KernelLog.Ln; DispMR( B );
  5321. END;
  5322. atime := Input.Time(); (* C := 0; *)
  5323. FOR i := 1 TO iter DO
  5324. MultiplyR( ADDRESSOF( A[0, 0] ), ADDRESSOF( B[0, 0] ),
  5325. ADDRESSOF( C[0, 0] ), M, N, K, L2BlockM, L2BlockN, L2BlockK,
  5326. (* 4,
  5327. LEN( A, 1 ) * 4, 4, LEN( B, 1 ) * 4, 4, LEN( C, 1 ) * 4 *)
  5328. SYSTEM.INCR( A, 1 ), SYSTEM.INCR( A, 0 ), SYSTEM.INCR( B, 1 ),
  5329. SYSTEM.INCR( B, 0 ), SYSTEM.INCR( C, 1 ), SYSTEM.INCR( C, 0 )
  5330. );
  5331. END;
  5332. atime := Input.Time() - atime; KernelLog.String( "overall time: " ); KernelLog.Int( atime, 10 ); KernelLog.Ln;
  5333. IF debug THEN
  5334. DispMR( A ); KernelLog.String( " * " ); KernelLog.Ln; DispMR( B ); KernelLog.String( " = " );
  5335. KernelLog.Ln; DispMR( C ); KernelLog.String( " ------------" ); KernelLog.Ln;
  5336. END;
  5337. IF check THEN
  5338. (*
  5339. NEW(D,M,N);
  5340. MatMulARARNaiive(ADDRESSOF( A[0, 0] ), ADDRESSOF( B[0, 0] ),
  5341. ADDRESSOF( D[0, 0] ), M, N, K, SYSTEM.INCR( A, 1 ), SYSTEM.INCR( A, 0 ), SYSTEM.INCR( B, 1 ),
  5342. SYSTEM.INCR( B, 0 ), SYSTEM.INCR( D, 1 ), SYSTEM.INCR( D, 0 ));
  5343. *)
  5344. D := A * B;
  5345. ASSERT ( ENTIER( D + 0.5 ) = ENTIER( C + 0.5 ) );
  5346. END;
  5347. END DoTestR;
  5348. PROCEDURE RandTestR*;
  5349. VAR iter, i, time: LONGINT;
  5350. MinM, MaxM, MinN, MaxN, MinK, MaxK, M, N, K, BM, BN, BK: LONGINT;
  5351. PROCEDURE Ran( Min, Max: LONGINT ): LONGINT;
  5352. BEGIN
  5353. IF Min = Max THEN RETURN Min
  5354. ELSE RETURN ran.Dice( Max - Min ) + Min
  5355. END;
  5356. END Ran;
  5357. BEGIN
  5358. In.Open(); In.LongInt( iter );
  5359. In.LongInt( MinM ); In.LongInt( MaxM ); In.LongInt( MinN );
  5360. In.LongInt( MaxN ); In.LongInt( MinK ); In.LongInt( MaxK );
  5361. FOR i := 1 TO iter DO KernelLog.Int( i, 10 ); KernelLog.Ln; DEC( iter );
  5362. M := Ran( MinM, MaxM ); N := Ran( MinN, MaxN );
  5363. K := Ran( MinK, MaxK );
  5364. IF N < 5 THEN N := 5 END;
  5365. IF K < 4 THEN K := 4 END;
  5366. BM := ran.Dice( M ) + 1; BN := ran.Dice( N ) + 1; BK := ran.Dice( K ) + 1;
  5367. BN := Align( BN, 5 );
  5368. IF BN > N THEN DEC( BN, 5 ) END;
  5369. BK := Align( BK, 4 );
  5370. IF BK > K THEN DEC( BK, 4 ) END;
  5371. DoTestR( M, N, K, BM, BN, BK, TRUE , 1 );
  5372. END;
  5373. END RandTestR;
  5374. PROCEDURE RandTestX*;
  5375. VAR iter, i, time: LONGINT;
  5376. MinM, MaxM, MinN, MaxN, MinK, MaxK, M, N, K, BM, BN, BK: LONGINT;
  5377. PROCEDURE Ran( Min, Max: LONGINT ): LONGINT;
  5378. BEGIN
  5379. IF Min = Max THEN RETURN Min
  5380. ELSE RETURN ran.Dice( Max - Min ) + Min
  5381. END;
  5382. END Ran;
  5383. BEGIN
  5384. In.Open(); In.LongInt( iter );
  5385. In.LongInt( MinM ); In.LongInt( MaxM ); In.LongInt( MinN );
  5386. In.LongInt( MaxN ); In.LongInt( MinK ); In.LongInt( MaxK );
  5387. FOR i := 1 TO iter DO KernelLog.Int( i, 10 ); KernelLog.Ln; DEC( iter );
  5388. M := Ran( MinM, MaxM ); N := Ran( MinN, MaxN );
  5389. K := Ran( MinK, MaxK );
  5390. IF N < 5 THEN N := 5 END;
  5391. IF K < 4 THEN K := 4 END;
  5392. BM := ran.Dice( M ) + 1; BN := ran.Dice( N ) + 1; BK := ran.Dice( K ) + 1;
  5393. BN := Align( BN, 5 );
  5394. IF BN > N THEN DEC( BN, 5 ) END;
  5395. BK := Align( BK, 4 );
  5396. IF BK > K THEN DEC( BK, 4 ) END;
  5397. DoTestX( M, N, K, BM, BN, BK, TRUE , 1 );
  5398. END;
  5399. END RandTestX;
  5400. *)
  5401. (*
  5402. PROCEDURE Times*;
  5403. VAR all: HUGEINT;
  5404. BEGIN
  5405. all := allocT + copyT + zeroT + compT; KernelLog.String( "alloc=" );
  5406. KernelLog.LongRealFix( allocT / 1000000, 0, 20 ); KernelLog.String( "[" );
  5407. KernelLog.Int( ENTIER( 100 * allocT / all + 0.5 ), 5 ); KernelLog.String( "%]" );
  5408. KernelLog.Ln; KernelLog.String( "copy=" );
  5409. KernelLog.LongRealFix( copyT / 1000000, 0, 20 ); KernelLog.String( "[" );
  5410. KernelLog.Int( ENTIER( 100 * copyT / all + 0.5 ), 5 ); KernelLog.String( "%]" );
  5411. KernelLog.Ln; KernelLog.String( "zero=" );
  5412. KernelLog.LongRealFix( zeroT / 1000000, 0, 20 ); KernelLog.String( "[" );
  5413. KernelLog.Int( ENTIER( 100 * zeroT / all + 0.5 ), 5 ); KernelLog.String( "%]" );
  5414. KernelLog.Ln; KernelLog.String( "comp=" );
  5415. KernelLog.LongRealFix( compT / 1000000, 0, 20 ); KernelLog.String( "[" );
  5416. KernelLog.Int( ENTIER( 100 * compT / all + 0.5 ), 5 ); KernelLog.String( "%]" );
  5417. KernelLog.Ln;
  5418. END Times;
  5419. *)
  5420. (*
  5421. PROCEDURE TestRMM*;
  5422. VAR M, N, K, L2BlockM, L2BlockN, L2BlockK: LONGINT; matrixA, matrixB, matrixC: LONGINT;
  5423. check, iter: LONGINT;
  5424. BEGIN
  5425. In.Open; In.LongInt( M ); In.LongInt( N ); In.LongInt( K );
  5426. In.LongInt( L2BlockM ); In.LongInt( L2BlockN ); In.LongInt( L2BlockK );
  5427. In.LongInt( iter ); In.LongInt( check );
  5428. IF L2BlockM = 0 THEN
  5429. MagicBlockR( M DIV nrProcesses, N, K, L2BlockM, L2BlockN, L2BlockK );
  5430. END;
  5431. DoTestR( M, N, K, L2BlockM, L2BlockN, L2BlockK, check = 1, iter ); Times();
  5432. END TestRMM;
  5433. PROCEDURE TestXMM*;
  5434. VAR M, N, K, L2BlockM, L2BlockN, L2BlockK: LONGINT; matrixA, matrixB, matrixC: LONGINT;
  5435. iter, check: LONGINT;
  5436. BEGIN
  5437. In.Open; In.LongInt( M ); In.LongInt( N ); In.LongInt( K );
  5438. In.LongInt( L2BlockM ); In.LongInt( L2BlockN ); In.LongInt( L2BlockK );
  5439. In.LongInt( iter ); In.LongInt( check );
  5440. IF L2BlockM = 0 THEN
  5441. MagicBlockX( M DIV nrProcesses, N, K, L2BlockM, L2BlockN, L2BlockK );
  5442. END;
  5443. DoTestX( M, N, K, L2BlockM, L2BlockN, L2BlockK, check = 1, iter ); Times();
  5444. END TestXMM;
  5445. *)
  5446. (****** matrix multiplication using fast scalar product ******)
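(* each of the following wrappers computes one element of C as the scalar product of a row of A and a column of B: the plain MatMul* variants first clear the destination, the MatMulInc* variants accumulate onto it; AXAX works on LONGREAL, ARAR on REAL, and the SSE variants call the vectorized scalar-product kernels instead of the generic loops *)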
  5447. PROCEDURE MatMulAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  5448. BEGIN
  5449. SYSTEM.PUT( dadr, 0.0D0 ); (* initialization of scalar product to 0 *)
  5450. SPAXAXLoopA( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  5451. END MatMulAXAXLoopA;
  5452. PROCEDURE MatMulAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  5453. BEGIN
  5454. SYSTEM.PUT( dadr, 0.0D0 ); (* initialization of scalar product to 0 *)
  5455. SPAXAXLoopSSE( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  5456. END MatMulAXAXLoopSSE;
  5457. PROCEDURE MatMulARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  5458. BEGIN
  5459. SYSTEM.PUT( dadr, 0.0E0 ); (* initialization of scalar product to 0 *)
  5460. SPARARLoopA( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  5461. END MatMulARARLoopA;
  5462. PROCEDURE MatMulARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  5463. BEGIN
  5464. SYSTEM.PUT( dadr, 0.0E0 ); (* initialization of scalar product to 0 *)
  5465. SPARARLoopSSE( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  5466. END MatMulARARLoopSSE;
  5467. PROCEDURE MatMulIncAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  5468. BEGIN
  5469. SPAXAXLoopA( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  5470. END MatMulIncAXAXLoopA;
  5471. PROCEDURE MatMulIncAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  5472. BEGIN
  5473. SPAXAXLoopSSE( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  5474. END MatMulIncAXAXLoopSSE;
  5475. PROCEDURE MatMulIncARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  5476. BEGIN
  5477. SPARARLoopA( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  5478. END MatMulIncARARLoopA;
  5479. PROCEDURE MatMulIncARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  5480. BEGIN
  5481. SPARARLoopSSE( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  5482. END MatMulIncARARLoopSSE;
5483. (****** matrix multiplication over rows with transposition of B ******)
  5484. PROCEDURE MatMulHBlockR( MatrixA, MatrixB, MatrixC: ADDRESS;
  5485. (*inc=4*) Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
  5486. add: BOOLEAN );
  5487. VAR fromA, toA, fromB, toB: SIZE; BlockSize: SIZE;
  5488. (*
  5489. computation of C[i,j] = Sum{k=0..Cols-1} A[i,k]*B[j,k], i.e. A*B`
  5490. *)
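(* the rows of A and of the already transposed B are processed in blocks of BlockSize rows, derived from L2CacheSize unless cBlockSize overrides it, so that both operands stay cache-resident; each C[i,j] is computed by one aligned SSE scalar product (AlignedSPRSSE) *)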
  5491. PROCEDURE Block( fromA, toA, fromB, toB: SIZE );
  5492. VAR i, j: SIZE; adrA, adrB, adrC: ADDRESS;
  5493. BEGIN
  5494. FOR i := fromA TO toA - 1 DO
  5495. adrA := MatrixA + i * Stride;
  5496. FOR j := fromB TO toB - 1 DO
  5497. adrB := MatrixB + j * Stride;
  5498. adrC := MatrixC + i * StrideC + j * IncC;
  5499. AlignedSPRSSE( adrA, adrB, adrC, Cols, add );
  5500. END;
  5501. END;
  5502. END Block;
  5503. BEGIN
  5504. IF cBlockSize = 0 THEN
  5505. BlockSize := L2CacheSize DIV Stride DIV 4;
  5506. ELSE BlockSize := cBlockSize;
  5507. END;
  5508. lastUsedBlockSize := BlockSize;
  5509. fromA := 0;
  5510. REPEAT
  5511. toA := fromA + BlockSize;
  5512. IF toA > RowsA THEN toA := RowsA END;
  5513. fromB := 0;
  5514. REPEAT
  5515. toB := fromB + BlockSize;
  5516. IF toB > RowsB THEN toB := RowsB END;
  5517. Block( fromA, toA, fromB, toB ); fromB := toB;
  5518. UNTIL toB = RowsB;
  5519. fromA := toA;
  5520. UNTIL toA = RowsA;
  5521. END MatMulHBlockR;
  5522. PROCEDURE MatMulHBlockX( MatrixA, MatrixB, MatrixC: ADDRESS;
5523. (*inc=8*) Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
  5524. add: BOOLEAN );
  5525. VAR fromA, toA, fromB, toB: SIZE; BlockSize: SIZE;
  5526. (*
  5527. computation of C[i,j] = Sum{k=0..Cols-1} A[i,k]*B[j,k], i.e. A*B`
  5528. *)
  5529. PROCEDURE Block( fromA, toA, fromB, toB: SIZE );
  5530. VAR adrA, adrB, adrC: ADDRESS; i, j: SIZE;
  5531. BEGIN
  5532. FOR i := fromA TO toA - 1 DO
  5533. adrA := MatrixA + i * Stride;
  5534. FOR j := fromB TO toB - 1 DO
  5535. adrB := MatrixB + j * Stride;
  5536. adrC := MatrixC + i * StrideC + j * IncC;
  5537. AlignedSPXSSE( adrA, adrB, adrC, Cols, add );
  5538. END;
  5539. END;
  5540. END Block;
  5541. BEGIN
  5542. IF cBlockSize = 0 THEN
  5543. BlockSize := L2CacheSize DIV Stride DIV 8;
  5544. ELSE BlockSize := cBlockSize;
  5545. END;
  5546. lastUsedBlockSize := BlockSize;
  5547. fromA := 0;
  5548. REPEAT
  5549. toA := fromA + BlockSize;
  5550. IF toA > RowsA THEN toA := RowsA END;
  5551. fromB := 0;
  5552. REPEAT
  5553. toB := fromB + BlockSize;
  5554. IF toB > RowsB THEN toB := RowsB END;
  5555. Block( fromA, toA, fromB, toB ); fromB := toB;
  5556. UNTIL toB = RowsB;
  5557. fromA := toA;
  5558. UNTIL toA = RowsA;
  5559. END MatMulHBlockX;
  5560. PROCEDURE CopyDataR( src, dest: ADDRESS; incSrc, strideSrc, incDest, strideDest, rows, cols:SIZE ); (*! optimize *)
  5561. VAR i: SIZE; t: HUGEINT;
  5562. BEGIN
  5563. Tic( t );
  5564. FOR i := 0 TO rows - 1 DO
  5565. Copy4( src, dest, incSrc, incDest, cols );
  5566. INC( src, strideSrc ); INC( dest, strideDest );
  5567. END;
  5568. Toc( t, copyT );
  5569. END CopyDataR;
  5570. PROCEDURE CopyDataX( src, dest: ADDRESS; incSrc, strideSrc, incDest, strideDest, rows, cols:SIZE ); (*! optimize *)
  5571. VAR i: SIZE; t: HUGEINT;
  5572. BEGIN
  5573. Tic( t );
  5574. FOR i := 0 TO rows - 1 DO
  5575. Copy8( src, dest, incSrc, incDest, cols );
  5576. INC( src, strideSrc ); INC( dest, strideDest );
  5577. END;
  5578. Toc( t, copyT );
  5579. END CopyDataX;
  5580. PROCEDURE MatMulARARTransposed( matrixA, matrixB, matrixC: ADDRESS;
  5581. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  5582. add: BOOLEAN ): BOOLEAN;
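(* computes C (+)= A * B by copying A and B into 16-byte aligned buffers when necessary, with B stored transposed, so that every element of C becomes a dot product of two contiguous, aligned rows; with nrProcesses > 1 the columns of B are distributed over MatMulHObjR worker objects, otherwise MatMulHBlockR does all the work *)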
  5583. VAR stride: SIZE; adrB, adrC: ADDRESS;
  5584. proc: POINTER TO ARRAY OF MatMulHObjR;
  5585. from, to0, i: SIZE; cacheA, cacheB: Cache;
  5586. t: HUGEINT;
  5587. BEGIN
  5588. NEW(proc,nrProcesses);
  5589. ASSERT( ColsA = RowsB );
  5590. (* allocate 128 bit = 16 byte aligned matrix *)
  5591. stride := Align( ColsA * SIZEOF( REAL ), 16 );
  5592. IF (IncA # SIZEOF( REAL )) OR (StrideA # stride) OR
  5593. (matrixA MOD 16 # 0) THEN
  5594. cacheA := cachePool.Acquire( stride * RowsA );
  5595. CopyDataR( matrixA, cacheA.adr, IncA, StrideA,
  5596. SIZEOF( REAL ), stride, RowsA, ColsA ); (* copy to array *)
  5597. matrixA := cacheA.adr;
  5598. ELSE cacheA := NIL;
  5599. END;
  5600. IF (StrideB # SIZEOF( REAL )) OR (IncB # stride) OR
  5601. (matrixB MOD 16 # 0) THEN
  5602. cacheB := cachePool.Acquire( stride * ColsB );
  5603. CopyDataR( matrixB, cacheB.adr, StrideB, IncB,
  5604. SIZEOF( REAL ), stride, ColsB, RowsB ); (* copy to transposed array *)
  5605. matrixB := cacheB.adr;
  5606. ELSE cacheB := NIL;
  5607. END;
  5608. Tic( t );
5609. (*! needs a decision rule whether to split by rows or by columns *)
  5610. IF nrProcesses > 1 THEN
  5611. from := 0;
  5612. FOR i := 0 TO nrProcesses - 1 DO
  5613. (*
  5614. to := RowsA * (i + 1) DIV nrProcesses; adrA := matrixA + from * stride;
  5615. adrC := matrixC + from * StrideC;
  5616. *)
  5617. to0 := ColsB * (i + 1) DIV nrProcesses; (*! move to rows ? *)
  5618. adrB := matrixB + from * stride;
  5619. adrC := matrixC + from * IncC;
  5620. NEW( proc[i], matrixA, adrB, adrC, stride, IncC, StrideC,
  5621. RowsA, to0 - from, RowsB, add );
  5622. from := to0;
  5623. END;
  5624. FOR i := 0 TO nrProcesses - 1 DO proc[i].Wait(); END;
  5625. ELSE
  5626. MatMulHBlockR( matrixA, matrixB, matrixC, stride, IncC,
  5627. StrideC, RowsA, ColsB, RowsB, add );
  5628. END;
  5629. Toc( t, compT ); cachePool.Release( cacheA );
  5630. cachePool.Release( cacheB ); RETURN TRUE;
  5631. END MatMulARARTransposed;
  5632. PROCEDURE MatMulAXAXTransposed( matrixA, matrixB, matrixC: ADDRESS;
  5633. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  5634. add: BOOLEAN ): BOOLEAN;
  5635. VAR stride: SIZE; adrB, adrC: ADDRESS;
  5636. proc: POINTER TO ARRAY OF MatMulHObjX;
  5637. from, to0, i: SIZE; cacheA, cacheB: Cache;
  5638. t: HUGEINT;
  5639. BEGIN
  5640. NEW(proc,nrProcesses);
  5641. ASSERT( ColsA = RowsB );
  5642. stride := Align( ColsA * SIZEOF( LONGREAL ), 16 );
  5643. IF (IncA # SIZEOF( LONGREAL )) OR (StrideA # stride) OR
  5644. (matrixA MOD 16 # 0) THEN
  5645. cacheA := cachePool.Acquire( stride * RowsA );
  5646. CopyDataX( matrixA, cacheA.adr, IncA, StrideA,
  5647. SIZEOF( LONGREAL ), stride, RowsA, ColsA ); (* copy to array *)
  5648. matrixA := cacheA.adr;
  5649. ELSE cacheA := NIL;
  5650. END;
  5651. IF (StrideB # SIZEOF( LONGREAL )) OR (IncB # stride) OR
  5652. (matrixB MOD 16 # 0) THEN
  5653. cacheB := cachePool.Acquire( stride * ColsB );
  5654. CopyDataX( matrixB, cacheB.adr, StrideB, IncB,
  5655. SIZEOF( LONGREAL ), stride, ColsB, RowsB ); (* copy to transposed array *)
  5656. matrixB := cacheB.adr;
  5657. ELSE cacheB := NIL;
  5658. END;
  5659. Tic( t );
  5660. IF nrProcesses > 1 THEN
  5661. from := 0;
  5662. FOR i := 0 TO nrProcesses - 1 DO
  5663. to0 := ColsB * (i + 1) DIV nrProcesses; (*! move to rows ? *)
  5664. adrB := matrixB + from * stride;
  5665. adrC := matrixC + from * IncC;
  5666. NEW( proc[i], matrixA, adrB, adrC, stride, IncC, StrideC,
  5667. RowsA, to0 - from, RowsB, add );
  5668. from := to0;
  5669. END;
  5670. FOR i := 0 TO nrProcesses - 1 DO proc[i].Wait(); END;
  5671. ELSE
  5672. MatMulHBlockX( matrixA, matrixB, matrixC, stride, IncC,
  5673. StrideC, RowsA, ColsB, RowsB, add );
  5674. END;
  5675. Toc( t, compT ); cachePool.Release( cacheA );
  5676. cachePool.Release( cacheB ); RETURN TRUE;
  5677. END MatMulAXAXTransposed;
5678. (****** strided matrix multiplication with restrictions on the increments ******)
  5679. PROCEDURE MatMulARARSSEStride( matrixA, matrixB, matrixC: ADDRESS;
  5680. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  5681. add: BOOLEAN ): BOOLEAN;
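(* strided REAL multiplication: operands whose element increment differs from SIZEOF(REAL) are first copied into contiguous buffers, then the columns of C are processed in SSE blocks of 24, 16, 8 and 4 columns, with a plain Oberon triple loop for the remaining columns; if C had to be copied, the result is written back at the end *)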
  5682. VAR sum: REAL; CbFrom, i, j, k: SIZE; valA, valB: REAL;
  5683. adrA, adrB, adrC: ADDRESS;
  5684. cacheA, cacheB, cacheC: Cache;
  5685. matrixCO, StrideCO, IncCO: SIZE; t: HUGEINT;
  5686. (*VAR fromA, toA: LONGINT; *)
  5687. BEGIN
  5688. IF (IncA # SIZEOF( REAL )) THEN
  5689. cacheA :=
  5690. cachePool.Acquire( RowsA * ColsA * SIZEOF( REAL ) );
  5691. CopyDataR( matrixA, cacheA.adr, IncA, StrideA,
  5692. SIZEOF( REAL ), SIZEOF( REAL ) * ColsA, RowsA,
  5693. ColsA );
  5694. matrixA := cacheA.adr; IncA := SIZEOF( REAL );
  5695. StrideA := SIZEOF( REAL ) * ColsA;
  5696. END;
  5697. IF (IncB # SIZEOF( REAL )) THEN
  5698. cacheB :=
  5699. cachePool.Acquire( RowsB * ColsB * SIZEOF( REAL ) );
  5700. CopyDataR( matrixB, cacheB.adr, IncB, StrideB,
  5701. SIZEOF( REAL ), SIZEOF( REAL ) * ColsB, RowsB,
  5702. ColsB );
  5703. matrixB := cacheB.adr; IncB := SIZEOF( REAL );
  5704. StrideB := SIZEOF( REAL ) * ColsB;
  5705. END;
  5706. IF (IncC # SIZEOF( REAL )) THEN
  5707. cacheC :=
  5708. cachePool.Acquire( RowsA * ColsB * SIZEOF( REAL ) );
  5709. CopyDataR( matrixC, cacheC.adr, IncC, StrideC,
  5710. SIZEOF( REAL ), SIZEOF( REAL ) * ColsB, RowsA,
  5711. ColsB );
  5712. matrixCO := matrixC; StrideCO := StrideC;
  5713. IncCO := IncC; matrixC := cacheC.adr;
  5714. IncC := SIZEOF( REAL ); StrideC := SIZEOF( REAL ) * ColsB;
  5715. END;
  5716. Tic( t );
  5717. CbFrom := 0;
  5718. IF ColsB >= 24 THEN
  5719. SSEMul24BlockR( CbFrom, StrideA, StrideB, StrideC,
  5720. ColsA, RowsA, ColsB, RowsB, matrixA,
  5721. matrixB, matrixC, add );
  5722. END;
  5723. IF ColsB - CbFrom >= 16 THEN
  5724. SSEMul16BlockR( StrideA, StrideB, StrideC, ColsA, RowsA,
  5725. CbFrom, matrixA, matrixB, matrixC, add );
  5726. INC( CbFrom, 16 );
  5727. END;
  5728. IF ColsB - CbFrom >= 8 THEN
  5729. SSEMul8BlockR( StrideA, StrideB, StrideC, ColsA, RowsA,
  5730. CbFrom, matrixA, matrixB, matrixC, add );
  5731. INC( CbFrom, 8 );
  5732. END;
  5733. IF ColsB - CbFrom >= 4 THEN
  5734. SSEMul4BlockR( StrideA, StrideB, StrideC, ColsA, RowsA,
  5735. CbFrom, matrixA, matrixB, matrixC, add );
  5736. INC( CbFrom, 4 );
  5737. END;
  5738. IF ColsB - CbFrom > 0 THEN
  5739. (* do it in Oberon *)
  5740. FOR i := 0 TO RowsA - 1 DO
  5741. adrC := matrixC + i * StrideC + CbFrom * IncC;
  5742. FOR j := CbFrom TO ColsB - 1 DO
  5743. adrA := matrixA + i * StrideA;
  5744. adrB := matrixB + j * IncB;
  5745. IF add THEN SYSTEM.GET( adrC, sum )
  5746. ELSE sum := 0
  5747. END;
  5748. FOR k := 0 TO RowsB - 1 DO
  5749. SYSTEM.GET( adrA, valA );
  5750. SYSTEM.GET( adrB, valB );
  5751. sum := sum + (* A[i, k] * B[k, j] *) valA * valB;
  5752. INC( adrA, IncA ); INC( adrB, StrideB );
  5753. END;
  5754. SYSTEM.PUT( adrC, sum ); INC( adrC, IncC );
  5755. (* C[i, j] := sum; *)
  5756. END;
  5757. END;
  5758. END;
  5759. Toc( t, compT );
  5760. IF cacheC # NIL THEN
  5761. CopyDataR( matrixC, matrixCO, IncC, StrideC, IncCO,
  5762. StrideCO, RowsA, ColsB );
  5763. END;
  5764. cachePool.Release( cacheA );
  5765. cachePool.Release( cacheB );
  5766. cachePool.Release( cacheC );
  5767. RETURN TRUE;
  5768. END MatMulARARSSEStride;
  5769. PROCEDURE MatMulAXAXSSEStride( matrixA, matrixB, matrixC: ADDRESS;
  5770. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  5771. add: BOOLEAN ): BOOLEAN;
  5772. VAR sum: LONGREAL; CbFrom, i, j, k: SIZE;
  5773. valA, valB: LONGREAL; adrA, adrB, adrC: ADDRESS;
  5774. cacheA, cacheB, cacheC: Cache;
  5775. matrixCO, StrideCO, IncCO:SIZE; t: HUGEINT;
  5776. BEGIN
  5777. IF (IncA # SIZEOF( LONGREAL )) THEN
  5778. cacheA :=
  5779. cachePool.Acquire( RowsA * ColsA * SIZEOF( LONGREAL ) );
  5780. CopyDataX( matrixA, cacheA.adr, IncA, StrideA,
  5781. SIZEOF( LONGREAL ),
  5782. SIZEOF( LONGREAL ) * ColsA, RowsA, ColsA );
  5783. matrixA := cacheA.adr;
  5784. StrideA := SIZEOF( LONGREAL ) * ColsA;
  5785. IncA := SIZEOF( LONGREAL );
  5786. END;
  5787. IF (IncB # SIZEOF( LONGREAL )) THEN
  5788. cacheB :=
  5789. cachePool.Acquire( RowsB * ColsB * SIZEOF( LONGREAL ) );
  5790. CopyDataX( matrixB, cacheB.adr, IncB, StrideB,
  5791. SIZEOF( LONGREAL ),
  5792. SIZEOF( LONGREAL ) * ColsB, RowsB, ColsB );
  5793. matrixB := cacheB.adr;
  5794. StrideB := SIZEOF( LONGREAL ) * ColsB;
  5795. IncB := SIZEOF( LONGREAL );
  5796. END;
  5797. IF (IncC # SIZEOF( LONGREAL )) THEN
  5798. cacheC :=
  5799. cachePool.Acquire( RowsA * ColsB * SIZEOF( LONGREAL ) );
  5800. CopyDataX( matrixC, cacheC.adr, IncC, StrideC,
  5801. SIZEOF( LONGREAL ),
  5802. SIZEOF( LONGREAL ) * ColsB, RowsA, ColsB );
  5803. matrixCO := matrixC; StrideCO := StrideC;
  5804. IncCO := IncC; StrideC := SIZEOF( LONGREAL ) * ColsB;
  5805. IncC := SIZEOF( LONGREAL ); matrixC := cacheC.adr;
  5806. END;
  5807. Tic( t );
  5808. CbFrom := 0;
  5809. IF ColsB >= 12 THEN
  5810. SSEMul12BlockX( CbFrom, StrideA, StrideB, StrideC,
  5811. ColsA, RowsA, ColsB, RowsB, matrixA,
  5812. matrixB, matrixC, add );
  5813. END;
  5814. IF ColsB - CbFrom >= 8 THEN
  5815. SSEMul8BlockX( StrideA, StrideB, StrideC, ColsA, RowsA,
  5816. CbFrom, matrixA, matrixB, matrixC, add );
  5817. INC( CbFrom, 8 );
  5818. END;
  5819. IF ColsB - CbFrom >= 4 THEN
  5820. SSEMul4BlockX( StrideA, StrideB, StrideC, ColsA, RowsA,
  5821. CbFrom, matrixA, matrixB, matrixC, add );
  5822. INC( CbFrom, 4 );
  5823. END;
  5824. IF ColsB - CbFrom >= 2 THEN
  5825. SSEMul2BlockX( StrideA, StrideB, StrideC, ColsA, RowsA,
  5826. CbFrom, matrixA, matrixB, matrixC, add );
  5827. INC( CbFrom, 2 );
  5828. END;
  5829. IF ColsB - CbFrom > 0 THEN
  5830. (* do it in Oberon *)
  5831. FOR i := 0 TO RowsA - 1 DO
  5832. adrC := matrixC + i * StrideC + CbFrom * IncC;
  5833. FOR j := CbFrom TO ColsB - 1 DO
  5834. adrA := matrixA + i * StrideA;
  5835. adrB := matrixB + j * IncB;
  5836. IF add THEN SYSTEM.GET( adrC, sum )
  5837. ELSE sum := 0
  5838. END;
  5839. FOR k := 0 TO RowsB - 1 DO
  5840. SYSTEM.GET( adrA, valA );
  5841. SYSTEM.GET( adrB, valB );
  5842. sum := sum + (* A[i, k] * B[k, j] *) valA * valB;
  5843. INC( adrA, IncA ); INC( adrB, StrideB );
  5844. END;
  5845. SYSTEM.PUT( adrC, sum ); INC( adrC, IncC );
  5846. (* C[i, j] := sum; *)
  5847. END;
  5848. END;
  5849. END;
  5850. Toc( t, compT );
  5851. IF cacheC # NIL THEN
  5852. CopyDataX( matrixC, matrixCO, IncC, StrideC, IncCO,
  5853. StrideCO, RowsA, ColsB );
  5854. END;
  5855. cachePool.Release( cacheA );
  5856. cachePool.Release( cacheB );
  5857. cachePool.Release( cacheC );
  5858. RETURN TRUE;
  5859. END MatMulAXAXSSEStride;
5860. (****** naive Oberon matrix multiplication ******)
  5861. PROCEDURE MatMulARARNaiive( matrixA, matrixB, matrixC: ADDRESS;
  5862. IncA, StrideA, IncB, StrideB, IncC, StrideC, M, N, K: SIZE;
  5863. add: BOOLEAN );
  5864. (*
  5865. A is M x K matrix, M=rows (A); K=cols(A);
  5866. B is K x N matrix; K=rows(B); N = cols(B);
  5867. C is M x N matrix; M=rows(C); N=cols(C);
  5868. *)
  5869. VAR adrA, adrB, innerB, adrC: ADDRESS; i, j, k: SIZE;
  5870. val1, val2, sum: REAL; t: HUGEINT;
  5871. BEGIN
  5872. Tic( t );
  5873. FOR i := 1 TO M DO
  5874. adrC := matrixC; adrB := matrixB;
  5875. FOR j := 1 TO N DO
  5876. adrA := matrixA; innerB := adrB;
  5877. IF add THEN SYSTEM.GET( adrC, sum ) ELSE sum := 0 END;
  5878. FOR k := 1 TO K DO
  5879. SYSTEM.GET( adrA, val1 );
  5880. SYSTEM.GET( innerB, val2 );
  5881. sum := sum + val1 * val2; INC( adrA, IncA );
  5882. INC( innerB, StrideB );
  5883. END;
  5884. SYSTEM.PUT( adrC, sum ); INC( adrB, IncB );
  5885. INC( adrC, IncC );
  5886. END;
  5887. INC( matrixA, StrideA ); INC( matrixC, StrideC );
  5888. END;
  5889. Toc( t, compT );
  5890. END MatMulARARNaiive;
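(* In index form the loop above computes, for 0 <= i < M and 0 <= j < N,
   C[i,j] := (C[i,j] +) sum over k of A[i,k] * B[k,j], where element (i,j) of a
   matrix described by (base, Inc, Stride) lives at address base + i*Stride + j*Inc.
   A minimal, illustrative access helper for the REAL case (the name GetR is not
   part of this module):

PROCEDURE GetR( base: ADDRESS; Inc, Stride, i, j: SIZE ): REAL;
VAR x: REAL;
BEGIN
	SYSTEM.GET( base + i * Stride + j * Inc, x );  RETURN x
END GetR;
*)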
  5891. PROCEDURE MatMulAXAXNaiive( matrixA, matrixB, matrixC: ADDRESS;
  5892. IncA, StrideA, IncB, StrideB, IncC, StrideC, M, N, K: SIZE;
  5893. add: BOOLEAN );
  5894. (*
  5895. A is M x K matrix, M=rows (A); K=cols(A);
  5896. B is K x N matrix; K=rows(B); N = cols(B);
  5897. C is M x N matrix; M=rows(C); N=cols(C);
  5898. *)
  5899. VAR adrA, adrB, innerB, adrC: ADDRESS; i, j, k: SIZE;
  5900. val1, val2, sum: LONGREAL; t: HUGEINT;
  5901. BEGIN
  5902. Tic( t );
  5903. FOR i := 1 TO M DO
  5904. adrC := matrixC; adrB := matrixB;
  5905. FOR j := 1 TO N DO
  5906. adrA := matrixA; innerB := adrB;
  5907. IF add THEN SYSTEM.GET( adrC, sum ) ELSE sum := 0 END;
  5908. FOR k := 1 TO K DO
  5909. SYSTEM.GET( adrA, val1 );
  5910. SYSTEM.GET( innerB, val2 );
  5911. sum := sum + val1 * val2; INC( adrA, IncA );
  5912. INC( innerB, StrideB );
  5913. END;
  5914. SYSTEM.PUT( adrC, sum ); INC( adrB, IncB );
  5915. INC( adrC, IncC );
  5916. END;
  5917. INC( matrixA, StrideA ); INC( matrixC, StrideC );
  5918. END;
  5919. Toc( t, compT );
  5920. END MatMulAXAXNaiive;
  5921. (*
  5922. PROCEDURE Toggle( VAR A, B: LONGINT );
  5923. VAR temp: LONGINT;
  5924. BEGIN
  5925. temp := A; A := B; B := temp;
  5926. END Toggle;
  5927. PROCEDURE Transpose( VAR matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, M, N, K: LONGINT );
  5928. (*
  5929. prepare computation of C=A*B via C = (B` * A`)`
  5930. *)
  5931. BEGIN
  5932. Toggle( matrixA, matrixB ); Toggle( IncA, StrideB ); Toggle( StrideA, IncB );
  5933. Toggle( IncC, StrideC ); Toggle( M, N );
  5934. END Transpose;
  5935. *)
  5936. (*
  5937. *)
  5938. PROCEDURE BestMethod( M, N, K: SIZE ): LONGINT;
  5939. BEGIN
  5940. IF M = 1 THEN
  5941. IF N < 32 THEN RETURN cMatMulScalarProduct
  5942. ELSIF N < 256 THEN
  5943. IF K < 256 THEN RETURN cMatMulScalarProduct
  5944. ELSE RETURN cMatMulStride
  5945. END;
  5946. ELSE RETURN cMatMulStride
  5947. END;
  5948. ELSIF N = 1 THEN
  5949. IF (M > 1024) & (K > 1024) THEN
  5950. RETURN cMatMulTransposed
  5951. ELSE RETURN cMatMulScalarProduct
  5952. END;
  5953. ELSIF K = 1 THEN
  5954. IF N < 32 THEN
  5955. IF M < 256 THEN RETURN cMatMulNaive
  5956. ELSE RETURN cMatMulStride
  5957. END;
  5958. ELSIF N < 256 THEN
  5959. IF M < 32 THEN RETURN cMatMulNaive
  5960. ELSE RETURN cMatMulStride
  5961. END;
  5962. ELSE RETURN cMatMulStride
  5963. END;
  5964. ELSIF M < 32 THEN
  5965. IF N < 32 THEN RETURN cMatMulScalarProduct
  5966. ELSIF N < 256 THEN
  5967. IF K < 32 THEN RETURN cMatMulScalarProduct
  5968. ELSE RETURN cMatMulStride
  5969. END;
  5970. ELSE RETURN cMatMulStride
  5971. END;
  5972. ELSIF M < 256 THEN
  5973. IF N < 32 THEN
  5974. IF K < 32 THEN RETURN cMatMulScalarProduct
  5975. ELSE RETURN cMatMulStride
  5976. END;
  5977. ELSE
  5978. IF K < 256 THEN RETURN cMatMulStride
  5979. ELSE RETURN cMatMulBlocked
  5980. END;
  5981. END;
  5982. ELSE
  5983. IF N < 32 THEN RETURN cMatMulStride ELSE
  5984. IF K < 256 THEN RETURN cMatMulStride
  5985. ELSE RETURN cMatMulBlocked
  5986. END;
  5987. END;
  5988. END;
  5989. RETURN cMatMulStride;
  5990. END BestMethod;
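(* Worked examples of the heuristic above (values follow directly from the branches;
   M = rows(A), N = cols(B), K = cols(A)):
     BestMethod( 1, 16, 100 )       -> cMatMulScalarProduct  (single row, narrow result)
     BestMethod( 16, 16, 1000 )     -> cMatMulScalarProduct  (both result dimensions below 32)
     BestMethod( 100, 100, 100 )    -> cMatMulStride         (medium sizes, K below 256)
     BestMethod( 1000, 1000, 1000 ) -> cMatMulBlocked        (large in all dimensions)
*)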
  5991. (*
  5992. (N) (K) (N)
  5993. CCCCCC AAAAA BBBBB
  5994. CCCCCC AAAAA BBBBB
  5995. (M) CCCCCC = (M) AAAAA * (K) BBBBB
  5996. CCCCCC AAAAA BBBBB
  5997. CCCCCC AAAAA BBBBB
  5998. *)
  5999. PROCEDURE MatMulR( matrixA, matrixB, matrixC: ADDRESS;
  6000. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
6001. (*! the heuristics for the choice between the different methods need improvement *)
6002. (*! transpose if superior *)
  6003. (*! provide special variant for small [up to 4x4] matrices *)
  6004. VAR M, N, K: SIZE;
  6005. BEGIN
  6006. ASSERT( ColsA = RowsB );
  6007. M := RowsA; N := ColsB; K := ColsA;
  6008. CASE BestMethod( M, N, K ) OF
  6009. | cMatMulScalarProduct:
  6010. RETURN FALSE;
  6011. | cMatMulNaive:
  6012. RETURN MatMulRNaive( matrixA, matrixB, matrixC, IncA,
  6013. StrideA, IncB, StrideB, IncC,
  6014. StrideC, RowsA, ColsA, RowsB,
  6015. ColsB );
  6016. | cMatMulTransposed:
  6017. RETURN MatMulARARTransposed( matrixA, matrixB,
  6018. matrixC, IncA,
  6019. StrideA, IncB,
  6020. StrideB, IncC,
  6021. StrideC, RowsA,
  6022. ColsA, RowsB,
  6023. ColsB, FALSE );
  6024. | cMatMulStride:
  6025. RETURN MatMulARARSSEStride( matrixA, matrixB,
  6026. matrixC, IncA, StrideA,
  6027. IncB, StrideB, IncC,
  6028. StrideC, RowsA,
  6029. ColsA, RowsB, ColsB,
  6030. FALSE );
  6031. | cMatMulBlocked:
  6032. RETURN MatMulARARBlocked( matrixA, matrixB,
  6033. matrixC, IncA, StrideA,
  6034. IncB, StrideB, IncC,
  6035. StrideC, RowsA, ColsA,
  6036. RowsB, ColsB, FALSE );
  6037. ELSE
  6038. RETURN FALSE (* use scalar product for each row and column *)
  6039. END;
  6040. END MatMulR;
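(* A usage sketch for the dispatcher above: multiplying two dense, row-major REAL
   matrices A (M x K) and B (K x N) into C (M x N). For a dense row-major layout the
   element increment is SIZEOF( REAL ) and the row stride is cols * SIZEOF( REAL ).
   The procedure name is illustrative only; a FALSE result means the caller should
   fall back to the generic scalar-product implementation.

PROCEDURE ExampleMatMulR( adrA, adrB, adrC: ADDRESS; M, N, K: SIZE ): BOOLEAN;
BEGIN
	RETURN MatMulR( adrA, adrB, adrC,
		SIZEOF( REAL ), K * SIZEOF( REAL ),   (* IncA, StrideA *)
		SIZEOF( REAL ), N * SIZEOF( REAL ),   (* IncB, StrideB *)
		SIZEOF( REAL ), N * SIZEOF( REAL ),   (* IncC, StrideC *)
		M, K, K, N )                          (* RowsA, ColsA, RowsB, ColsB *)
END ExampleMatMulR;
*)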
  6041. PROCEDURE MatMulX( matrixA, matrixB, matrixC: ADDRESS;
  6042. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6043. VAR M, N, K: SIZE;
  6044. BEGIN
  6045. ASSERT( ColsA = RowsB ); M := RowsA; N := ColsB;
  6046. K := ColsA;
  6047. (*
  6048. KernelLog.String("MatMulX, M,N,K = "); KernelLog.Int(M,10); KernelLog.Int(N,10); KernelLog.Int(K,10); KernelLog.Ln;
  6049. KernelLog.String("Method= "); KernelLog.Int( BestMethod(M,N,K),10); KernelLog.Ln;
  6050. *)
  6051. CASE BestMethod( M, N, K ) OF
  6052. | cMatMulScalarProduct:
  6053. RETURN FALSE;
  6054. | cMatMulNaive:
  6055. RETURN MatMulXNaive( matrixA, matrixB, matrixC, IncA,
  6056. StrideA, IncB, StrideB, IncC,
  6057. StrideC, RowsA, ColsA, RowsB,
  6058. ColsB );
  6059. | cMatMulTransposed:
  6060. RETURN MatMulAXAXTransposed( matrixA, matrixB,
  6061. matrixC, IncA,
  6062. StrideA, IncB, StrideB,
  6063. IncC, StrideC, RowsA,
  6064. ColsA, RowsB, ColsB,
  6065. FALSE );
  6066. | cMatMulStride:
  6067. RETURN MatMulAXAXSSEStride( matrixA, matrixB,
  6068. matrixC, IncA, StrideA,
  6069. IncB, StrideB, IncC,
  6070. StrideC, RowsA, ColsA,
  6071. RowsB, ColsB,
  6072. FALSE );
  6073. | cMatMulBlocked:
  6074. RETURN MatMulAXAXBlocked( matrixA, matrixB,
  6075. matrixC, IncA, StrideA,
  6076. IncB, StrideB, IncC,
  6077. StrideC, RowsA, ColsA,
  6078. RowsB, ColsB, FALSE );
  6079. ELSE
  6080. RETURN FALSE (* use scalar product for each row and column *)
  6081. END;
  6082. END MatMulX;
  6083. PROCEDURE MatMulIncR( matrixA, matrixB, matrixC: ADDRESS;
  6084. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
6085. (*! the heuristics for the choice between the different methods need improvement *)
6086. (*! transpose if superior *)
  6087. (*! provide special variant for small [up to 4x4] matrices *)
  6088. VAR M, N, K: SIZE;
  6089. BEGIN
  6090. ASSERT( ColsA = RowsB ); M := RowsA; N := ColsB;
  6091. K := ColsA;
  6092. CASE BestMethod( M, N, K ) OF
  6093. | cMatMulScalarProduct:
  6094. RETURN FALSE;
  6095. | cMatMulNaive:
  6096. RETURN MatMulIncRNaive( matrixA, matrixB, matrixC,
  6097. IncA, StrideA, IncB, StrideB,
  6098. IncC, StrideC, RowsA, ColsA,
  6099. RowsB, ColsB );
  6100. | cMatMulTransposed:
  6101. RETURN MatMulARARTransposed( matrixA, matrixB,
  6102. matrixC, IncA,
  6103. StrideA, IncB,
  6104. StrideB, IncC,
  6105. StrideC, RowsA,
  6106. ColsA, RowsB,
  6107. ColsB, TRUE );
  6108. | cMatMulStride:
  6109. RETURN MatMulARARSSEStride( matrixA, matrixB,
  6110. matrixC, IncA, StrideA,
  6111. IncB, StrideB, IncC,
  6112. StrideC, RowsA,
  6113. ColsA, RowsB, ColsB,
  6114. TRUE );
  6115. | cMatMulBlocked:
  6116. RETURN MatMulARARBlocked( matrixA, matrixB,
  6117. matrixC, IncA, StrideA,
  6118. IncB, StrideB, IncC,
  6119. StrideC, RowsA, ColsA,
  6120. RowsB, ColsB, TRUE );
  6121. ELSE
  6122. RETURN FALSE (* use scalar product for each row and column *)
  6123. END;
  6124. END MatMulIncR;
  6125. PROCEDURE MatMulIncX( matrixA, matrixB, matrixC: ADDRESS;
  6126. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6127. VAR M, N, K: SIZE;
  6128. BEGIN
  6129. ASSERT( ColsA = RowsB ); M := RowsA; N := ColsB;
  6130. K := ColsA;
  6131. CASE BestMethod( M, N, K ) OF
  6132. | cMatMulScalarProduct:
  6133. RETURN FALSE;
  6134. | cMatMulNaive:
  6135. RETURN MatMulIncXNaive( matrixA, matrixB, matrixC,
  6136. IncA, StrideA, IncB, StrideB,
  6137. IncC, StrideC, RowsA, ColsA,
  6138. RowsB, ColsB );
  6139. | cMatMulTransposed:
  6140. RETURN MatMulAXAXTransposed( matrixA, matrixB,
  6141. matrixC, IncA,
  6142. StrideA, IncB, StrideB,
  6143. IncC, StrideC, RowsA,
  6144. ColsA, RowsB, ColsB,
  6145. TRUE );
  6146. | cMatMulStride:
  6147. RETURN MatMulAXAXSSEStride( matrixA, matrixB,
  6148. matrixC, IncA, StrideA,
  6149. IncB, StrideB, IncC,
  6150. StrideC, RowsA, ColsA,
  6151. RowsB, ColsB, TRUE );
  6152. | cMatMulBlocked:
  6153. RETURN MatMulAXAXBlocked( matrixA, matrixB,
  6154. matrixC, IncA, StrideA,
  6155. IncB, StrideB, IncC,
  6156. StrideC, RowsA, ColsA,
  6157. RowsB, ColsB, TRUE );
  6158. ELSE
  6159. RETURN FALSE (* use scalar product for each row and column *)
  6160. END;
  6161. END MatMulIncX;
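(* The MatMulInc dispatchers above differ from MatMulR/MatMulX only in passing
   add = TRUE to the underlying kernels, i.e. they accumulate C := C + A*B
   instead of overwriting C := A*B. *)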
  6162. PROCEDURE MatMulARARBlocked( matrixA, matrixB, matrixC: ADDRESS;
  6163. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  6164. add: BOOLEAN ): BOOLEAN;
  6165. VAR M, N, K, L2M, L2N, L2K: SIZE;
  6166. BEGIN
  6167. ASSERT( ColsA = RowsB ); M := RowsA; N := ColsB;
  6168. K := ColsA; MagicBlockR( M, N, K, L2M, L2N, L2K );
  6169. (*
  6170. MatMulARARNaiive( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB,
  6171. IncC, StrideC, RowsA, ColsB, ColsA );
  6172. *)
  6173. MultiplyR( matrixA, matrixB, matrixC, RowsA, ColsB, ColsA,
  6174. L2M, L2N, L2K, IncA, StrideA, IncB, StrideB, IncC,
  6175. StrideC, add );
  6176. RETURN TRUE;
  6177. END MatMulARARBlocked;
  6178. PROCEDURE MatMulAXAXBlocked( matrixA, matrixB, matrixC: ADDRESS;
  6179. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  6180. add: BOOLEAN ): BOOLEAN;
  6181. VAR M, N, K, L2M, L2N, L2K: SIZE;
  6182. BEGIN
  6183. ASSERT( ColsA = RowsB ); M := RowsA; N := ColsB;
  6184. K := ColsA; MagicBlockX( M, N, K, L2M, L2N, L2K );
  6185. (*
  6186. MatMulARARNaiive( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB,
  6187. IncC, StrideC, RowsA, ColsB, ColsA );
  6188. *)
  6189. MultiplyX( matrixA, matrixB, matrixC, RowsA, ColsB, ColsA,
  6190. L2M, L2N, L2K, IncA, StrideA, IncB, StrideB, IncC,
  6191. StrideC, add );
  6192. RETURN TRUE;
  6193. END MatMulAXAXBlocked;
  6194. PROCEDURE MatMulRNaive( matrixA, matrixB, matrixC: ADDRESS;
  6195. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6196. BEGIN
  6197. MatMulARARNaiive( matrixA, matrixB, matrixC, IncA, StrideA,
  6198. IncB, StrideB, IncC, StrideC, RowsA,
  6199. ColsB, ColsA, FALSE );
  6200. RETURN TRUE;
  6201. END MatMulRNaive;
  6202. PROCEDURE MatMulXNaive( matrixA, matrixB, matrixC: ADDRESS;
  6203. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6204. BEGIN
  6205. MatMulAXAXNaiive( matrixA, matrixB, matrixC, IncA, StrideA,
  6206. IncB, StrideB, IncC, StrideC, RowsA,
  6207. ColsB, ColsA, FALSE );
  6208. RETURN TRUE;
  6209. END MatMulXNaive;
  6210. PROCEDURE MatMulIncRNaive( matrixA, matrixB, matrixC: ADDRESS;
  6211. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6212. BEGIN
  6213. MatMulARARNaiive( matrixA, matrixB, matrixC, IncA, StrideA,
  6214. IncB, StrideB, IncC, StrideC, RowsA,
  6215. ColsB, ColsA, TRUE );
  6216. RETURN TRUE;
  6217. END MatMulIncRNaive;
  6218. PROCEDURE MatMulIncXNaive( matrixA, matrixB, matrixC: ADDRESS;
  6219. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6220. BEGIN
  6221. MatMulAXAXNaiive( matrixA, matrixB, matrixC, IncA, StrideA,
  6222. IncB, StrideB, IncC, StrideC, RowsA,
  6223. ColsB, ColsA, TRUE );
  6224. RETURN TRUE;
  6225. END MatMulIncXNaive;
  6226. PROCEDURE MatMulXTransposed( matrixA, matrixB, matrixC: ADDRESS;
  6227. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6228. BEGIN
  6229. RETURN MatMulAXAXTransposed( matrixA, matrixB, matrixC,
  6230. IncA, StrideA, IncB,
  6231. StrideB, IncC, StrideC,
  6232. RowsA, ColsA, RowsB,
  6233. ColsB, FALSE );
  6234. END MatMulXTransposed;
  6235. PROCEDURE MatMulIncXTransposed( matrixA, matrixB, matrixC: ADDRESS;
  6236. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6237. BEGIN
  6238. RETURN MatMulAXAXTransposed( matrixA, matrixB, matrixC,
  6239. IncA, StrideA, IncB,
  6240. StrideB, IncC, StrideC,
  6241. RowsA, ColsA, RowsB,
  6242. ColsB, TRUE )
  6243. END MatMulIncXTransposed;
  6244. PROCEDURE MatMulRTransposed( matrixA, matrixB, matrixC: ADDRESS;
  6245. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6246. BEGIN
  6247. RETURN MatMulARARTransposed( matrixA, matrixB, matrixC,
  6248. IncA, StrideA, IncB,
  6249. StrideB, IncC, StrideC,
  6250. RowsA, ColsA, RowsB,
  6251. ColsB, FALSE );
  6252. END MatMulRTransposed;
  6253. PROCEDURE MatMulIncRTransposed( matrixA, matrixB, matrixC: ADDRESS;
  6254. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6255. BEGIN
  6256. RETURN MatMulARARTransposed( matrixA, matrixB, matrixC,
  6257. IncA, StrideA, IncB,
  6258. StrideB, IncC, StrideC,
  6259. RowsA, ColsA, RowsB,
  6260. ColsB, TRUE )
  6261. END MatMulIncRTransposed;
  6262. PROCEDURE MatMulXSSEStride( matrixA, matrixB, matrixC: ADDRESS;
  6263. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6264. BEGIN
  6265. RETURN MatMulAXAXSSEStride( matrixA, matrixB, matrixC,
  6266. IncA, StrideA, IncB, StrideB,
  6267. IncC, StrideC, RowsA,
  6268. ColsA, RowsB, ColsB,
  6269. FALSE );
  6270. END MatMulXSSEStride;
  6271. PROCEDURE MatMulIncXSSEStride( matrixA, matrixB, matrixC: ADDRESS;
  6272. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6273. BEGIN
  6274. RETURN MatMulAXAXSSEStride( matrixA, matrixB, matrixC,
  6275. IncA, StrideA, IncB, StrideB,
  6276. IncC, StrideC, RowsA,
  6277. ColsA, RowsB, ColsB,
  6278. TRUE );
  6279. END MatMulIncXSSEStride;
  6280. PROCEDURE MatMulRSSEStride( matrixA, matrixB, matrixC: ADDRESS;
  6281. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6282. BEGIN
  6283. RETURN MatMulARARSSEStride( matrixA, matrixB, matrixC,
  6284. IncA, StrideA, IncB, StrideB,
  6285. IncC, StrideC, RowsA,
  6286. ColsA, RowsB, ColsB,
  6287. FALSE );
  6288. END MatMulRSSEStride;
  6289. PROCEDURE MatMulIncRSSEStride( matrixA, matrixB, matrixC: ADDRESS;
  6290. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6291. BEGIN
  6292. RETURN MatMulARARSSEStride( matrixA, matrixB, matrixC,
  6293. IncA, StrideA, IncB, StrideB,
  6294. IncC, StrideC, RowsA,
  6295. ColsA, RowsB, ColsB,
  6296. TRUE )
  6297. END MatMulIncRSSEStride;
  6298. PROCEDURE MatMulRBlocked( matrixA, matrixB, matrixC: ADDRESS;
  6299. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6300. BEGIN
  6301. RETURN MatMulARARBlocked( matrixA, matrixB, matrixC,
  6302. IncA, StrideA, IncB, StrideB,
  6303. IncC, StrideC, RowsA, ColsA,
  6304. RowsB, ColsB, FALSE )
  6305. END MatMulRBlocked;
  6306. PROCEDURE MatMulIncRBlocked( matrixA, matrixB, matrixC: ADDRESS;
  6307. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6308. BEGIN
  6309. RETURN MatMulARARBlocked( matrixA, matrixB, matrixC,
  6310. IncA, StrideA, IncB, StrideB,
  6311. IncC, StrideC, RowsA, ColsA,
  6312. RowsB, ColsB, TRUE )
  6313. END MatMulIncRBlocked;
  6314. PROCEDURE MatMulXBlocked( matrixA, matrixB, matrixC: ADDRESS;
  6315. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6316. BEGIN
  6317. RETURN MatMulAXAXBlocked( matrixA, matrixB, matrixC,
  6318. IncA, StrideA, IncB, StrideB,
  6319. IncC, StrideC, RowsA, ColsA,
  6320. RowsB, ColsB, FALSE )
  6321. END MatMulXBlocked;
  6322. PROCEDURE MatMulIncXBlocked( matrixA, matrixB, matrixC: ADDRESS;
  6323. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  6324. BEGIN
  6325. RETURN MatMulAXAXBlocked( matrixA, matrixB, matrixC,
  6326. IncA, StrideA, IncB, StrideB,
  6327. IncC, StrideC, RowsA, ColsA,
  6328. RowsB, ColsB, TRUE )
  6329. END MatMulIncXBlocked;
  6330. PROCEDURE SetMatMulMethod*( i: LONGINT );
  6331. BEGIN
  6332. KernelLog.String("ArrayBaseOptimized, method = ");
  6333. IF i = cMatMulDynamic THEN
  6334. KernelLog.String("dynamic.");
  6335. ArrayBase.matMulIncR := MatMulIncR;
  6336. ArrayBase.matMulIncX := MatMulIncX;
  6337. ArrayBase.matMulR := MatMulR;
  6338. ArrayBase.matMulX := MatMulX;
  6339. ELSIF i = cMatMulScalarProduct THEN
  6340. KernelLog.String("scalarproduct.");
  6341. ArrayBase.matMulIncR := NIL;
  6342. ArrayBase.matMulIncX := NIL;
  6343. ArrayBase.matMulR := NIL; ArrayBase.matMulX := NIL;
  6344. ELSIF i = cMatMulNaive THEN
6345. KernelLog.String("naive.");
  6346. ArrayBase.matMulR := MatMulRNaive;
  6347. ArrayBase.matMulX := MatMulXNaive;
  6348. ArrayBase.matMulIncR := MatMulIncRNaive;
  6349. ArrayBase.matMulIncX := MatMulIncXNaive;
  6350. ELSIF i = cMatMulTransposed THEN
  6351. KernelLog.String("transposed.");
  6352. ArrayBase.matMulR := MatMulRTransposed;
  6353. ArrayBase.matMulX := MatMulXTransposed; (* KernelLog.String( "transposed" ); KernelLog.Ln; *)
  6354. ArrayBase.matMulIncR := MatMulIncRTransposed;
  6355. ArrayBase.matMulIncX := MatMulIncXTransposed; (* KernelLog.String( "transposed" ); KernelLog.Ln; *)
  6356. ELSIF i = cMatMulStride THEN
  6357. KernelLog.String("stride.");
  6358. ArrayBase.matMulR := MatMulRSSEStride;
  6359. ArrayBase.matMulX := MatMulXSSEStride; (* KernelLog.String( "stride" ); KernelLog.Ln; *)
  6360. ArrayBase.matMulIncR := MatMulIncRSSEStride;
  6361. ArrayBase.matMulIncX := MatMulIncXSSEStride; (* KernelLog.String( "stride" ); KernelLog.Ln; *)
  6362. ELSIF i = cMatMulBlocked THEN
  6363. KernelLog.String("blocked.");
  6364. ArrayBase.matMulR := MatMulRBlocked;
  6365. ArrayBase.matMulX := MatMulXBlocked; (* KernelLog.String( "blocked" ); KernelLog.Ln; *)
  6366. ArrayBase.matMulIncR := MatMulIncRBlocked;
  6367. ArrayBase.matMulIncX := MatMulIncXBlocked; (* KernelLog.String( "blocked" ); KernelLog.Ln; *)
  6368. END;
  6369. KernelLog.Ln;
  6370. END SetMatMulMethod;
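(* Programmatic usage sketch: the multiplication method can also be selected directly, e.g.
     SetMatMulMethod( cMatMulBlocked );
   which is what the InstallMatMul command below does after parsing its argument. *)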
  6371. (* optimizations for small arrays (Alexey Morozov) *)
  6372. (* assumes that all arrays do not overlap *)
  6373. (* FIXME: use MOVAPS when Felix fixes problems with alignment!!! *)
  6374. PROCEDURE MatMulR2x2(dadr, ladr, radr: ADDRESS);
  6375. CODE{SYSTEM.i386, SYSTEM.SSE2}
  6376. MOV EBX, [EBP+radr] ; EBX := ADDR(right)
  6377. MOV EAX, [EBP+ladr] ; EAX := ADDR(left)
  6378. MOV ECX, [EBP+dadr] ; ECX := ADDR(dest)
  6379. MOVUPS XMM0, [EAX] ; [a00,a01,a10,a11]
  6380. MOVUPS XMM1, [EBX] ; [b00,b01,b10,b11]
  6381. MOVAPS XMM2, XMM1
  6382. SHUFPS XMM2, XMM1, 204 ; XMM2 := [b00,b11,b00,b11]
  6383. MULPS XMM2, XMM0
  6384. SHUFPS XMM0, XMM0, 177 ; XMM0 := [a01,a00,a11,a10]
  6385. SHUFPS XMM1, XMM1, 102 ; XMM1 := [b10,b01,b10,b01]
  6386. MULPS XMM1, XMM0
  6387. ADDPS XMM1, XMM2
  6388. MOVUPS [ECX], XMM1
  6389. END MatMulR2x2;
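(* For reference, a scalar sketch of what the SSE kernel above computes: dest := left * right
   for two contiguous, row-major 2x2 REAL matrices (the procedure name is illustrative only):

PROCEDURE MatMulR2x2Scalar( dadr, ladr, radr: ADDRESS );
VAR i, j, k: SIZE; a, b, sum: REAL;
BEGIN
	FOR i := 0 TO 1 DO
		FOR j := 0 TO 1 DO
			sum := 0;
			FOR k := 0 TO 1 DO
				SYSTEM.GET( ladr + (2*i + k) * SIZEOF( REAL ), a );
				SYSTEM.GET( radr + (2*k + j) * SIZEOF( REAL ), b );
				sum := sum + a * b;
			END;
			SYSTEM.PUT( dadr + (2*i + j) * SIZEOF( REAL ), sum );
		END;
	END;
END MatMulR2x2Scalar;
*)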
  6390. (* based on weighted sum of rows (Alexey Morozov) *)
  6391. (* FIXME: use MOVAPS when Felix fixes problems with alignment!!! *)
  6392. PROCEDURE MatMulR3x3(dadr, ladr, radr: ADDRESS);
  6393. CODE{SYSTEM.i386, SYSTEM.SSE2}
  6394. MOV EBX, [EBP+radr] ; EBX := ADDR(right)
  6395. MOV EAX, [EBP+ladr] ; EAX := ADDR(left)
  6396. MOV ECX, [EBP+dadr] ; ECX := ADDR(dest)
  6397. MOVUPS XMM0, [EBX] ; XMM0 := [b00,b01,b02,-]
  6398. MOVUPS XMM1, [EBX+12] ; XMM1 := [b10,b11,b12,-]
  6399. ; last element is out of range, is it still OK?
  6400. MOVUPS XMM2, [EBX+24] ; XMM2 := [b20,b21,b22,-]
  6401. ;MOVLPS XMM2, [EBX+24]
  6402. ;MOVSS XMM3, [EBX+32]
  6403. ;MOVLHPS XMM2, XMM3
  6404. MOVSS XMM3, [EAX]
  6405. SHUFPS XMM3, XMM3, 0; XMM3 := [a00,a00,a00,-]
  6406. MOVAPS XMM4, XMM0
  6407. MULPS XMM4, XMM3
  6408. MOVSS XMM3, [EAX+4]
  6409. SHUFPS XMM3, XMM3, 0; XMM3 := [a01,a01,a01,-]
  6410. MULPS XMM3, XMM1
  6411. ADDPS XMM4, XMM3
  6412. MOVSS XMM3, [EAX+8]
  6413. SHUFPS XMM3, XMM3, 0; XMM3 := [a02,a02,a02,-]
  6414. MULPS XMM3, XMM2
  6415. ADDPS XMM4, XMM3
  6416. MOVUPS [ECX], XMM4
  6417. ;***************************************************;
  6418. MOVSS XMM3, [EAX+12]
  6419. SHUFPS XMM3, XMM3, 0; XMM3 := [a10,a10,a10,-]
  6420. MOVAPS XMM4, XMM0
  6421. MULPS XMM4, XMM3
  6422. MOVSS XMM3, [EAX+16]
  6423. SHUFPS XMM3, XMM3, 0; XMM3 := [a11,a11,a11,-]
  6424. MULPS XMM3, XMM1
  6425. ADDPS XMM4, XMM3
  6426. MOVSS XMM3, [EAX+20]
  6427. SHUFPS XMM3, XMM3, 0; XMM3 := [a12,a12,a12,-]
  6428. MULPS XMM3, XMM2
  6429. ADDPS XMM4, XMM3
  6430. MOVUPS [ECX+12], XMM4
  6431. ;***************************************************;
  6432. MOVSS XMM3, [EAX+24]
  6433. SHUFPS XMM3, XMM3, 0; XMM3 := [a20,a20,a20,-]
  6434. MOVAPS XMM4, XMM0
  6435. MULPS XMM4, XMM3
  6436. MOVSS XMM3, [EAX+28]
  6437. SHUFPS XMM3, XMM3, 0; XMM3 := [a21,a21,a21,-]
  6438. MULPS XMM3, XMM1
  6439. ADDPS XMM4, XMM3
  6440. MOVSS XMM3, [EAX+32]
  6441. SHUFPS XMM3, XMM3, 0; XMM3 := [a22,a22,a22,-]
  6442. MULPS XMM3, XMM2
  6443. ADDPS XMM4, XMM3
  6444. ;MOVUPS [ECX+24], XMM4
  6445. MOVLPS [ECX+24], XMM4
  6446. MOVHLPS XMM4, XMM4
  6447. MOVSS [ECX+32], XMM4
  6448. END MatMulR3x3;
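(* The kernel above evaluates each result row as a weighted sum of the rows of B:
   row i of dest = a[i,0] * row 0 of B + a[i,1] * row 1 of B + a[i,2] * row 2 of B,
   which is why only B's rows are kept in registers while the a[i,j] are broadcast. *)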
6449. (* based on a 2x2 block decomposition (Alexey Morozov) *)
  6450. (* FIXME: use MOVAPS when Felix fixes issues with alignment!!! *)
  6451. PROCEDURE MatMulR4x4(dadr, ladr, radr: ADDRESS);
  6452. CODE{SYSTEM.i386, SYSTEM.SSE2}
  6453. MOV EBX, [EBP+radr] ; EBX := ADDR(right)
  6454. MOV EAX, [EBP+ladr] ; EAX := ADDR(left)
  6455. MOV ECX, [EBP+dadr] ; ECX := ADDR(dest)
  6456. ; load A00
  6457. MOVLPS XMM0, [EAX] ; XMM0 := [a00,a01,-,-]
  6458. MOVHPS XMM0, [EAX+16] ; XMM0 := [a00,a01,a10,a11]
  6459. ; load A01
  6460. MOVLPS XMM1, [EAX+8] ; XMM1 := [a02,a03,-,-]
  6461. MOVHPS XMM1, [EAX+24] ; XMM1 := [a02,a03,a12,a13]
  6462. ; load B00
  6463. MOVLPS XMM2, [EBX] ; XMM2 := [b00,b01,-,-]
  6464. MOVHPS XMM2, [EBX+16] ; XMM2 := [b00,b01,b10,b11]
  6465. ; load B01
6466. MOVLPS XMM3, [EBX+8] ; XMM3 := [b02,b03,-,-]
6467. MOVHPS XMM3, [EBX+24] ; XMM3 := [b02,b03,b12,b13]
  6468. ; load B10
  6469. MOVLPS XMM4, [EBX+32] ; XMM4 := [b20,b21,-,-]
  6470. MOVHPS XMM4, [EBX+48] ; XMM4 := [b20,b21,b30,b31]
  6471. ; load B11
  6472. MOVLPS XMM5, [EBX+40] ; XMM5 := [b22,b23,-,-]
  6473. MOVHPS XMM5, [EBX+56] ; XMM5 := [b22,b23,b32,b33]
  6474. ;****************************************************;
  6475. ; multiply A00(D)*B00(E) (use MatMulR2x2 code)
  6476. MOVAPS XMM6, XMM2
  6477. SHUFPS XMM6, XMM6, 204 ; XMM6 := [e00,e11,e00,e11]
  6478. MULPS XMM6, XMM0
  6479. SHUFPS XMM0, XMM0, 177 ; XMM0 := [d01,d00,d11,d10]
  6480. MOVAPS XMM7, XMM2
  6481. SHUFPS XMM7, XMM7, 102 ; XMM7 := [e10,e01,e10,e01]
  6482. MULPS XMM7, XMM0
  6483. ADDPS XMM7, XMM6
  6484. ; multiply A01(D)*B10(E)
  6485. MOVAPS XMM0, XMM4
  6486. SHUFPS XMM0, XMM0, 204 ; XMM0 := [e00,e11,e00,e11]
  6487. MULPS XMM0, XMM1
  6488. SHUFPS XMM1, XMM1, 177 ; XMM1 := [d01,d00,d11,d10]
  6489. MOVAPS XMM6, XMM4
  6490. SHUFPS XMM6, XMM6, 102 ; XMM6 := [e10,e01,e10,e01]
  6491. MULPS XMM6, XMM1
  6492. ADDPS XMM6, XMM0
  6493. ADDPS XMM7, XMM6
  6494. MOVLPS [ECX], XMM7
  6495. MOVHPS [ECX+16], XMM7
  6496. ;****************************************************;
  6497. ; load A00
  6498. MOVLPS XMM0, [EAX] ; XMM0 := [a00,a01,-,-]
  6499. MOVHPS XMM0, [EAX+16] ; XMM0 := [a00,a01,a10,a11]
  6500. ; load A01
  6501. MOVLPS XMM1, [EAX+8] ; XMM1 := [a02,a03,-,-]
  6502. MOVHPS XMM1, [EAX+24] ; XMM1 := [a02,a03,a12,a13]
  6503. ; multiply A00(D)*B01(E) (use MatMulR2x2 code)
  6504. MOVAPS XMM6, XMM3
  6505. SHUFPS XMM6, XMM6, 204 ; XMM6 := [e00,e11,e00,e11]
  6506. MULPS XMM6, XMM0
  6507. SHUFPS XMM0, XMM0, 177 ; XMM0 := [d01,d00,d11,d10]
  6508. MOVAPS XMM7, XMM3
  6509. SHUFPS XMM7, XMM7, 102 ; XMM7 := [e10,e01,e10,e01]
  6510. MULPS XMM7, XMM0
  6511. ADDPS XMM7, XMM6
  6512. ; multiply A01(D)*B11(E)
  6513. MOVAPS XMM0, XMM5
  6514. SHUFPS XMM0, XMM0, 204 ; XMM0 := [e00,e11,e00,e11]
  6515. MULPS XMM0, XMM1
  6516. SHUFPS XMM1, XMM1, 177 ; XMM1 := [d01,d00,d11,d10]
  6517. MOVAPS XMM6, XMM5
  6518. SHUFPS XMM6, XMM6, 102 ; XMM6 := [e10,e01,e10,e01]
  6519. MULPS XMM6, XMM1
  6520. ADDPS XMM6, XMM0
  6521. ADDPS XMM7, XMM6
  6522. MOVLPS [ECX+8], XMM7
  6523. MOVHPS [ECX+24], XMM7
  6524. ;****************************************************;
  6525. ; load A10
  6526. MOVLPS XMM0, [EAX+32] ; XMM0 := [a20,a21,-,-]
  6527. MOVHPS XMM0, [EAX+48] ; XMM0 := [a20,a21,a30,a31]
  6528. ; load A11
  6529. MOVLPS XMM1, [EAX+40] ; XMM1 := [a22,a23,-,-]
  6530. MOVHPS XMM1, [EAX+56] ; XMM1 := [a22,a23,a32,a33]
  6531. ; multiply A10(D)*B00(E) (use MatMulR2x2 code)
  6532. MOVAPS XMM6, XMM2
  6533. SHUFPS XMM6, XMM6, 204 ; XMM6 := [e00,e11,e00,e11]
  6534. MULPS XMM6, XMM0
  6535. SHUFPS XMM0, XMM0, 177 ; XMM0 := [d01,d00,d11,d10]
  6536. MOVAPS XMM7, XMM2
  6537. SHUFPS XMM7, XMM7, 102 ; XMM7 := [e10,e01,e10,e01]
  6538. MULPS XMM7, XMM0
  6539. ADDPS XMM7, XMM6
  6540. ; multiply A11(D)*B10(E)
  6541. MOVAPS XMM0, XMM4
  6542. SHUFPS XMM0, XMM0, 204 ; XMM0 := [e00,e11,e00,e11]
  6543. MULPS XMM0, XMM1
  6544. SHUFPS XMM1, XMM1, 177 ; XMM1 := [d01,d00,d11,d10]
  6545. MOVAPS XMM6, XMM4
  6546. SHUFPS XMM6, XMM6, 102 ; XMM6 := [e10,e01,e10,e01]
  6547. MULPS XMM6, XMM1
  6548. ADDPS XMM6, XMM0
  6549. ADDPS XMM7, XMM6
  6550. MOVLPS [ECX+32], XMM7
  6551. MOVHPS [ECX+48], XMM7
  6552. ;****************************************************;
  6553. ; load A10
  6554. MOVLPS XMM0, [EAX+32] ; XMM0 := [a20,a21,-,-]
  6555. MOVHPS XMM0, [EAX+48] ; XMM0 := [a20,a21,a30,a31]
  6556. ; load A11
  6557. MOVLPS XMM1, [EAX+40] ; XMM1 := [a22,a23,-,-]
  6558. MOVHPS XMM1, [EAX+56] ; XMM1 := [a22,a23,a32,a33]
  6559. ; multiply A10(D)*B01(E) (use MatMulR2x2 code)
  6560. MOVAPS XMM6, XMM3
  6561. SHUFPS XMM6, XMM6, 204 ; XMM6 := [e00,e11,e00,e11]
  6562. MULPS XMM6, XMM0
  6563. SHUFPS XMM0, XMM0, 177 ; XMM0 := [d01,d00,d11,d10]
  6564. MOVAPS XMM7, XMM3
  6565. SHUFPS XMM7, XMM7, 102 ; XMM7 := [e10,e01,e10,e01]
  6566. MULPS XMM7, XMM0
  6567. ADDPS XMM7, XMM6
  6568. ; multiply A11(D)*B11(E)
  6569. MOVAPS XMM0, XMM5
  6570. SHUFPS XMM0, XMM0, 204 ; XMM0 := [e00,e11,e00,e11]
  6571. MULPS XMM0, XMM1
  6572. SHUFPS XMM1, XMM1, 177 ; XMM1 := [d01,d00,d11,d10]
  6573. MOVAPS XMM6, XMM5
  6574. SHUFPS XMM6, XMM6, 102 ; XMM6 := [e10,e01,e10,e01]
  6575. MULPS XMM6, XMM1
  6576. ADDPS XMM6, XMM0
  6577. ADDPS XMM7, XMM6
  6578. MOVLPS [ECX+40], XMM7
  6579. MOVHPS [ECX+56], XMM7
  6580. END MatMulR4x4;
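(* The 4x4 kernel above works on 2x2 sub-blocks: with A, B and C partitioned into
   2x2 blocks Aij, Bij, Cij it computes
     C00 = A00*B00 + A01*B10,  C01 = A00*B01 + A01*B11,
     C10 = A10*B00 + A11*B10,  C11 = A10*B01 + A11*B11,
   each 2x2 block product using the same shuffle/multiply scheme as MatMulR2x2. *)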
  6581. (* FIXME: use MOVAPS when Felix fixes issues with alignment!!! *)
  6582. (* FIXME: speed it up when horizontal add is available!!! *)
  6583. PROCEDURE MatVecMulR2x2(dadr, ladr, radr: ADDRESS);
  6584. CODE{SYSTEM.i386, SYSTEM.SSE2}
  6585. MOV EBX, [EBP+radr] ; EBX := ADDR(right)
  6586. MOV EAX, [EBP+ladr] ; EAX := ADDR(left)
  6587. MOV ECX, [EBP+dadr] ; ECX := ADDR(dest)
  6588. ; load the whole matrix
  6589. MOVUPS XMM0, [EAX] ; XMM0 := [a00,a01,a10,a11]
6590. MOVLPS XMM1, [EBX] ; XMM1 := [b00,b10,-,-]
  6591. MOVLHPS XMM1, XMM1 ; XMM1 := [b00,b10,b00,b10]
  6592. MULPS XMM0, XMM1 ; XMM0 := [a00*b00,a01*b10,a10*b00,a11*b10]
  6593. MOVAPS XMM1, XMM0
  6594. SHUFPS XMM0, XMM0, 8; XMM0 := [a00*b00,a10*b00,-,-]
  6595. SHUFPS XMM1, XMM1, 13; XMM1 := [a01*b10,a11*b10,-,-]
  6596. ADDPS XMM0, XMM1
  6597. MOVLPS [ECX], XMM0
  6598. END MatVecMulR2x2;
  6599. (* PH *)
  6600. (* to do: use MOVAPS when Felix fixes issues with alignment *)
  6601. PROCEDURE MatVecMulR4x4(dadr, ladr, radr: ADDRESS);
  6602. CODE{SYSTEM.i386, SYSTEM.SSE3}
  6603. MOV EBX, [EBP+radr] ; EBX := ADDR(right)
  6604. MOV EAX, [EBP+ladr] ; EAX := ADDR(left)
  6605. MOV ECX, [EBP+dadr] ; ECX := ADDR(dest)
  6606. MOVUPS XMM0, [EBX] ; XMM0 := [b0,b1,b2,b3]
  6607. MOVUPS XMM1, [EAX] ; XMM1 := [a00,a01,a02,a03]
  6608. MOVUPS XMM2, [EAX+16] ; XMM2 := [a10,a11,a12,a13]
  6609. MOVUPS XMM3, [EAX+32] ; XMM3 := [a20,a21,a22,a23]
  6610. MOVUPS XMM4, [EAX+48] ; XMM4 := [a30,a31,a32,a33]
  6611. MULPS XMM1, XMM0
  6612. MULPS XMM2, XMM0
  6613. HADDPS XMM1, XMM2 ; adjacent pairs are horizontally added
  6614. MULPS XMM3, XMM0
  6615. MULPS XMM4, XMM0
  6616. HADDPS XMM3, XMM4 ; adjacent pairs are horizontally added
  6617. HADDPS XMM1, XMM3 ; adjacent pairs are horizontally added
  6618. MOVUPS [ECX], XMM1
  6619. END MatVecMulR4x4;
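(* For reference, the SSE3 kernel above computes
     dest[i] := a[i,0]*b[0] + a[i,1]*b[1] + a[i,2]*b[2] + a[i,3]*b[3]
   for a contiguous, row-major 4x4 REAL matrix and a 4-element REAL vector, using HADDPS
   for the horizontal sums. A scalar sketch (the procedure name is illustrative only):

PROCEDURE MatVecMulR4x4Scalar( dadr, ladr, radr: ADDRESS );
VAR i, j: SIZE; a, b, sum: REAL;
BEGIN
	FOR i := 0 TO 3 DO
		sum := 0;
		FOR j := 0 TO 3 DO
			SYSTEM.GET( ladr + (4*i + j) * SIZEOF( REAL ), a );
			SYSTEM.GET( radr + j * SIZEOF( REAL ), b );
			sum := sum + a * b;
		END;
		SYSTEM.PUT( dadr + i * SIZEOF( REAL ), sum );
	END;
END MatVecMulR4x4Scalar;
*)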
  6620. PROCEDURE InstallMatMul*(context: Commands.Context);
  6621. VAR type: LONGINT; string: ARRAY 32 OF CHAR;
  6622. BEGIN
  6623. context.arg.String(string);
  6624. IF string = "dynamic" THEN
  6625. type := cMatMulDynamic;
  6626. ELSIF string = "scalarproduct" THEN
  6627. type := cMatMulScalarProduct
  6628. ELSIF string = "naive" THEN
  6629. type := cMatMulNaive
  6630. ELSIF string = "transposed" THEN
  6631. type := cMatMulTransposed
  6632. ELSIF string = "stride" THEN
  6633. type := cMatMulStride
  6634. ELSIF string ="blocked" THEN
  6635. type := cMatMulBlocked
  6636. ELSE
  6637. KernelLog.String("unknown method: "); KernelLog.String(string); KernelLog.Ln;
  6638. type := cMatMulDynamic;
  6639. END;
  6640. SetMatMulMethod( type );
  6641. END InstallMatMul;
  6642. PROCEDURE InstallAsm*;
  6643. BEGIN
  6644. KernelLog.String( "ASM " );
  6645. ArrayBase.loopSPAXAX := SPAXAXLoopA;
  6646. ArrayBase.loopSPARAR := SPARARLoopA;
  6647. ArrayBase.loopAddAXAX := AddAXAXLoopA;
  6648. ArrayBase.loopAddARAR := AddARARLoopA;
  6649. ArrayBase.loopSubAXAX := SubAXAXLoopA;
  6650. ArrayBase.loopSubARAR := SubARARLoopA;
  6651. ArrayBase.loopEMulAXAX := EMulAXAXLoopA;
  6652. ArrayBase.loopEMulARAR := EMulARARLoopA;
  6653. ArrayBase.loopMatMulAXAX := MatMulAXAXLoopA;
  6654. ArrayBase.loopMatMulARAR := MatMulARARLoopA;
  6655. ArrayBase.loopMulAXSX := MulAXSXLoopA;
  6656. ArrayBase.loopIncMulAXSX := IncMulAXSXLoopA;
  6657. ArrayBase.loopMulARSR := MulARSRLoopA;
  6658. ArrayBase.loopIncMulARSR := IncMulARSRLoopA;
  6659. ArrayBase.loopMatMulIncAXAX := MatMulIncAXAXLoopA;
  6660. ArrayBase.loopMatMulIncARAR := MatMulIncARARLoopA;
  6661. ArrayBase.transpose4 := Transpose4;
  6662. ArrayBase.transpose8 := Transpose8;
  6663. END InstallAsm;
  6664. PROCEDURE InstallSSE*;
  6665. BEGIN
  6666. IF Machine.SSESupport THEN
  6667. KernelLog.String( "SSE " );
  6668. ArrayBase.loopSPARAR := SPARARLoopSSE;
  6669. ArrayBase.loopAddARAR := AddARARLoopSSE;
  6670. ArrayBase.loopSubARAR := SubARARLoopSSE;
  6671. ArrayBase.loopEMulARAR := EMulARARLoopSSE;
  6672. ArrayBase.loopMulARSR := MulARSRLoopSSE;
  6673. ArrayBase.loopIncMulARSR := IncMulARSRLoopSSE;
  6674. ArrayBase.loopMatMulARAR := MatMulARARLoopSSE;
  6675. ArrayBase.matMulR := MatMulR;
  6676. ArrayBase.loopMatMulIncARAR := MatMulIncARARLoopSSE;
  6677. ArrayBase.matMulIncR := MatMulIncR;
  6678. (* optimizations for small matrices (Alexey Morozov) *)
  6679. ArrayBase.matMulR2x2 := MatMulR2x2;
  6680. ArrayBase.matMulR3x3 := MatMulR3x3;
  6681. ArrayBase.matMulR4x4 := MatMulR4x4;
  6682. ArrayBase.matVecMulR2x2 := MatVecMulR2x2;
  6683. END;
  6684. END InstallSSE;
  6685. PROCEDURE InstallSSE2*; (* extra for testing, will be merged with Install in later versions *)
  6686. BEGIN
  6687. IF Machine.SSE2Support THEN
  6688. KernelLog.String( "SSE2 " );
  6689. ArrayBase.loopSPAXAX := SPAXAXLoopSSE;
  6690. ArrayBase.loopAddAXAX := AddAXAXLoopSSE;
  6691. ArrayBase.loopSubAXAX := SubAXAXLoopSSE;
  6692. ArrayBase.loopEMulAXAX := EMulAXAXLoopSSE;
  6693. ArrayBase.loopMulAXSX := MulAXSXLoopSSE;
  6694. ArrayBase.loopIncMulAXSX := IncMulAXSXLoopSSE;
  6695. ArrayBase.loopMatMulAXAX := MatMulAXAXLoopSSE;
  6696. ArrayBase.matMulX := MatMulX;
  6697. ArrayBase.loopMatMulIncAXAX :=
  6698. MatMulIncAXAXLoopSSE;
  6699. ArrayBase.matMulIncX := MatMulIncX;
  6700. END;
  6701. END InstallSSE2;
6702. (*! to do: at present this only works for Win, not for native, because SSE3Support is not yet implemented in BIOS.I386.Machine.Mod *)
  6703. PROCEDURE InstallSSE3*; (* extra for testing, will be merged with Install in later versions *)
  6704. BEGIN
  6705. IF Machine.SSE3Support THEN
  6706. KernelLog.String( "SSE3 " );
  6707. (* optimizations for small matrices *)
  6708. ArrayBase.matVecMulR4x4 := MatVecMulR4x4;
  6709. END;
  6710. END InstallSSE3;
  6711. PROCEDURE Install*;
  6712. BEGIN
  6713. KernelLog.String( "ArrayBaseOptimized: installing runtime library optimizations:" );
  6714. InstallAsm; InstallSSE; InstallSSE2; InstallSSE3;
  6715. KernelLog.String( " done." ); KernelLog.Ln;
  6716. END Install;
  6717. PROCEDURE SetParameters*( context: Commands.Context );
  6718. BEGIN
  6719. context.arg.SkipWhitespace; context.arg.Int(cBlockSize,TRUE);
  6720. context.arg.SkipWhitespace; context.arg.Int(nrProcesses,TRUE);
  6721. IF nrProcesses > maxProcesses THEN
  6722. nrProcesses := maxProcesses
  6723. ELSIF nrProcesses = 0 THEN nrProcesses := LONGINT (Machine.NumberOfProcessors());
  6724. END;
  6725. KernelLog.String( "BlockSize=" ); KernelLog.Int( cBlockSize, 0 );
  6726. KernelLog.String( ", NrProcesses = " ); KernelLog.Int( nrProcesses, 0 ); KernelLog.Ln;
  6727. END SetParameters;
  6728. BEGIN
  6729. cBlockSize := 0; (* automatic *)
  6730. nrProcesses := LONGINT (Machine.NumberOfProcessors()); (* automatic *)
  6731. allocT := 0; copyT := 0; compT := 0;
  6732. NEW( cachePool );
  6733. END FoxArrayBaseOptimized.
  6734. System.Free ArrayBaseOptimized ~
  6735. ArrayBaseOptimized.Install ~
  6736. ArrayBaseOptimized.InstallSSE2 ~
  6737. ArrayBaseOptimized.InstallSSE ~
  6738. ArrayBaseOptimized.InstallAsm ~
  6739. ArrayBaseOptimized.InstallMatMul dynamic ~
  6740. ArrayBaseOptimized.InstallMatMul scalarproduct ~
  6741. ArrayBaseOptimized.InstallMatMul transposed ~
  6742. ArrayBaseOptimized.InstallMatMul naive ~
  6743. ArrayBaseOptimized.InstallMatMul stride ~
  6744. ArrayBaseOptimized.InstallMatMul blocked ~
  6745. ArrayBaseOptimized.SetParameters 0 1 ~ (* BlockSize, NrProcesses *)