I386.FoxArrayBaseOptimized.Mod
  1. MODULE FoxArrayBaseOptimized; (** AUTHOR "fof"; PURPOSE ""; **)
  2. IMPORT SYSTEM, ArrayBase := FoxArrayBase, Machine, KernelLog, Commands ;
  3. CONST
  4. L2CacheSize = 512 * 1024; (* L1CacheSize = 16 * 1024; *)
  5. (* parameters for blocking matrix multiplication *)
  6. L1BlockN = 5; (* L1 block size -> nr of columns in a block that can be processed using L1 cache *)
  7. L2BARatio = 1;
  8. L0BlockKR = 4; (* L0 block size -> nr of elements that can be processed at once for type REAL *)
  9. L1MaxBlockKR = 336; (* L1CacheSize/SIZEOF(REAL)/2/6*)
  10. L2BlockSize = 81920;
  11. L0BlockKX = 2; (* L0 block size -> nr of elements that can be processed at once for type LONGREAL *)
  12. L1MaxBlockKX = 256; (* > L1CacheSize/SIZEOF(LONGREAL)/2/6*)
  13. (*
  14. DefaultL2CacheSize = 81920;
  15. L2SizeR = L2CacheSize DIV 8; MaxBlockKR = 336; (* ca L1CacheSize/SIZEOF(REAL)/2/6*) (* nr of elements that can be processed using L2 cache *)
  16. L2SizeX = L2CacheSize DIV 8; MaxBlockKX = 256; (* a bit more than L1CacheSize/SIZEOF(LONGREAL)/2/6 *) (* nr of elements that can be processed using L2 cache *)
  17. *)
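(* Derivation sketch for the block constants above (an assumption based on the commented-out
   L1CacheSize = 16 * 1024 and the divisor comments, not a statement from the original author):
   a REAL takes 4 bytes, half of L1 is kept free for the second operand panel, and the divisor 6
   presumably stands for the L1BlockN = 5 columns of B plus one strip of A, so
       16384 DIV 4 DIV 2 DIV 6 = 341;
   rounding down to a multiple of the 16-element inner unrolling gives L1MaxBlockKR = 336.
   The same computation for LONGREAL (8 bytes) gives 170, and L1MaxBlockKX = 256 is chosen
   somewhat larger, as the "more than" comments note. *)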
  18. debug = FALSE; parallel = TRUE; SSE = TRUE;
  19. MaxCachePoolSize = 0 (* 0 = disabled; set to e.g. 646*1024*1024 to enable the cache pool *) ;
  20. maxProcesses = 32;
  21. cMatMulDynamic* = -1; cMatMulScalarProduct* = 0;
  22. cMatMulNaive* = 1; cMatMulTransposed* = 2;
  23. cMatMulStride* = 3; cMatMulBlocked* = 4;
  24. VAR
  25. alignedC*, unalignedC*, singleC*: LONGINT; (* counters for debugging and statistics *)
  26. rejectMatMul*: LONGINT;
  27. matAllocTime*, matCompTime*: LONGINT;
  28. cBlockSize*: LONGINT; nrProcesses*: LONGINT;
  29. lastUsedBlockSize*: SIZE;
  30. allocT-, copyT-, zeroT-, compT-: HUGEINT;
  31. TYPE
  32. Cache = POINTER TO RECORD
  33. p: ANY;
  34. adr: ADDRESS; size: SIZE;
  35. prev, next: Cache;
  36. END;
  37. CachePool = OBJECT
  38. (*! provide heuristics for overall size *)
  39. VAR first, last: Cache;
  40. PROCEDURE & Init*;
  41. BEGIN
  42. NEW( first ); first.size := 0; (* sentinel *)
  43. NEW( last ); last.size := MAX( LONGINT ); (* sentinel *)
  44. first.next := last; first.prev := NIL; last.prev := first; last.next := NIL;
  45. END Init;
  46. PROCEDURE Acquire( size: SIZE ): Cache;
  47. VAR c: Cache; t: HUGEINT;
  48. BEGIN {EXCLUSIVE}
  49. IF size = 0 THEN RETURN first END;
  50. Tic( t );
  51. c := last;
  52. WHILE (c.prev.size >= size) DO
  53. c := c.prev;
  54. END;
  55. IF c = last THEN
  56. NEW( c ); SYSTEM.NEW( c.p, size + 12 ); (* slack bytes so that c.adr can be aligned to 16 below *)
  57. c.adr := Align( SYSTEM.VAL( LONGINT, c.p ), 16 );
  58. c.size := size;
  59. ELSE
  60. c.prev.next := c.next;
  61. c.next.prev := c.prev;
  62. c.prev := NIL; c.next := NIL;
  63. END;
  64. Toc( t, allocT ); RETURN c;
  65. END Acquire;
  66. PROCEDURE Release( c: Cache );
  67. VAR t: Cache;
  68. BEGIN {EXCLUSIVE}
  69. IF (c=first) OR (c=NIL) THEN RETURN END;
  70. ASSERT(c.size > 0);
  71. IF c.size > MaxCachePoolSize THEN RETURN END;
  72. t := first;
  73. WHILE (t.size <= c.size) DO t := t.next; END;
  74. c.prev := t.prev; c.next := t; t.prev := c; c.prev.next := c;
  75. END Release;
  76. END CachePool;
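(* Usage sketch for CachePool (hypothetical caller code, not part of this excerpt): Acquire
   returns a Cache whose adr field is a 16-byte aligned scratch address of at least the requested
   size, and Release puts it back into the size-ordered list for reuse:
       VAR c: Cache;
       c := cachePool.Acquire( k * SIZEOF( LONGREAL ) );  (* k is a hypothetical element count *)
       (* ... use the memory starting at c.adr ... *)
       cachePool.Release( c );
   Acquire( 0 ) returns the "first" sentinel, which Release ignores, so zero-sized requests need
   no special handling; with MaxCachePoolSize = 0 every released buffer is simply dropped and
   reclaimed by the garbage collector. *)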
  77. ComputationObj = OBJECT
  78. VAR done: BOOLEAN;
  79. PROCEDURE & Init*;
  80. BEGIN
  81. done := FALSE;
  82. END Init;
  83. PROCEDURE Compute; (*abstract*)
  84. END Compute;
  85. PROCEDURE Wait;
  86. BEGIN {EXCLUSIVE}
  87. AWAIT( done );
  88. END Wait;
  89. BEGIN {ACTIVE, EXCLUSIVE}
  90. Compute; done := TRUE;
  91. END ComputationObj;
  92. MatMulHObjR = OBJECT (ComputationObj)
  93. VAR MatrixA, MatrixB, MatrixC: ADDRESS; Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
  94. add: BOOLEAN;
  95. PROCEDURE & InitR*( MatrixA, MatrixB, MatrixC: ADDRESS; Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
  96. add: BOOLEAN );
  97. BEGIN
  98. Init; SELF.MatrixA := MatrixA; SELF.MatrixB := MatrixB;
  99. SELF.MatrixC := MatrixC; SELF.Stride := Stride;
  100. SELF.IncC := IncC; SELF.StrideC := StrideC;
  101. SELF.RowsA := RowsA; SELF.RowsB := RowsB;
  102. SELF.Cols := Cols; SELF.add := add;
  103. END InitR;
  104. PROCEDURE Compute;
  105. BEGIN
  106. MatMulHBlockR( MatrixA, MatrixB, MatrixC, Stride, IncC,
  107. StrideC, RowsA, RowsB, Cols, add );
  108. END Compute;
  109. END MatMulHObjR;
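(* Pattern sketch (assumed caller code, not shown in this excerpt): ComputationObj subclasses such
   as MatMulHObjR are active objects; the ACTIVE body runs Compute in its own thread and then sets
   done, so a caller can fork several multiplications and synchronise later:
       VAR obj: MatMulHObjR;
       NEW( obj, matrixA, matrixB, matrixC, stride, incC, strideC, rowsA, rowsB, cols, FALSE );
       (* ... other work runs concurrently ... *)
       obj.Wait;  (* blocks via AWAIT( done ) until MatMulHBlockR has finished *)
*)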
  110. MatMulHObjX = OBJECT (ComputationObj)
  111. VAR MatrixA, MatrixB, MatrixC: ADDRESS; Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
  112. add: BOOLEAN;
  113. PROCEDURE & InitX*( MatrixA, MatrixB, MatrixC: ADDRESS; Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
  114. add: BOOLEAN );
  115. BEGIN
  116. Init; SELF.MatrixA := MatrixA; SELF.MatrixB := MatrixB;
  117. SELF.MatrixC := MatrixC; SELF.Stride := Stride;
  118. SELF.IncC := IncC; SELF.StrideC := StrideC;
  119. SELF.RowsA := RowsA; SELF.RowsB := RowsB;
  120. SELF.Cols := Cols; SELF.add := add;
  121. END InitX;
  122. PROCEDURE Compute;
  123. BEGIN
  124. MatMulHBlockX( MatrixA, MatrixB, MatrixC, Stride, IncC,
  125. StrideC, RowsA, RowsB, Cols, add );
  126. END Compute;
  127. END MatMulHObjX;
  128. MultiplyObjectR = OBJECT (ComputationObj);
  129. VAR adrA, adrB: ADDRESS; C, M, N, K, IncC, StrideC, L2BlockM, L2BlockN, L2BlockK:SIZE;
  130. start, finished: BOOLEAN;
  131. PROCEDURE & InitR*( adrA, adrB: ADDRESS; C, M, N, K, IncC, StrideC, L2BlockM, L2BlockN, L2BlockK: SIZE );
  132. BEGIN
  133. Init; start := FALSE; finished := FALSE;
  134. SELF.adrA := adrA; SELF.adrB := adrB; SELF.C := C;
  135. SELF.M := M; SELF.N := N; SELF.K := K;
  136. SELF.IncC := IncC; SELF.StrideC := StrideC;
  137. SELF.L2BlockM := L2BlockM;
  138. SELF.L2BlockN := L2BlockN;
  139. SELF.L2BlockK := L2BlockK;
  140. END InitR;
  141. PROCEDURE Compute;
  142. BEGIN
  143. L3BlockR( adrA, adrB, C, M, N, K, IncC, StrideC, L2BlockM,
  144. L2BlockN, L2BlockK );
  145. END Compute;
  146. END MultiplyObjectR;
  147. MultiplyObjectX = OBJECT (ComputationObj);
  148. VAR adrA, adrB:ADDRESS; C, M, N, K, IncC, StrideC, L2BlockM, L2BlockN, L2BlockK: SIZE;
  149. start, finished: BOOLEAN;
  150. PROCEDURE & InitX*( adrA, adrB: ADDRESS; C, M, N, K, IncC, StrideC, L2BlockM, L2BlockN, L2BlockK: SIZE );
  151. BEGIN
  152. Init; start := FALSE; finished := FALSE;
  153. SELF.adrA := adrA; SELF.adrB := adrB; SELF.C := C;
  154. SELF.M := M; SELF.N := N; SELF.K := K;
  155. SELF.IncC := IncC; SELF.StrideC := StrideC;
  156. SELF.L2BlockM := L2BlockM;
  157. SELF.L2BlockN := L2BlockN;
  158. SELF.L2BlockK := L2BlockK;
  159. END InitX;
  160. PROCEDURE Compute;
  161. BEGIN
  162. L3BlockX( adrA, adrB, C, M, N, K, IncC, StrideC, L2BlockM,
  163. L2BlockN, L2BlockK );
  164. END Compute;
  165. END MultiplyObjectX;
  166. VAR
  167. (* ran: Random.Generator; (* testing *)*)
  168. cachePool: CachePool;
  169. (*********** Part 0: assembler routines ***************)
  170. PROCEDURE -L1Block1XA( adrA, adrB, adrC: ADDRESS; K: SIZE );
  171. CODE {SYSTEM.i386, SYSTEM.FPU}
  172. MOV EAX, [ESP+K] ; EAX IS counter
  173. MOV EDX, [ESP+adrC]
  174. MOV ECX, [ESP+adrB] ; ECX IS POINTER TO data OF matrix B
  175. MOV EBX, [ESP+adrA] ; EBX IS POINTER TO data OF matrix A
  176. FLD QWORD [EDX] ; S.GET(dadr, x)
  177. loop8:
  178. CMP EAX, 8
  179. JL loop1
  180. FLD QWORD[EBX] ; S.GET(ladr, x)
  181. ADD EBX, 8 ; INC(ladr, incl)
  182. FLD QWORD[ECX] ; S.GET(ladr, y)
  183. ADD ECX, 8 ; INC(radr, incr)
  184. FMULP ; x := x*y
  185. FADDP ; z := z+x
  186. FLD QWORD[EBX] ; S.GET(ladr, x)
  187. ADD EBX, 8 ; INC(ladr, incl)
  188. FLD QWORD[ECX] ; S.GET(ladr, y)
  189. ADD ECX, 8 ; INC(radr, incr)
  190. FMULP ; x := x*y
  191. FADDP ; z := z+x
  192. FLD QWORD[EBX] ; S.GET(ladr, x)
  193. ADD EBX, 8 ; INC(ladr, incl)
  194. FLD QWORD[ECX] ; S.GET(ladr, y)
  195. ADD ECX, 8 ; INC(radr, incr)
  196. FMULP ; x := x*y
  197. FADDP ; z := z+x
  198. FLD QWORD[EBX] ; S.GET(ladr, x)
  199. ADD EBX, 8 ; INC(ladr, incl)
  200. FLD QWORD[ECX] ; S.GET(ladr, y)
  201. ADD ECX, 8 ; INC(radr, incr)
  202. FMULP ; x := x*y
  203. FADDP ; z := z+x
  204. FLD QWORD[EBX] ; S.GET(ladr, x)
  205. ADD EBX, 8 ; INC(ladr, incl)
  206. FLD QWORD[ECX] ; S.GET(ladr, y)
  207. ADD ECX, 8 ; INC(radr, incr)
  208. FMULP ; x := x*y
  209. FADDP ; z := z+x
  210. FLD QWORD[EBX] ; S.GET(ladr, x)
  211. ADD EBX, 8 ; INC(ladr, incl)
  212. FLD QWORD[ECX] ; S.GET(ladr, y)
  213. ADD ECX, 8 ; INC(radr, incr)
  214. FMULP ; x := x*y
  215. FADDP ; z := z+x
  216. FLD QWORD[EBX] ; S.GET(ladr, x)
  217. ADD EBX, 8 ; INC(ladr, incl)
  218. FLD QWORD[ECX] ; S.GET(ladr, y)
  219. ADD ECX, 8 ; INC(radr, incr)
  220. FMULP ; x := x*y
  221. FADDP ; z := z+x
  222. FLD QWORD[EBX] ; S.GET(ladr, x)
  223. ADD EBX, 8 ; INC(ladr, incl)
  224. FLD QWORD[ECX] ; S.GET(ladr, y)
  225. ADD ECX, 8 ; INC(radr, incr)
  226. FMULP ; x := x*y
  227. FADDP ; z := z+x
  228. SUB EAX, 8 ; DEC(len)
  229. JMP loop8 ;
  230. loop1:
  231. CMP EAX, 0 ; WHILE len > 0 DO
  232. JLE endL
  233. FLD QWORD[EBX] ; S.GET(ladr, x)
  234. ADD EBX, 8 ; INC(ladr, incl)
  235. FLD QWORD[ECX] ; S.GET(ladr, y)
  236. ADD ECX, 8 ; INC(radr, incr)
  237. FMULP ; x := x*y
  238. FADDP ; z := z+x
  239. DEC EAX ; DEC(len)
  240. JMP loop1 ;
  241. endL:
  242. FSTP QWORD[EDX] ; S.PUT(dadr, x)
  243. FWAIT ;
  244. ADD ESP, 16 ;
  245. END L1Block1XA;
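(* Scalar equivalent of L1Block1XA (a sketch following the S.GET/S.PUT comments above, not an
   exact transcription of the unrolled FPU code): the routine accumulates a dot product of K
   LONGREAL pairs into the destination element, roughly
       S.GET( adrC, z );
       WHILE K > 0 DO
           S.GET( adrA, x ); INC( adrA, 8 );
           S.GET( adrB, y ); INC( adrB, 8 );
           z := z + x*y; DEC( K );
       END;
       S.PUT( adrC, z );
   with the 8-fold unrolling serving only to reduce loop overhead. *)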
  246. PROCEDURE -L1Block1XSSE( adrA, adrB, adrC: ADDRESS; K: SIZE );
  247. (*
  248. matrixA, matrixB must be stored in special format
  249. K>0 guaranteed
  250. *)
  251. CODE {SYSTEM.i386, SYSTEM.SSE2}
  252. MOV EBX, [ESP+adrA] ; EBX IS POINTER TO data OF matrix A
  253. MOV ECX, [ESP+adrB] ; ECX IS POINTER TO data OF matrix B
  254. MOV EDX, [ESP+K] ; EDX IS counter
  255. XORPD XMM2, XMM2 ;
  256. kLoop8: ;
  257. CMP EDX, 8 ;
  258. JL kLoop2 ;
  259. MOVAPD XMM7, [EBX] ;
  260. MOVAPD XMM0, [ECX] ;
  261. ADD ECX, 16 ;
  262. ADD EBX, 16 ;
  263. MOVAPD XMM6, [EBX] ;
  264. MOVAPD XMM1, [ECX] ;
  265. ADD ECX, 16 ;
  266. ADD EBX, 16 ;
  267. MULPD XMM0, XMM7 ;
  268. ADDPD XMM2, XMM0 ;
  269. MOVAPD XMM5, [EBX] ;
  270. MOVAPD XMM3, [ECX] ;
  271. ADD ECX, 16 ;
  272. ADD EBX, 16 ;
  273. MULPD XMM1, XMM6 ;
  274. ADDPD XMM2, XMM1 ;
  275. MOVAPD XMM7, [EBX] ;
  276. MOVAPD XMM0, [ECX] ;
  277. ADD ECX, 16 ;
  278. ADD EBX, 16 ;
  279. MULPD XMM3, XMM5 ;
  280. ADDPD XMM2, XMM3 ;
  281. MULPD XMM0, XMM7 ;
  282. ADDPD XMM2, XMM0 ;
  283. SUB EDX, 8 ;
  284. JMP kLoop8 ;
  285. kLoop2: ;
  286. CMP EDX, 0 ;
  287. JLE horizontalAdd ;
  288. MOVAPD XMM7, [EBX] ;
  289. MOVAPD XMM0, [ECX] ;
  290. ADD ECX, 16 ;
  291. ADD EBX, 16 ;
  292. MULPD XMM0, XMM7 ;
  293. ADDPD XMM2, XMM0 ;
  294. SUB EDX, 2
  295. JMP kLoop2 ;
  296. horizontalAdd:
  297. MOV EDI, [ESP+adrC] ;
  298. MOVAPD XMM1, XMM2 ;
  299. SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
  300. ADDPD XMM2, XMM1 ;
  301. ADDSD XMM2, [EDI] ;
  302. MOVSD [EDI], XMM2 ;
  303. endL:
  304. ADD ESP, 16 ;
  305. END L1Block1XSSE;
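(* Note on the horizontalAdd step above: the accumulator XMM2 holds two partial sums, one per
   LONGREAL lane; SHUFPD XMM1, XMM1, 1 swaps the lanes of the copy, ADDPD then leaves the full
   sum in the low lane, and ADDSD/MOVSD accumulate it into the destination element at adrC. *)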
  306. PROCEDURE -L1Block5XSSE( adrA, adrB, adrC: ADDRESS; IncC, K: SIZE );
  307. (*
  308. matrixA and matrix B are stored in special format !
  309. K > 0 is guaranteed
  310. *)
  311. CODE {SYSTEM.i386, SYSTEM.SSE2}
  312. MOV EBX, [ESP+adrA] ; EBX IS POINTER TO data OF matrix A
  313. MOV ECX, [ESP+adrB] ; ECX IS POINTER TO data OF matrix B
  314. MOV EDX, [ESP+K] ; EDX IS counter
  315. XORPD XMM2, XMM2 ;
  316. XORPD XMM3, XMM3 ;
  317. XORPD XMM4, XMM4 ;
  318. XORPD XMM5, XMM5 ;
  319. XORPD XMM6, XMM6 ;
  320. kLoop8: ;
  321. CMP EDX, 8 ;
  322. JL kLoop2
  323. ; (*-- 0 -- *) ;
  324. MOVAPD XMM7, [EBX] ; get 4 elements OF A
  325. ADD EBX, 16 ;
  326. MOVAPD XMM0, [ECX] ; get 4 elements OF B
  327. ADD ECX, 16 ;
  328. MOVAPD XMM1, [ECX] ; get 4 elements OF B
  329. ADD ECX, 16 ;
  330. MULPD XMM0, XMM7 ;
  331. ADDPD XMM2, XMM0 ;
  332. MOVAPD XMM0, [ECX] ;
  333. ADD ECX, 16 ;
  334. MULPD XMM1, XMM7 ;
  335. ADDPD XMM3, XMM1 ;
  336. MOVAPD XMM1, [ECX] ;
  337. ADD ECX, 16 ;
  338. MULPD XMM0, XMM7 ;
  339. ADDPD XMM4, XMM0 ;
  340. MOVAPD XMM0, [ECX] ;
  341. ADD ECX, 16 ;
  342. MULPD XMM1, XMM7 ;
  343. ADDPD XMM5, XMM1 ;
  344. MOVAPD XMM1, [ECX] ;
  345. ADD ECX, 16 ;
  346. MULPD XMM0, XMM7 ;
  347. ADDPD XMM6, XMM0
  348. ; (*-- 2 -- *) ;
  349. MOVAPD XMM7, [EBX] ;
  350. ADD EBX, 16 ;
  351. MOVAPD XMM0, [ECX] ;
  352. ADD ECX, 16 ;
  353. MULPD XMM1, XMM7 ;
  354. ADDPD XMM2, XMM1 ;
  355. MOVAPD XMM1, [ECX] ;
  356. ADD ECX, 16 ;
  357. MULPD XMM0, XMM7 ;
  358. ADDPD XMM3, XMM0 ;
  359. MOVAPD XMM0, [ECX] ;
  360. ADD ECX, 16 ;
  361. MULPD XMM1, XMM7 ;
  362. ADDPD XMM4, XMM1 ;
  363. MOVAPD XMM1, [ECX] ;
  364. ADD ECX, 16 ;
  365. MULPD XMM0, XMM7 ;
  366. ADDPD XMM5, XMM0 ;
  367. MOVAPD XMM0, [ECX] ;
  368. ADD ECX, 16 ;
  369. MULPD XMM1, XMM7 ;
  370. ADDPD XMM6, XMM1
  371. ; (*-- 4 -- *) ;
  372. MOVAPD XMM7, [EBX] ;
  373. ADD EBX, 16 ;
  374. MOVAPD XMM1, [ECX] ;
  375. ADD ECX, 16 ;
  376. MULPD XMM0, XMM7 ;
  377. ADDPD XMM2, XMM0 ;
  378. MOVAPD XMM0, [ECX] ;
  379. ADD ECX, 16 ;
  380. MULPD XMM1, XMM7 ;
  381. ADDPD XMM3, XMM1 ;
  382. MOVAPD XMM1, [ECX] ;
  383. ADD ECX, 16 ;
  384. MULPD XMM0, XMM7 ;
  385. ADDPD XMM4, XMM0 ;
  386. MOVAPD XMM0, [ECX] ;
  387. ADD ECX, 16 ;
  388. MULPD XMM1, XMM7 ;
  389. ADDPD XMM5, XMM1 ;
  390. MOVAPD XMM1, [ECX] ;
  391. ADD ECX, 16 ;
  392. MULPD XMM0, XMM7 ;
  393. ADDPD XMM6, XMM0
  394. ; (*-- 6 -- *) ;
  395. MOVAPD XMM7, [EBX] ;
  396. ADD EBX, 16 ;
  397. MOVAPD XMM0, [ECX] ;
  398. ADD ECX, 16 ;
  399. MULPD XMM1, XMM7 ;
  400. ADDPD XMM2, XMM1 ;
  401. MOVAPD XMM1, [ECX] ;
  402. ADD ECX, 16 ;
  403. MULPD XMM0, XMM7 ;
  404. ADDPD XMM3, XMM0 ;
  405. MOVAPD XMM0, [ECX] ;
  406. ADD ECX, 16 ;
  407. MULPD XMM1, XMM7 ;
  408. ADDPD XMM4, XMM1 ;
  409. MOVAPD XMM1, [ECX] ;
  410. ADD ECX, 16 ;
  411. MULPD XMM0, XMM7 ;
  412. ADDPD XMM5, XMM0 ;
  413. MULPD XMM1, XMM7 ;
  414. ADDPD XMM6, XMM1 ;
  415. SUB EDX, 8
  416. JMP kLoop8 ;
  417. kLoop2: ;
  418. CMP EDX, 0 ;
  419. JLE horizontalAdd ;
  420. MOVAPD XMM7, [EBX] ;
  421. ADD EBX, 16 ;
  422. MOVAPD XMM0, [ECX] ;
  423. ADD ECX, 16 ;
  424. MOVAPD XMM1, [ECX] ;
  425. ADD ECX, 16 ;
  426. MULPD XMM0, XMM7 ;
  427. ADDPD XMM2, XMM0 ;
  428. MOVAPD XMM0, [ECX] ;
  429. ADD ECX, 16 ;
  430. MULPD XMM1, XMM7 ;
  431. ADDPD XMM3, XMM1 ;
  432. MOVAPD XMM1, [ECX] ;
  433. ADD ECX, 16 ;
  434. MULPD XMM0, XMM7 ;
  435. ADDPD XMM4, XMM0 ;
  436. MOVAPD XMM0, [ECX] ;
  437. ADD ECX, 16 ;
  438. MULPD XMM1, XMM7 ;
  439. ADDPD XMM5, XMM1 ;
  440. MULPD XMM0, XMM7 ;
  441. ADDPD XMM6, XMM0 ;
  442. SUB EDX, 2
  443. JMP kLoop2 ;
  444. horizontalAdd: ; add and store
  445. MOV EDI, [ESP+adrC] ;
  446. MOV EAX, [ESP+IncC] ;
  447. MOVAPD XMM1, XMM2 ;
  448. SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
  449. ADDPD XMM2, XMM1 ;
  450. ADDSD XMM2, [EDI] ;
  451. MOVSD [EDI], XMM2 ;
  452. ADD EDI, EAX ;
  453. MOVAPD XMM1, XMM3 ;
  454. SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
  455. ADDPD XMM3, XMM1 ;
  456. ADDSD XMM3, [EDI] ;
  457. MOVSD [EDI], XMM3 ;
  458. ADD EDI, EAX ;
  459. MOVAPD XMM1, XMM4 ;
  460. SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
  461. ADDPD XMM4, XMM1 ;
  462. ADDSD XMM4, [EDI] ;
  463. MOVSD [EDI], XMM4 ;
  464. ADD EDI, EAX ;
  465. MOVAPD XMM1, XMM5 ;
  466. SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
  467. ADDPD XMM5, XMM1 ;
  468. ADDSD XMM5, [EDI] ;
  469. MOVSD [EDI], XMM5 ;
  470. ADD EDI, EAX ;
  471. MOVAPD XMM1, XMM6 ;
  472. SHUFPD XMM1, XMM1, 1 ; low bits <- high bits
  473. ADDPD XMM6, XMM1 ;
  474. ADDSD XMM6, [EDI] ;
  475. MOVSD [EDI], XMM6 ;
  476. endL:
  477. ADD ESP, 20 ;
  478. END L1Block5XSSE;
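(* Operand layout assumed by L1Block5XSSE (inferred from the loads above and the "special format"
   remark): adrA points to a packed, 16-byte aligned panel of K LONGREALs of A, and adrB to a
   packed panel in which the corresponding slices of 5 columns of B are interleaved, so that each
   MOVAPD fetches one 2-element slice; the five accumulators XMM2..XMM6 collect the five dot
   products, which horizontalAdd then adds into five destination elements spaced IncC bytes apart
   starting at adrC. *)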
  479. PROCEDURE -L1Block1RA( adrA, adrB, adrC: ADDRESS; K: SIZE );
  480. CODE {SYSTEM.i386, SYSTEM.FPU}
  481. MOV EAX, [ESP+K] ; EAX IS counter
  482. MOV EDX, [ESP+adrC]
  483. MOV ECX, [ESP+adrB] ; ECX IS POINTER TO data OF matrix B
  484. MOV EBX, [ESP+adrA] ; EBX IS POINTER TO data OF matrix A
  485. FLD DWORD [EDX] ; S.GET(dadr, x)
  486. loop16:
  487. CMP EAX, 16
  488. JL loop1
  489. FLD DWORD[EBX] ; S.GET(ladr, x)
  490. ADD EBX, 4 ; INC(ladr, incl)
  491. FLD DWORD[ECX] ; S.GET(ladr, y)
  492. ADD ECX, 4 ; INC(radr, incr)
  493. FMULP ; x := x*y
  494. FADDP ; z := z+x
  495. FLD DWORD[EBX] ; S.GET(ladr, x)
  496. ADD EBX, 4 ; INC(ladr, incl)
  497. FLD DWORD[ECX] ; S.GET(ladr, y)
  498. ADD ECX, 4 ; INC(radr, incr)
  499. FMULP ; x := x*y
  500. FADDP ; z := z+x
  501. FLD DWORD[EBX] ; S.GET(ladr, x)
  502. ADD EBX, 4 ; INC(ladr, incl)
  503. FLD DWORD[ECX] ; S.GET(ladr, y)
  504. ADD ECX, 4 ; INC(radr, incr)
  505. FMULP ; x := x*y
  506. FADDP ; z := z+x
  507. FLD DWORD[EBX] ; S.GET(ladr, x)
  508. ADD EBX, 4 ; INC(ladr, incl)
  509. FLD DWORD[ECX] ; S.GET(ladr, y)
  510. ADD ECX, 4 ; INC(radr, incr)
  511. FMULP ; x := x*y
  512. FADDP ; z := z+x
  513. FLD DWORD[EBX] ; S.GET(ladr, x)
  514. ADD EBX, 4 ; INC(ladr, incl)
  515. FLD DWORD[ECX] ; S.GET(ladr, y)
  516. ADD ECX, 4 ; INC(radr, incr)
  517. FMULP ; x := x*y
  518. FADDP ; z := z+x
  519. FLD DWORD[EBX] ; S.GET(ladr, x)
  520. ADD EBX, 4 ; INC(ladr, incl)
  521. FLD DWORD[ECX] ; S.GET(ladr, y)
  522. ADD ECX, 4 ; INC(radr, incr)
  523. FMULP ; x := x*y
  524. FADDP ; z := z+x
  525. FLD DWORD[EBX] ; S.GET(ladr, x)
  526. ADD EBX, 4 ; INC(ladr, incl)
  527. FLD DWORD[ECX] ; S.GET(ladr, y)
  528. ADD ECX, 4 ; INC(radr, incr)
  529. FMULP ; x := x*y
  530. FADDP ; z := z+x
  531. FLD DWORD[EBX] ; S.GET(ladr, x)
  532. ADD EBX, 4 ; INC(ladr, incl)
  533. FLD DWORD[ECX] ; S.GET(ladr, y)
  534. ADD ECX, 4 ; INC(radr, incr)
  535. FMULP ; x := x*y
  536. FADDP ; z := z+x
  537. FLD DWORD[EBX] ; S.GET(ladr, x)
  538. ADD EBX, 4 ; INC(ladr, incl)
  539. FLD DWORD[ECX] ; S.GET(ladr, y)
  540. ADD ECX, 4 ; INC(radr, incr)
  541. FMULP ; x := x*y
  542. FADDP ; z := z+x
  543. FLD DWORD[EBX] ; S.GET(ladr, x)
  544. ADD EBX, 4 ; INC(ladr, incl)
  545. FLD DWORD[ECX] ; S.GET(ladr, y)
  546. ADD ECX, 4 ; INC(radr, incr)
  547. FMULP ; x := x*y
  548. FADDP ; z := z+x
  549. FLD DWORD[EBX] ; S.GET(ladr, x)
  550. ADD EBX, 4 ; INC(ladr, incl)
  551. FLD DWORD[ECX] ; S.GET(ladr, y)
  552. ADD ECX, 4 ; INC(radr, incr)
  553. FMULP ; x := x*y
  554. FADDP ; z := z+x
  555. FLD DWORD[EBX] ; S.GET(ladr, x)
  556. ADD EBX, 4 ; INC(ladr, incl)
  557. FLD DWORD[ECX] ; S.GET(ladr, y)
  558. ADD ECX, 4 ; INC(radr, incr)
  559. FMULP ; x := x*y
  560. FADDP ; z := z+x
  561. FLD DWORD[EBX] ; S.GET(ladr, x)
  562. ADD EBX, 4 ; INC(ladr, incl)
  563. FLD DWORD[ECX] ; S.GET(ladr, y)
  564. ADD ECX, 4 ; INC(radr, incr)
  565. FMULP ; x := x*y
  566. FADDP ; z := z+x
  567. FLD DWORD[EBX] ; S.GET(ladr, x)
  568. ADD EBX, 4 ; INC(ladr, incl)
  569. FLD DWORD[ECX] ; S.GET(ladr, y)
  570. ADD ECX, 4 ; INC(radr, incr)
  571. FMULP ; x := x*y
  572. FADDP ; z := z+x
  573. FLD DWORD[EBX] ; S.GET(ladr, x)
  574. ADD EBX, 4 ; INC(ladr, incl)
  575. FLD DWORD[ECX] ; S.GET(ladr, y)
  576. ADD ECX, 4 ; INC(radr, incr)
  577. FMULP ; x := x*y
  578. FADDP ; z := z+x
  579. FLD DWORD[EBX] ; S.GET(ladr, x)
  580. ADD EBX, 4 ; INC(ladr, incl)
  581. FLD DWORD[ECX] ; S.GET(ladr, y)
  582. ADD ECX, 4 ; INC(radr, incr)
  583. FMULP ; x := x*y
  584. FADDP ; z := z+x
  585. SUB EAX, 16 ; DEC(len)
  586. JMP loop16 ;
  587. loop1:
  588. CMP EAX, 0 ; WHILE len > 0 DO
  589. JLE endL
  590. FLD DWORD[EBX] ; S.GET(ladr, x)
  591. ADD EBX, 4 ; INC(ladr, incl)
  592. FLD DWORD[ECX] ; S.GET(ladr, y)
  593. ADD ECX, 4 ; INC(radr, incr)
  594. FMULP ; x := x*y
  595. FADDP ; z := z+x
  596. DEC EAX ; DEC(len)
  597. JMP loop1 ;
  598. endL:
  599. FSTP DWORD[EDX] ; S.PUT(dadr, x)
  600. FWAIT ;
  601. ADD ESP, 16 ;
  602. END L1Block1RA;
  603. PROCEDURE -L1Block1RSSE( adrA, adrB, adrC: ADDRESS; K: SIZE );
  604. (*
  605. matrixA, matrixB must be stored in special format
  606. K>0 guaranteed
  607. *)
  608. CODE {SYSTEM.i386, SYSTEM.SSE}
  609. MOV EBX, [ESP+adrA] ; EBX IS POINTER TO data OF matrix A
  610. MOV ECX, [ESP+adrB] ; ECX IS POINTER TO data OF matrix B
  611. MOV EDX, [ESP+K] ; EDX IS counter
  612. XORPS XMM2, XMM2 ;
  613. kLoop16: ;
  614. CMP EDX, 16 ;
  615. JL kLoop4 ;
  616. MOVAPS XMM7, [EBX] ;
  617. MOVAPS XMM0, [ECX] ;
  618. ADD ECX, 16 ;
  619. ADD EBX, 16 ;
  620. MOVAPS XMM6, [EBX] ;
  621. MOVAPS XMM1, [ECX] ;
  622. ADD ECX, 16 ;
  623. ADD EBX, 16 ;
  624. MULPS XMM0, XMM7 ;
  625. ADDPS XMM2, XMM0 ;
  626. MOVAPS XMM5, [EBX] ;
  627. MOVAPS XMM3, [ECX] ;
  628. ADD ECX, 16 ;
  629. ADD EBX, 16 ;
  630. MULPS XMM1, XMM6 ;
  631. ADDPS XMM2, XMM1 ;
  632. MOVAPS XMM7, [EBX] ;
  633. MOVAPS XMM0, [ECX] ;
  634. ADD ECX, 16 ;
  635. ADD EBX, 16 ;
  636. MULPS XMM3, XMM5 ;
  637. ADDPS XMM2, XMM3 ;
  638. MULPS XMM0, XMM7 ;
  639. ADDPS XMM2, XMM0 ;
  640. SUB EDX, 16 ;
  641. JMP kLoop16 ;
  642. kLoop4: ;
  643. CMP EDX, 0 ;
  644. JLE horizontalAdd ;
  645. MOVAPS XMM7, [EBX] ;
  646. MOVAPS XMM0, [ECX] ;
  647. ADD ECX, 16 ;
  648. ADD EBX, 16 ;
  649. MULPS XMM0, XMM7 ;
  650. ADDPS XMM2, XMM0 ;
  651. SUB EDX, 4
  652. JMP kLoop4 ;
  653. horizontalAdd:
  654. MOV EDI, [ESP+adrC] ;
  655. MOVLHPS XMM1, XMM2 ;
  656. ADDPS XMM1, XMM2 ;
  657. SHUFPS XMM2, XMM1, 48 ;
  658. ADDPS XMM2, XMM1 ;
  659. MOVHLPS XMM2, XMM2 ;
  660. ADDSS XMM2, [EDI] ;
  661. MOVSS [EDI], XMM2 ;
  662. endL:
  663. ADD ESP, 16 ;
  664. END L1Block1RSSE;
  665. PROCEDURE -L1Block5RSSE( adrA, adrB, adrC: ADDRESS; IncC, K: SIZE );
  666. (*
  667. matrixA and matrix B are stored in special format !
  668. K > 0 is guaranteed
  669. *)
  670. CODE {SYSTEM.i386, SYSTEM.SSE}
  671. MOV EBX, [ESP+adrA] ; EBX IS POINTER TO data OF matrix A
  672. MOV ECX, [ESP+adrB] ; ECX IS POINTER TO data OF matrix B
  673. MOV EDX, [ESP+K] ; EDX IS counter
  674. XORPS XMM2, XMM2 ;
  675. XORPS XMM3, XMM3 ;
  676. XORPS XMM4, XMM4 ;
  677. XORPS XMM5, XMM5 ;
  678. XORPS XMM6, XMM6 ;
  679. kLoop16: ;
  680. CMP EDX, 16 ;
  681. JL kLoop4 ; (*-- 0 -- *)
  682. MOVAPS XMM7, [EBX] ; get 4 elements OF A
  683. ADD EBX, 16 ;
  684. MOVAPS XMM0, [ECX] ; get 4 elements OF B
  685. ADD ECX, 16 ;
  686. MOVAPS XMM1, [ECX] ; get 4 elements OF B
  687. ADD ECX, 16 ;
  688. MULPS XMM0, XMM7 ;
  689. ADDPS XMM2, XMM0 ;
  690. MOVAPS XMM0, [ECX] ;
  691. ADD ECX, 16 ;
  692. MULPS XMM1, XMM7 ;
  693. ADDPS XMM3, XMM1 ;
  694. MOVAPS XMM1, [ECX] ;
  695. ADD ECX, 16 ;
  696. MULPS XMM0, XMM7 ;
  697. ADDPS XMM4, XMM0 ;
  698. MOVAPS XMM0, [ECX] ;
  699. ADD ECX, 16 ;
  700. MULPS XMM1, XMM7 ;
  701. ADDPS XMM5, XMM1 ;
  702. MOVAPS XMM1, [ECX] ;
  703. ADD ECX, 16 ;
  704. MULPS XMM0, XMM7 ;
  705. ADDPS XMM6, XMM0
  706. ; (*-- 4 -- *) ;
  707. MOVAPS XMM7, [EBX] ;
  708. ADD EBX, 16 ;
  709. MOVAPS XMM0, [ECX] ;
  710. ADD ECX, 16 ;
  711. MULPS XMM1, XMM7 ;
  712. ADDPS XMM2, XMM1 ;
  713. MOVAPS XMM1, [ECX] ;
  714. ADD ECX, 16 ;
  715. MULPS XMM0, XMM7 ;
  716. ADDPS XMM3, XMM0 ;
  717. MOVAPS XMM0, [ECX] ;
  718. ADD ECX, 16 ;
  719. MULPS XMM1, XMM7 ;
  720. ADDPS XMM4, XMM1 ;
  721. MOVAPS XMM1, [ECX] ;
  722. ADD ECX, 16 ;
  723. MULPS XMM0, XMM7 ;
  724. ADDPS XMM5, XMM0 ;
  725. MOVAPS XMM0, [ECX] ;
  726. ADD ECX, 16 ;
  727. MULPS XMM1, XMM7 ;
  728. ADDPS XMM6, XMM1
  729. ; (*-- 8 -- *) ;
  730. MOVAPS XMM7, [EBX] ;
  731. ADD EBX, 16 ;
  732. MOVAPS XMM1, [ECX] ;
  733. ADD ECX, 16 ;
  734. MULPS XMM0, XMM7 ;
  735. ADDPS XMM2, XMM0 ;
  736. MOVAPS XMM0, [ECX] ;
  737. ADD ECX, 16 ;
  738. MULPS XMM1, XMM7 ;
  739. ADDPS XMM3, XMM1 ;
  740. MOVAPS XMM1, [ECX] ;
  741. ADD ECX, 16 ;
  742. MULPS XMM0, XMM7 ;
  743. ADDPS XMM4, XMM0 ;
  744. MOVAPS XMM0, [ECX] ;
  745. ADD ECX, 16 ;
  746. MULPS XMM1, XMM7 ;
  747. ADDPS XMM5, XMM1 ;
  748. MOVAPS XMM1, [ECX] ;
  749. ADD ECX, 16 ;
  750. MULPS XMM0, XMM7 ;
  751. ADDPS XMM6, XMM0
  752. ; (*-- 12 -- *) ;
  753. MOVAPS XMM7, [EBX] ;
  754. ADD EBX, 16 ;
  755. MOVAPS XMM0, [ECX] ;
  756. ADD ECX, 16 ;
  757. MULPS XMM1, XMM7 ;
  758. ADDPS XMM2, XMM1 ;
  759. MOVAPS XMM1, [ECX] ;
  760. ADD ECX, 16 ;
  761. MULPS XMM0, XMM7 ;
  762. ADDPS XMM3, XMM0 ;
  763. MOVAPS XMM0, [ECX] ;
  764. ADD ECX, 16 ;
  765. MULPS XMM1, XMM7 ;
  766. ADDPS XMM4, XMM1 ;
  767. MOVAPS XMM1, [ECX] ;
  768. ADD ECX, 16 ;
  769. MULPS XMM0, XMM7 ;
  770. ADDPS XMM5, XMM0 ;
  771. MULPS XMM1, XMM7 ;
  772. ADDPS XMM6, XMM1 ;
  773. SUB EDX, 16
  774. JMP kLoop16 ;
  775. kLoop4: ;
  776. CMP EDX, 0 ;
  777. JLE horizontalAdd ;
  778. MOVAPS XMM7, [EBX] ;
  779. ADD EBX, 16 ;
  780. MOVAPS XMM0, [ECX] ;
  781. ADD ECX, 16 ;
  782. MOVAPS XMM1, [ECX] ;
  783. ADD ECX, 16 ;
  784. MULPS XMM0, XMM7 ;
  785. ADDPS XMM2, XMM0 ;
  786. MOVAPS XMM0, [ECX] ;
  787. ADD ECX, 16 ;
  788. MULPS XMM1, XMM7 ;
  789. ADDPS XMM3, XMM1 ;
  790. MOVAPS XMM1, [ECX] ;
  791. ADD ECX, 16 ;
  792. MULPS XMM0, XMM7 ;
  793. ADDPS XMM4, XMM0 ;
  794. MOVAPS XMM0, [ECX] ;
  795. ADD ECX, 16 ;
  796. MULPS XMM1, XMM7 ;
  797. ADDPS XMM5, XMM1 ;
  798. MULPS XMM0, XMM7 ;
  799. ADDPS XMM6, XMM0 ;
  800. SUB EDX, 4
  801. JMP kLoop4 ;
  802. horizontalAdd: ; add and store
  803. MOV EDI, [ESP+adrC] ;
  804. MOV EAX, [ESP+IncC] ;
  805. MOVLHPS XMM1, XMM2 ;
  806. ADDPS XMM1, XMM2 ;
  807. SHUFPS XMM2, XMM1, 48 ;
  808. ADDPS XMM2, XMM1 ;
  809. MOVHLPS XMM2, XMM2 ;
  810. ADDSS XMM2, [EDI] ;
  811. MOVSS [EDI], XMM2 ;
  812. ADD EDI, EAX ;
  813. MOVLHPS XMM1, XMM3 ;
  814. ADDPS XMM1, XMM3 ;
  815. SHUFPS XMM3, XMM1, 48 ;
  816. ADDPS XMM3, XMM1 ;
  817. MOVHLPS XMM3, XMM3 ;
  818. ADDSS XMM3, [EDI] ;
  819. MOVSS [EDI], XMM3 ;
  820. ADD EDI, EAX ;
  821. MOVLHPS XMM1, XMM4 ;
  822. ADDPS XMM1, XMM4 ;
  823. SHUFPS XMM4, XMM1, 48 ;
  824. ADDPS XMM4, XMM1 ;
  825. MOVHLPS XMM4, XMM4 ;
  826. ADDSS XMM4, [EDI] ;
  827. MOVSS [EDI], XMM4 ;
  828. ADD EDI, EAX ;
  829. MOVLHPS XMM1, XMM5 ;
  830. ADDPS XMM1, XMM5 ;
  831. SHUFPS XMM5, XMM1, 48 ;
  832. ADDPS XMM5, XMM1 ;
  833. MOVHLPS XMM5, XMM5 ;
  834. ADDSS XMM5, [EDI] ;
  835. MOVSS [EDI], XMM5 ;
  836. ADD EDI, EAX ;
  837. MOVLHPS XMM1, XMM6 ;
  838. ADDPS XMM1, XMM6 ;
  839. SHUFPS XMM6, XMM1, 48 ;
  840. ADDPS XMM6, XMM1 ;
  841. MOVHLPS XMM6, XMM6 ;
  842. ADDSS XMM6, [EDI] ;
  843. MOVSS [EDI], XMM6 ;
  844. endL:
  845. ADD ESP, 20 ;
  846. END L1Block5RSSE;
  847. PROCEDURE -Align4( adr: ADDRESS ): ADDRESS;
  848. CODE {SYSTEM.i386}
  849. MOV EAX, [ESP+adr] ;
  850. NEG EAX ;
  851. AND EAX, 3H ;
  852. ADD EAX, [ESP+adr] ;
  853. ADD ESP, 4
  854. END Align4;
  855. PROCEDURE -Align2( adr: ADDRESS ): ADDRESS;
  856. CODE {SYSTEM.i386}
  857. MOV EAX, [ESP+adr] ;
  858. NEG EAX ;
  859. AND EAX, 1H ;
  860. ADD EAX, [ESP+adr] ;
  861. ADD ESP, 4
  862. END Align2;
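(* Worked example (sketch): Align4 rounds an address up to the next multiple of 4 by adding
   (-adr) MOD 4, e.g. Align4( 1002H ) = 1004H and Align4( 1004H ) = 1004H; Align2 does the same
   for a multiple of 2. *)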
  863. PROCEDURE -ZeroR( adr: ADDRESS; count: SIZE );
  864. (** For 32 bit types *)
  865. CODE {SYSTEM.i386}
  866. MOV EDI, [ESP+adr] ; address OF dest index
  867. MOV ECX, [ESP+count] ; counter
  868. MOV EAX, 0 ; value
  869. CLD ; incremental
  870. REP ;
  871. STOSD ;
  872. ADD ESP, 8 ;
  873. END ZeroR;
  874. PROCEDURE -ZeroX( adr: ADDRESS; count: SIZE );
  875. (** For 64 bit types *)
  876. CODE {SYSTEM.i386}
  877. MOV EDI, [ESP+adr] ; address OF dest index
  878. MOV ECX, [ESP+count] ; counter
  879. SHL ECX, 1 ;
  880. MOV EAX, 0 ; value
  881. CLD ; incremental
  882. REP ;
  883. STOSD ;
  884. ADD ESP, 8 ;
  885. END ZeroX;
  886. PROCEDURE -ZeroRI( adr: ADDRESS; inc, count: SIZE );
  887. (** For 32 bit types *)
  888. CODE {SYSTEM.i386}
  889. MOV EDI, [ESP+adr] ; address OF dest index
  890. MOV EBX, [ESP+inc] ;
  891. MOV ECX, [ESP+count] ; counter
  892. CMP EBX, 4 ;
  893. JE fastzero ;
  894. MOV EAX, 0 ;
  895. loopL:
  896. CMP ECX, 0 ;
  897. JLE endL ;
  898. MOV [EDI], EAX ;
  899. ADD EDI, EBX ;
  900. DEC ECX ;
  901. JMP loopL ;
  902. fastzero:
  903. MOV EAX, 0 ; value
  904. CLD ; incremental
  905. REP ;
  906. STOSD ;
  907. endL:
  908. ADD ESP, 12 ;
  909. END ZeroRI;
  910. PROCEDURE -ZeroXI( adr: ADDRESS; inc, count: SIZE );
  911. (** For 64 bit types *)
  912. CODE {SYSTEM.i386}
  913. MOV EDI, [ESP+adr] ; address OF dest index
  914. MOV EBX, [ESP+inc] ;
  915. MOV ECX, [ESP+count] ; counter
  916. MOV EAX, 0 ;
  917. CMP EBX, 8 ;
  918. JE fastzero ;
  919. loopL:
  920. CMP ECX, 0 ;
  921. JLE endL ;
  922. MOV [EDI], EAX ;
  923. MOV [EDI+4], EAX ;
  924. ADD EDI, EBX ;
  925. DEC ECX ;
  926. JMP loopL ;
  927. fastzero:
  928. SHL ECX, 1 ;
  929. CLD ; incremental
  930. REP ;
  931. STOSD ;
  932. endL:
  933. ADD ESP, 12 ;
  934. END ZeroXI;
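(* Oberon-level sketch of the strided variants (hypothetical, with SYSTEM.PUT32 as shorthand for
   a 32-bit store): ZeroRI writes a zero REAL every inc bytes,
       WHILE count > 0 DO SYSTEM.PUT32( adr, 0 ); INC( adr, inc ); DEC( count ) END;
   and ZeroXI does the same with two 32-bit stores per LONGREAL element; both switch to the
   REP STOSD fast path when the destination is contiguous (inc = 4 resp. inc = 8). *)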
  935. PROCEDURE -MovR( from, to0: ADDRESS; frominc, count: SIZE );
  936. CODE {SYSTEM.i386}
  937. MOV EDI, [ESP+to0] ; TO
  938. MOV ESI, [ESP+from] ; from
  939. MOV ECX, [ESP+count] ; count
  940. MOV EBX, [ESP+frominc] ; inc
  941. CMP EBX, 4 ;
  942. JE fastmove ;
  943. loopL:
  944. CMP ECX, 0 ;
  945. JLE endL ;
  946. MOV EAX, [ESI] ;
  947. MOV [EDI], EAX ;
  948. ADD ESI, EBX ;
  949. ADD EDI, 4 ;
  950. DEC ECX ;
  951. JMP loopL ;
  952. fastmove:
  953. CLD ; incremental
  954. REP ;
  955. MOVSD ; move rest IN 4 byte steps
  956. endL:
  957. ADD ESP, 16 ;
  958. END MovR;
  959. PROCEDURE -MovX( from, to0: ADDRESS; frominc, count:SIZE );
  960. CODE {SYSTEM.i386}
  961. MOV EDI, [ESP+to0] ; TO
  962. MOV ESI, [ESP+from] ; from
  963. MOV ECX, [ESP+count] ; count
  964. MOV EBX, [ESP+frominc] ; inc
  965. CMP EBX, 8 ;
  966. JE fastmove ;
  967. loopL:
  968. CMP ECX, 0 ;
  969. JLE endL ;
  970. MOV EAX, [ESI] ;
  971. MOV [EDI], EAX ;
  972. MOV EAX, [ESI+4] ;
  973. MOV [EDI+4], EAX ;
  974. ADD ESI, EBX ;
  975. ADD EDI, 8 ;
  976. DEC ECX ;
  977. JMP loopL ;
  978. fastmove:
  979. SHL ECX, 1 ;
  980. CLD ; incremental
  981. REP ;
  982. MOVSD ; move rest IN 4 byte steps
  983. endL:
  984. ADD ESP, 16 ;
  985. END MovX;
  986. PROCEDURE -MovR5( src: ADDRESS; inc, stride: SIZE; dest: ADDRESS; count: SIZE);
  987. CODE {SYSTEM.i386}
  988. MOV ESI, [ESP+src] ; src
  989. MOV EBX, [ESP+inc] ; inc
  990. MOV ECX, [ESP+stride] ; stride
  991. MOV EDI, [ESP+dest] ; dest
  992. loopL:
  993. MOV EAX, [ESP+count] ; count
  994. CMP EAX, 0 ;
  995. JLE endL ;
  996. SUB EAX, 4 ;
  997. MOV [ESP+count], EAX ;
  998. MOV EDX, ESI ;
  999. MOV EAX, [EDX] ;
  1000. MOV [EDI], EAX ;
  1001. ADD EDX, EBX ;
  1002. MOV EAX, [EDX] ;
  1003. MOV [EDI+16], EAX ;
  1004. ADD EDX, EBX ;
  1005. MOV EAX, [EDX] ;
  1006. MOV [EDI+32], EAX ;
  1007. ADD EDX, EBX ;
  1008. MOV EAX, [EDX] ;
  1009. MOV [EDI+48], EAX ;
  1010. ADD EDX, EBX ;
  1011. MOV EAX, [EDX] ;
  1012. MOV [EDI+64], EAX ;
  1013. ADD ESI, ECX ;
  1014. ADD EDI, 4 ;
  1015. MOV EDX, ESI ;
  1016. MOV EAX, [EDX] ;
  1017. MOV [EDI], EAX ;
  1018. ADD EDX, EBX ;
  1019. MOV EAX, [EDX] ;
  1020. MOV [EDI+16], EAX ;
  1021. ADD EDX, EBX ;
  1022. MOV EAX, [EDX] ;
  1023. MOV [EDI+32], EAX ;
  1024. ADD EDX, EBX ;
  1025. MOV EAX, [EDX] ;
  1026. MOV [EDI+48], EAX ;
  1027. ADD EDX, EBX ;
  1028. MOV EAX, [EDX] ;
  1029. MOV [EDI+64], EAX ;
  1030. ADD ESI, ECX ;
  1031. ADD EDI, 4 ;
  1032. MOV EDX, ESI ;
  1033. MOV EAX, [EDX] ;
  1034. MOV [EDI], EAX ;
  1035. ADD EDX, EBX ;
  1036. MOV EAX, [EDX] ;
  1037. MOV [EDI+16], EAX ;
  1038. ADD EDX, EBX ;
  1039. MOV EAX, [EDX] ;
  1040. MOV [EDI+32], EAX ;
  1041. ADD EDX, EBX ;
  1042. MOV EAX, [EDX] ;
  1043. MOV [EDI+48], EAX ;
  1044. ADD EDX, EBX ;
  1045. MOV EAX, [EDX] ;
  1046. MOV [EDI+64], EAX ;
  1047. ADD ESI, ECX ;
  1048. ADD EDI, 4 ;
  1049. MOV EDX, ESI ;
  1050. MOV EAX, [EDX] ;
  1051. MOV [EDI], EAX ;
  1052. ADD EDX, EBX ;
  1053. MOV EAX, [EDX] ;
  1054. MOV [EDI+16], EAX ;
  1055. ADD EDX, EBX ;
  1056. MOV EAX, [EDX] ;
  1057. MOV [EDI+32], EAX ;
  1058. ADD EDX, EBX ;
  1059. MOV EAX, [EDX] ;
  1060. MOV [EDI+48], EAX ;
  1061. ADD EDX, EBX ;
  1062. MOV EAX, [EDX] ;
  1063. MOV [EDI+64], EAX ;
  1064. ADD ESI, ECX ;
  1065. ADD EDI, 4 ;
  1066. ADD EDI, 64 ;
  1067. JMP loopL ;
  1068. endL:
  1069. ADD ESP, 20 ;
  1070. END MovR5;
  1071. (* *)
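(* Reading aid (inferred from the operand sizes, not a normative description): in the procedure names below, "X" marks 64 bit LONGREAL data (QWORD / SSE2 double precision), "R" marks 32 bit REAL data (DWORD / SSE single precision), "S" marks a scalar operand, a trailing "A" the generic FPU loop and "SSE" the vectorized path. All Add*Loop* variants effectively compute
	WHILE len > 0 DO
		SYSTEM.GET( ladr, x ); SYSTEM.GET( radr, y ); SYSTEM.PUT( dadr, x + y );
		INC( ladr, linc ); INC( radr, rinc ); INC( dadr, dinc ); DEC( len )
	END;
with the SSE versions merely adding contiguity and alignment dispatch around the same loop. *)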
  1072. PROCEDURE AddAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1073. CODE {SYSTEM.i386, SYSTEM.FPU}
  1074. MOV EAX, [EBP+len] ;
  1075. MOV EBX, [EBP+ladr] ;
  1076. MOV ECX, [EBP+radr] ;
  1077. MOV EDX, [EBP+dadr] ;
  1078. start:
  1079. CMP EAX, 0 ;
  1080. JLE endL ;
  1081. FLD QWORD [EBX] ;
  1082. ADD EBX, [EBP+linc] ;
  1083. FLD QWORD [ECX] ;
  1084. ADD ECX, [EBP+rinc] ;
  1085. FADDP ;
  1086. FSTP QWORD [EDX] ;
  1087. ADD EDX, [EBP+dinc] ;
  1088. DEC EAX ;
  1089. JMP start ;
  1090. endL:
  1091. FWAIT ;
  1092. END AddAXAXLoopA;
  1093. PROCEDURE AddARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1094. CODE {SYSTEM.i386, SYSTEM.FPU}
  1095. MOV EAX, [EBP+len] ;
  1096. MOV EBX, [EBP+ladr] ;
  1097. MOV ECX, [EBP+radr] ;
  1098. MOV EDX, [EBP+dadr] ;
  1099. start:
  1100. CMP EAX, 0 ;
  1101. JLE endL ;
  1102. FLD DWORD [EBX] ;
  1103. ADD EBX, [EBP+linc] ;
  1104. FLD DWORD [ECX] ;
  1105. ADD ECX, [EBP+rinc] ;
  1106. FADDP ;
  1107. FSTP DWORD [EDX] ;
  1108. ADD EDX, [EBP+dinc] ;
  1109. DEC EAX ;
  1110. JMP start ;
  1111. endL:
  1112. FWAIT ;
  1113. END AddARARLoopA;
  1114. PROCEDURE AddAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1115. CODE {SYSTEM.i386, SYSTEM.SSE2}
  1116. MOV EAX, [EBP+len] ;
  1117. CMP EAX, 0 ;
  1118. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  1119. MOV EBX, [EBP+ladr] ;
  1120. MOV ECX, [EBP+radr] ;
  1121. MOV EDX, [EBP+dadr] ;
  1122. ; check IF data are contiguous IN memory
1123. CMP [EBP+linc], 8 ; check left FOR continuity
1124. JNE single ; not continuous -> simplest method
1125. CMP [EBP+rinc], 8 ; check right FOR continuity
1126. JNE single ; not continuous -> simplest method
1127. CMP [EBP+dinc], 8 ; check destination FOR continuity
1128. JNE single ; not continuous -> simplest method
  1129. ; check FOR alignment
  1130. MOV ESI, EBX ;
  1131. AND ESI, 7 ; ladr MOD 8
  1132. CMP ESI, 0 ; = 0- > 64 Bit alignment
  1133. JNE unaligned ; not 64 bit aligned
  1134. MOV ESI, ECX ;
  1135. AND ESI, 7 ; radr MOD 8
  1136. CMP ESI, 0 ; = 0- > 64 Bit alignment
  1137. JNE unaligned ; not 64 bit aligned
  1138. MOV ESI, EDX ;
  1139. AND ESI, 7 ; dadr MOD 8
  1140. CMP ESI, 0 ; = 0- > 64 Bit alignment
  1141. JNE unaligned ; not 64 bit aligned
  1142. MOV ESI, EBX ;
  1143. AND ESI, 8 ; 16 byte alignment
  1144. MOV EDI, ECX ;
  1145. AND EDI, 8 ; 16 byte alignment
  1146. CMP ESI, EDI ;
  1147. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and radr
  1148. MOV EDI, EDX ;
  1149. AND EDI, 8 ; 16 byte alignment
  1150. CMP ESI, EDI ;
  1151. JNE unaligned ; different 16 byte = 128 bit alignment OF dadr and radr
  1152. CMP ESI, 8 ;
1153. JNE aligned ; ladr, radr and dadr already 128 bit aligned
1154. ; one single element processing TO achieve 128 bit alignment
  1155. MOVSD XMM1, [EBX] ;
  1156. MOVSD XMM0, [ECX] ;
  1157. ADDSD XMM0, XMM1 ;
  1158. MOVSD [EDX], XMM0 ;
  1159. ADD EBX, 8 ; now EBX IS 16 byte aligned
1160. ADD ECX, 8 ; now ECX IS 16 byte aligned ;
  1161. ADD EDX, 8 ; now EDX IS 16 byte aligned ;
  1162. DEC EAX ; one element has been processed
  1163. aligned:
  1164. MOV ESI, alignedC ;
  1165. INC ESI ;
  1166. MOV alignedC, ESI ;
  1167. aligned8:
  1168. CMP EAX, 8 ;
1169. JL aligned2 ; len < 8 -> continue WITH 2 pieces
  1170. MOVAPD XMM0, [EBX] ;
  1171. MOVAPD XMM1, [EBX+16] ;
  1172. MOVAPD XMM2, [EBX+32] ;
  1173. MOVAPD XMM3, [EBX+48] ;
  1174. ADD EBX, 64 ;
  1175. MOVAPD XMM4, [ECX] ;
  1176. MOVAPD XMM5, [ECX+16] ;
  1177. MOVAPD XMM6, [ECX+32] ;
  1178. MOVAPD XMM7, [ECX+48] ;
  1179. ADD ECX, 64 ;
  1180. ADDPD XMM0, XMM4 ;
  1181. ADDPD XMM1, XMM5 ;
  1182. ADDPD XMM2, XMM6 ;
  1183. ADDPD XMM3, XMM7 ;
  1184. MOVAPD [EDX], XMM0 ;
  1185. MOVAPD [EDX+16], XMM1 ;
  1186. MOVAPD [EDX+32], XMM2 ;
  1187. MOVAPD [EDX+48], XMM3 ;
  1188. ADD EDX, 64 ;
  1189. SUB EAX, 8 ;
  1190. JMP aligned8 ;
  1191. ; LOOP FOR 2 pieces aligned
  1192. aligned2: ;
  1193. CMP EAX, 2 ;
  1194. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1195. MOVAPD XMM0, [EBX] ;
  1196. ADD EBX, 16 ;
  1197. MOVAPD XMM1, [ECX] ;
  1198. ADD ECX, 16 ;
  1199. ADDPD XMM0, XMM1 ;
  1200. MOVAPD [EDX], XMM0 ;
  1201. ADD EDX, 16 ;
  1202. SUB EAX, 2 ;
  1203. JMP aligned2 ;
  1204. ; LOOP FOR 8 unaligned pieces(14 pieces not better!)
  1205. unaligned: ;
  1206. MOV ESI, unalignedC ;
  1207. INC ESI ;
  1208. MOV unalignedC, ESI ;
  1209. unaligned8: ;
  1210. CMP EAX, 8 ;
1211. JL unaligned2 ; len < 8 -> continue WITH 2 pieces
  1212. MOVUPD XMM0, [EBX] ;
  1213. MOVUPD XMM1, [EBX+16] ;
  1214. MOVUPD XMM2, [EBX+32] ;
  1215. MOVUPD XMM3, [EBX+48] ;
  1216. ADD EBX, 64 ;
  1217. MOVUPD XMM4, [ECX] ;
  1218. MOVUPD XMM5, [ECX+16] ;
  1219. MOVUPD XMM6, [ECX+32] ;
  1220. MOVUPD XMM7, [ECX+48] ;
  1221. ADD ECX, 64 ;
  1222. ADDPD XMM0, XMM4 ;
  1223. ADDPD XMM1, XMM5 ;
  1224. ADDPD XMM2, XMM6 ;
  1225. ADDPD XMM3, XMM7 ;
  1226. MOVUPD [EDX], XMM0 ;
  1227. MOVUPD [EDX+16], XMM1 ;
  1228. MOVUPD [EDX+32], XMM2 ;
  1229. MOVUPD [EDX+48], XMM3 ;
  1230. ADD EDX, 64 ;
  1231. SUB EAX, 8 ;
  1232. JMP unaligned8 ;
1233. ; LOOP FOR 2 pieces unaligned
  1234. unaligned2: ;
  1235. CMP EAX, 2 ;
  1236. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1237. MOVUPD XMM0, [EBX] ;
  1238. ADD EBX, 16 ;
  1239. MOVUPD XMM1, [ECX] ;
  1240. ADD ECX, 16 ;
  1241. ADDPD XMM0, XMM1 ;
  1242. MOVUPD [EDX], XMM0 ;
  1243. ADD EDX, 16 ;
  1244. SUB EAX, 2 ;
  1245. JMP unaligned2 ;
  1246. ; one piece left OR non-contiguous data
  1247. single:
  1248. MOV ESI, singleC ;
  1249. INC ESI ;
  1250. MOV singleC, ESI ;
  1251. singlepieces: ;
  1252. CMP EAX, 0 ;
  1253. JLE endL ; len <= 0- > EXIT
  1254. MOVSD XMM0, [EBX]
  1255. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  1256. MOVSD XMM1, [ECX]
1257. ADD ECX, [EBP+rinc] ; INC(radr, incr)
1258. ADDSD XMM0, XMM1 ;
1259. MOVSD [EDX], XMM0
1260. ADD EDX, [EBP+dinc] ; INC(dadr, incd)
  1261. DEC EAX ; DEC(len)
  1262. JMP singlepieces ;
  1263. endL:
  1264. END AddAXAXLoopSSE;
  1265. PROCEDURE AddARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, dinc, len: SIZE );
  1266. CODE {SYSTEM.i386, SYSTEM.SSE2}
  1267. MOV EAX, [EBP+len] ;
  1268. CMP EAX, 0 ;
  1269. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  1270. MOV EBX, [EBP+ladr] ;
  1271. MOV ECX, [EBP+radr] ;
  1272. MOV EDX, [EBP+dadr] ;
  1273. ; check IF data are contiguous IN memory
1274. CMP [EBP+linc], 4 ; check left FOR continuity
1275. JNE single ; not continuous -> simplest method
1276. CMP [EBP+rinc], 4 ; check right FOR continuity
1277. JNE single ; not continuous -> simplest method
1278. CMP [EBP+dinc], 4 ; check destination FOR continuity
  1279. JNE single ; not continuous- > simplest method
  1280. ; check FOR alignment
  1281. MOV ESI, EBX ;
  1282. AND ESI, 3 ; ladr MOD 4
  1283. CMP ESI, 0 ; = 0- > 32 Bit alignment
  1284. JNE unaligned ; not 32 bit aligned
  1285. MOV ESI, ECX ;
  1286. AND ESI, 3 ; radr MOD 4
  1287. CMP ESI, 0 ; = 0- > 32 Bit alignment
  1288. JNE unaligned ; not 32 bit aligned
  1289. MOV ESI, EDX ;
  1290. AND ESI, 3 ; dadr MOD 4
  1291. CMP ESI, 0 ; = 0- > 32 Bit alignment
  1292. JNE unaligned ; not 32 bit aligned
  1293. MOV ESI, EBX ;
  1294. AND ESI, 8+4 ; 16 byte alignment?
  1295. MOV EDI, ECX ;
  1296. AND EDI, 8+4 ; 16 byte alignment?
  1297. CMP ESI, EDI ;
  1298. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and radr
  1299. MOV EDI, EDX ;
  1300. AND EDI, 8+4 ; 16 byte alignment
  1301. CMP ESI, EDI ;
  1302. JNE unaligned ; different 16 byte = 128 bit alignment OF dadr and radr
  1303. CMP ESI, 0 ;
  1304. JE aligned ; already aligned
  1305. align:
1306. ; one single element processing UNTIL 128 bit alignment achieved
  1307. MOVSS XMM1, [EBX] ;
  1308. MOVSS XMM0, [ECX] ;
  1309. ADDSS XMM0, XMM1 ;
  1310. MOVSS [EDX], XMM0 ;
  1311. ADD EBX, 4 ;
  1312. ADD ECX, 4 ;
  1313. ADD EDX, 4 ;
  1314. DEC EAX ; one element has been processed ;
  1315. CMP EAX, 0 ; all elements already processed?
  1316. JLE single ;
  1317. MOV ESI, EBX ;
  1318. AND ESI, 8+4 ;
  1319. CMP ESI, 0 ;
  1320. JNE align ;
  1321. aligned:
  1322. MOV ESI, alignedC ;
  1323. INC ESI ;
  1324. MOV alignedC, ESI ;
  1325. aligned16:
  1326. CMP EAX, 16 ;
1327. JL aligned4 ; len < 16 -> continue WITH 4 pieces
  1328. MOVAPS XMM0, [EBX] ;
  1329. MOVAPS XMM1, [EBX+16] ;
  1330. MOVAPS XMM2, [EBX+32] ;
  1331. MOVAPS XMM3, [EBX+48] ;
  1332. ADD EBX, 64 ;
  1333. MOVAPS XMM4, [ECX] ;
  1334. MOVAPS XMM5, [ECX+16] ;
  1335. MOVAPS XMM6, [ECX+32] ;
  1336. MOVAPS XMM7, [ECX+48] ;
  1337. ADD ECX, 64 ;
  1338. ADDPS XMM0, XMM4 ;
  1339. ADDPS XMM1, XMM5 ;
  1340. ADDPS XMM2, XMM6 ;
  1341. ADDPS XMM3, XMM7 ;
  1342. MOVAPS [EDX], XMM0 ;
  1343. MOVAPS [EDX+16], XMM1 ;
  1344. MOVAPS [EDX+32], XMM2 ;
  1345. MOVAPS [EDX+48], XMM3 ;
  1346. ADD EDX, 64 ;
  1347. SUB EAX, 16 ;
  1348. JMP aligned16 ;
1349. ; LOOP FOR 4 pieces aligned
  1350. aligned4: ;
  1351. CMP EAX, 4 ;
1352. JL singlepieces ; len < 4 -> EXIT TO singlepieces
  1353. MOVAPS XMM0, [EBX] ;
  1354. ADD EBX, 16 ;
  1355. MOVAPS XMM1, [ECX] ;
  1356. ADD ECX, 16 ;
  1357. ADDPS XMM0, XMM1 ;
  1358. MOVAPS [EDX], XMM0 ;
  1359. ADD EDX, 16 ;
  1360. SUB EAX, 4 ;
  1361. JMP aligned4 ;
1362. ; LOOP FOR 16 unaligned pieces
  1363. unaligned: ;
  1364. MOV ESI, unalignedC ;
  1365. INC ESI ;
  1366. MOV unalignedC, ESI ;
  1367. unaligned16: ;
  1368. CMP EAX, 16 ;
1369. JL unaligned4 ; len < 16 -> continue WITH 4 pieces
  1370. MOVUPS XMM0, [EBX] ;
  1371. MOVUPS XMM1, [EBX+16] ;
  1372. MOVUPS XMM2, [EBX+32] ;
  1373. MOVUPS XMM3, [EBX+48] ;
  1374. ADD EBX, 64 ;
  1375. MOVUPS XMM4, [ECX] ;
  1376. MOVUPS XMM5, [ECX+16] ;
  1377. MOVUPS XMM6, [ECX+32] ;
  1378. MOVUPS XMM7, [ECX+48] ;
  1379. ADD ECX, 64 ;
  1380. ADDPS XMM0, XMM4 ;
  1381. ADDPS XMM1, XMM5 ;
  1382. ADDPS XMM2, XMM6 ;
  1383. ADDPS XMM3, XMM7 ;
  1384. MOVUPS [EDX], XMM0 ;
  1385. MOVUPS [EDX+16], XMM1 ;
  1386. MOVUPS [EDX+32], XMM2 ;
  1387. MOVUPS [EDX+48], XMM3 ;
  1388. ADD EDX, 64 ;
  1389. SUB EAX, 16 ;
  1390. JMP unaligned16 ;
1391. ; LOOP FOR 4 pieces unaligned
1392. unaligned4: ;
1393. CMP EAX, 4 ;
1394. JL singlepieces ; len < 4 -> EXIT TO singlepieces
  1395. MOVUPS XMM0, [EBX] ;
  1396. ADD EBX, 16 ;
  1397. MOVUPS XMM1, [ECX] ;
  1398. ADD ECX, 16 ;
  1399. ADDPS XMM0, XMM1 ;
  1400. MOVUPS [EDX], XMM0 ;
  1401. ADD EDX, 16 ;
  1402. SUB EAX, 4 ;
  1403. JMP unaligned4 ;
  1404. ; one piece left OR non-contiguous data
  1405. single:
  1406. MOV ESI, singleC ;
  1407. INC ESI ;
  1408. MOV singleC, ESI ;
  1409. singlepieces: ;
  1410. CMP EAX, 0 ;
  1411. JLE endL ; len <= 0- > EXIT
  1412. MOVSS XMM0, [EBX]
  1413. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  1414. MOVSS XMM1, [ECX]
1415. ADD ECX, [EBP+rinc] ; INC(radr, incr)
1416. ADDSS XMM0, XMM1 ;
1417. MOVSS [EDX], XMM0
1418. ADD EDX, [EBP+dinc] ; INC(dadr, incd)
  1419. DEC EAX ; DEC(len)
  1420. JMP singlepieces ;
  1421. endL:
  1422. END AddARARLoopSSE;
  1423. PROCEDURE SPAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  1424. CODE {SYSTEM.i386, SYSTEM.FPU}
  1425. MOV EAX, [EBP+len] ; eax := len
  1426. MOV EBX, [EBP+ladr] ; ebx := ladr
  1427. MOV ECX, [EBP+radr] ; ecx := radr
  1428. MOV EDX, [EBP+dadr] ; edx := dadr
  1429. FLD QWORD [EDX] ; S.GET(dadr, x)
  1430. start:
  1431. CMP EAX, 0 ; WHILE len > 0 DO
  1432. JLE endL
  1433. FLD QWORD [EBX] ; S.GET(ladr, x)
  1434. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  1435. FLD QWORD [ECX] ; S.GET(ladr, y)
  1436. FMULP ; x := x*y
  1437. ADD ECX, [EBP+rinc] ; INC(radr, incr)
  1438. FADDP ; z := z+x
  1439. DEC EAX ; DEC(len)
  1440. JMP start ;
  1441. endL:
  1442. FSTP QWORD [EDX] ; S.PUT(dadr, x)
  1443. FWAIT ;
  1444. END SPAXAXLoopA;
  1445. PROCEDURE SPARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  1446. CODE {SYSTEM.i386, SYSTEM.FPU}
  1447. MOV EAX, [EBP+len] ; eax := len
  1448. MOV EBX, [EBP+ladr] ; ebx := ladr
  1449. MOV ECX, [EBP+radr] ; ecx := radr
  1450. MOV EDX, [EBP+dadr] ; edx := dadr
  1451. FLD DWORD [EDX] ; S.GET(dadr, x)
  1452. start:
  1453. CMP EAX, 0 ; WHILE len > 0 DO
  1454. JLE endL
  1455. FLD DWORD [EBX] ; S.GET(ladr, x)
  1456. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  1457. FLD DWORD [ECX] ; S.GET(ladr, y)
  1458. FMULP ; x := x*y
  1459. ADD ECX, [EBP+rinc] ; INC(radr, incr)
  1460. FADDP ; z := z+x
  1461. DEC EAX ; DEC(len)
  1462. JMP start ;
  1463. endL:
  1464. FSTP DWORD [EDX] ; S.PUT(dadr, x)
  1465. FWAIT ;
  1466. END SPARARLoopA;
  1467. (* sse version of scalar product *)
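(* The SP*Loop* procedures accumulate a scalar product into the destination; a plain equivalent of the SSE2 code below (sketch only, x, y, z: LONGREAL) is
	SYSTEM.GET( dadr, z );
	WHILE len > 0 DO
		SYSTEM.GET( ladr, x ); SYSTEM.GET( radr, y ); z := z + x*y;
		INC( ladr, linc ); INC( radr, rinc ); DEC( len )
	END;
	SYSTEM.PUT( dadr, z );
The vector path keeps two partial sums in XMM0 and folds them together at the "horizontaladd" label. *)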
  1468. PROCEDURE SPAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  1469. CODE {SYSTEM.i386, SYSTEM.SSE2}
  1470. ; register initialization
1471. MOV EAX, [EBP+len] ; EAX reserved FOR length
  1472. CMP EAX, 0 ;
  1473. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  1474. MOV EBX, [EBP+ladr] ; EBX reserved FOR ladr
  1475. MOV ECX, [EBP+radr] ; ECX reserved FOR radr
  1476. MOV EDX, [EBP+dadr] ; EDX reserved FOR dadr
  1477. XORPD XMM0, XMM0 ;
  1478. MOVSD XMM0, [EDX] ; destination- > low bytes OF xmm0
1479. CMP [EBP+linc], 8 ; check left FOR continuity
1480. JNE single ; not continuous -> simplest method
1481. CMP [EBP+rinc], 8 ; check right FOR continuity
1482. JNE single ; not continuous -> simplest method
  1483. ; check FOR alignment
  1484. MOV ESI, EBX ;
  1485. AND ESI, 7 ; ladr MOD 8
1486. CMP ESI, 0 ; ESI = 0 -> 64 bit alignment
  1487. JNE unaligned ; not 64 bit aligned
  1488. MOV ESI, ECX ;
  1489. AND ESI, 7 ; radr MOD 8
  1490. CMP ESI, 0 ; = 0- > 64 Bit alignment
  1491. JNE unaligned ; not 64 bit aligned
  1492. MOV ESI, EBX ;
  1493. AND ESI, 8 ; 16 byte alignment
  1494. MOV EDI, ECX ;
  1495. AND EDI, 8 ; 16 byte alignment
  1496. CMP ESI, EDI ;
  1497. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  1498. CMP ESI, 8 ;
  1499. JNE aligned ; ladr and dadr already 128 bit aligned
1500. ; one single element processing TO achieve 128 bit alignment
  1501. MOVSD XMM1, [EBX] ;
  1502. MOVSD XMM2, [ECX] ;
  1503. MULSD XMM1, XMM2 ;
  1504. ADDSD XMM0, XMM1 ;
  1505. ADD EBX, 8 ; now EBX IS 16 byte aligned
1506. ADD ECX, 8 ; now ECX IS 16 byte aligned ;
  1507. DEC EAX ; one element has been processed
1508. ; LOOP FOR 6 pieces aligned
  1509. aligned:
  1510. MOV ESI, alignedC ;
  1511. INC ESI ;
  1512. MOV alignedC, ESI ;
  1513. aligned6:
  1514. CMP EAX, 6 ;
1515. JL aligned2 ; len < 6 -> continue WITH 2 pieces
  1516. MOVAPD XMM1, [EBX] ;
  1517. MOVAPD XMM2, [EBX+16] ;
  1518. MOVAPD XMM3, [EBX+32] ;
  1519. MOVAPD XMM4, [ECX] ;
  1520. MOVAPD XMM5, [ECX+16] ;
  1521. MOVAPD XMM6, [ECX+32] ;
  1522. MULPD XMM1, XMM4 ;
  1523. ADDPD XMM0, XMM1 ;
  1524. MULPD XMM2, XMM5 ;
  1525. ADDPD XMM0, XMM2 ;
  1526. MULPD XMM3, XMM6 ;
  1527. ADDPD XMM0, XMM3 ;
  1528. ADD EBX, 48 ;
  1529. ADD ECX, 48 ;
  1530. SUB EAX, 6 ;
  1531. JMP aligned6 ;
  1532. ; LOOP FOR 2 pieces aligned
  1533. aligned2:
  1534. CMP EAX, 2 ;
1535. JL horizontaladd ; len < 2 -> horizontal add
  1536. MOVAPD XMM1, [EBX] ;
  1537. MOVAPD XMM2, [ECX] ;
  1538. MULPD XMM1, XMM2 ;
  1539. ADDPD XMM0, XMM1 ;
  1540. ADD EBX, 16 ;
  1541. ADD ECX, 16 ;
  1542. SUB EAX, 2 ;
  1543. JMP aligned2 ;
  1544. unaligned:
  1545. MOV ESI, unalignedC ;
  1546. INC ESI ;
  1547. MOV unalignedC, ESI ;
  1548. unaligned6:
  1549. CMP EAX, 6 ;
1550. JL unaligned2 ; len < 6 -> continue WITH 2 pieces
  1551. MOVUPD XMM1, [EBX] ;
  1552. MOVUPD XMM2, [EBX+16] ;
  1553. MOVUPD XMM3, [EBX+32] ;
  1554. MOVUPD XMM4, [ECX] ;
  1555. MOVUPD XMM5, [ECX+16] ;
  1556. MOVUPD XMM6, [ECX+32] ;
  1557. MULPD XMM1, XMM4 ;
  1558. ADDPD XMM0, XMM1 ;
  1559. MULPD XMM2, XMM5 ;
  1560. ADDPD XMM0, XMM2 ;
  1561. MULPD XMM3, XMM6 ;
  1562. ADDPD XMM0, XMM3 ;
  1563. ADD EBX, 48 ;
  1564. ADD ECX, 48 ;
  1565. SUB EAX, 6 ;
  1566. JMP unaligned6 ;
1567. ; LOOP FOR 2 pieces unaligned
1568. unaligned2:
1569. CMP EAX, 2 ;
1570. JL horizontaladd ; len < 2 -> horizontal add
  1571. MOVUPD XMM1, [EBX] ;
  1572. MOVUPD XMM2, [ECX] ;
  1573. MULPD XMM1, XMM2 ;
  1574. ADDPD XMM0, XMM1 ;
  1575. ADD EBX, 16 ;
  1576. ADD ECX, 16 ;
  1577. SUB EAX, 2 ;
  1578. JMP unaligned2 ;
  1579. horizontaladd: ;
  1580. MOVAPD XMM1, XMM0 ;
  1581. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  1582. ADDPD XMM0, XMM1 ;
  1583. JMP singlepieces ;
  1584. single:
  1585. MOV ESI, singleC ;
  1586. INC ESI ;
  1587. MOV singleC, ESI ;
  1588. singlepieces: ;
  1589. CMP EAX, 0 ;
  1590. JLE store ; len <= 0- > EXIT
  1591. MOVSD XMM1, [EBX]
  1592. MOVSD XMM2, [ECX]
  1593. MULSD XMM1, XMM2
  1594. ADDSD XMM0, XMM1
  1595. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  1596. ADD ECX, [EBP+rinc] ; INC(radr, incr)
  1597. DEC EAX ; DEC(len)
  1598. JMP singlepieces ;
  1599. store:
  1600. MOVSD [EDX], XMM0 ;
  1601. endL:
  1602. END SPAXAXLoopSSE;
  1603. (* sse version of scalar product *)
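(* Single precision counterpart of SPAXAXLoopSSE: XMM0 now holds four partial sums, so "horizontaladd" below folds four lanes instead of two. The SHUFPS immediates are written as sums of the lane weights 1, 4, 16 and 64 (2 bits per destination lane); e.g. 1*0 + 4*1 + 16*0 + 64*1 = 68 selects lanes 0,1,0,1. *)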
  1604. PROCEDURE SPARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  1605. CODE {SYSTEM.i386, SYSTEM.SSE}
  1606. ; register initialization
1607. MOV EAX, [EBP+len] ; EAX reserved FOR length
  1608. CMP EAX, 0 ;
  1609. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  1610. MOV EBX, [EBP+ladr] ; EBX reserved FOR ladr
  1611. MOV ECX, [EBP+radr] ; ECX reserved FOR radr
  1612. MOV EDX, [EBP+dadr] ; EDX reserved FOR dadr
  1613. XORPS XMM0, XMM0 ;
  1614. MOVSS XMM0, [EDX] ; destination- > low bytes OF xmm0
1615. CMP [EBP+linc], 4 ; check left FOR continuity
1616. JNE single ; not continuous -> simplest method
1617. CMP [EBP+rinc], 4 ; check right FOR continuity
  1618. JNE single ; not continuous- > simplest method
  1619. ; check FOR alignment
  1620. MOV ESI, EBX ;
  1621. AND ESI, 3 ; ladr MOD 4
1622. CMP ESI, 0 ; ESI = 0 -> 32 bit alignment
  1623. JNE unaligned ; not 32 bit aligned
  1624. MOV ESI, ECX ;
  1625. AND ESI, 3 ; radr MOD 4
  1626. CMP ESI, 0 ; = 0- > 32 Bit alignment
  1627. JNE unaligned ; not 32 bit aligned
  1628. MOV ESI, EBX ;
  1629. AND ESI, 8+4 ; 16 byte alignment
  1630. MOV EDI, ECX ;
  1631. AND EDI, 8+4 ; 16 byte alignment
  1632. CMP ESI, EDI ;
  1633. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  1634. CMP ESI, 0 ;
  1635. JE aligned ; already aligned
  1636. align:
1637. ; one single element processing UNTIL 128 bit alignment achieved
  1638. MOVSS XMM1, [EBX] ;
  1639. MOVSS XMM2, [ECX] ;
  1640. MULSS XMM1, XMM2 ;
  1641. ADDSS XMM0, XMM1 ;
  1642. ADD EBX, 4 ;
  1643. ADD ECX, 4 ;
  1644. DEC EAX ; one element has been processed ;
  1645. CMP EAX, 0 ; all elements already processed?
  1646. JLE single ;
  1647. MOV ESI, EBX ;
  1648. AND ESI, 8+4 ;
  1649. CMP ESI, 0 ;
  1650. JNE align ;
  1651. aligned:
  1652. MOV ESI, alignedC ;
  1653. INC ESI ;
  1654. MOV alignedC, ESI ;
  1655. aligned12:
  1656. CMP EAX, 12 ;
1657. JL aligned4 ; len < 12 -> continue WITH 4 pieces
  1658. MOVAPS XMM1, [EBX] ;
  1659. MOVAPS XMM2, [EBX+16] ;
  1660. MOVAPS XMM3, [EBX+32] ;
  1661. MOVAPS XMM4, [ECX] ;
  1662. MOVAPS XMM5, [ECX+16] ;
  1663. MOVAPS XMM6, [ECX+32] ;
  1664. MULPS XMM1, XMM4 ;
  1665. ADDPS XMM0, XMM1 ;
  1666. MULPS XMM2, XMM5 ;
  1667. ADDPS XMM0, XMM2 ;
  1668. MULPS XMM3, XMM6 ;
  1669. ADDPS XMM0, XMM3 ;
  1670. ADD EBX, 48 ;
  1671. ADD ECX, 48 ;
  1672. SUB EAX, 12 ;
  1673. JMP aligned12 ;
1674. ; LOOP FOR 4 pieces aligned
1675. aligned4:
1676. CMP EAX, 4 ;
1677. JL horizontaladd ; len < 4 -> horizontal add
  1678. MOVAPS XMM1, [EBX] ;
  1679. MOVAPS XMM2, [ECX] ;
  1680. MULPS XMM1, XMM2 ;
  1681. ADDPS XMM0, XMM1 ;
  1682. ADD EBX, 16 ;
  1683. ADD ECX, 16 ;
  1684. SUB EAX, 4 ;
  1685. JMP aligned4 ;
  1686. unaligned:
  1687. MOV ESI, unalignedC ;
  1688. INC ESI ;
  1689. MOV unalignedC, ESI ;
  1690. unaligned12:
  1691. CMP EAX, 12 ;
1692. JL unaligned4 ; len < 12 -> continue WITH 4 pieces
  1693. MOVUPS XMM1, [EBX] ;
  1694. MOVUPS XMM2, [EBX+16] ;
  1695. MOVUPS XMM3, [EBX+32] ;
  1696. MOVUPS XMM4, [ECX] ;
  1697. MOVUPS XMM5, [ECX+16] ;
  1698. MOVUPS XMM6, [ECX+32] ;
  1699. MULPS XMM1, XMM4 ;
  1700. ADDPS XMM0, XMM1 ;
  1701. MULPS XMM2, XMM5 ;
  1702. ADDPS XMM0, XMM2 ;
  1703. MULPS XMM3, XMM6 ;
  1704. ADDPS XMM0, XMM3 ;
  1705. ADD EBX, 48 ;
  1706. ADD ECX, 48 ;
  1707. SUB EAX, 12 ;
  1708. JMP unaligned12 ;
1709. ; LOOP FOR 4 pieces unaligned
1710. unaligned4:
1711. CMP EAX, 4 ;
1712. JL horizontaladd ; len < 4 -> horizontal add
  1713. MOVUPS XMM1, [EBX] ;
  1714. MOVUPS XMM2, [ECX] ;
  1715. MULPS XMM1, XMM2 ;
  1716. ADDPS XMM0, XMM1 ;
  1717. ADD EBX, 16 ;
  1718. ADD ECX, 16 ;
  1719. SUB EAX, 4 ;
  1720. JMP unaligned4 ;
  1721. horizontaladd: ;
  1722. MOVAPS XMM1, XMM0 ;
  1723. ; 1*0 (* dest 0 -> dest 0 *) +4*1 (* dest 1 -> dest 1 *) +16*0 (* src 0 -> dest 2 *) +64*1 (* src 1 -> dest 3 *)
  1724. SHUFPS XMM1, XMM1, 1*0 +4*1 +16*0 +64*1
  1725. ADDPS XMM1, XMM0 ;
  1726. MOVAPS XMM0, XMM1
  1727. SHUFPS XMM0, XMM0, 16*3 ; src 3- > dest 2
  1728. ADDPS XMM0, XMM1 ;
  1729. SHUFPS XMM0, XMM0, 1*2 ; dest 2- > dest 0
  1730. JMP singlepieces ;
  1731. single:
  1732. MOV ESI, singleC ;
  1733. INC ESI ;
  1734. MOV singleC, ESI ;
  1735. singlepieces: ;
  1736. CMP EAX, 0 ;
  1737. JLE store ; len <= 0- > EXIT
  1738. MOVSS XMM1, [EBX]
  1739. MOVSS XMM2, [ECX]
  1740. MULSS XMM1, XMM2
  1741. ADDSS XMM0, XMM1
  1742. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  1743. ADD ECX, [EBP+rinc] ; INC(radr, incr)
  1744. DEC EAX ; DEC(len)
  1745. JMP singlepieces ;
  1746. store:
  1747. MOVSS [EDX], XMM0 ;
  1748. endL:
  1749. END SPARARLoopSSE;
  1750. PROCEDURE MulAXSXLoopA( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  1751. CODE {SYSTEM.i386, SYSTEM.FPU}
  1752. MOV EAX, [EBP+len] ; eax := len
  1753. MOV EBX, [EBP+ladr] ; ebx := ladr
  1754. MOV ECX, [EBP+radr] ; ecx := radr
  1755. MOV EDX, [EBP+dadr] ; edx := dadr
  1756. start:
  1757. CMP EAX, 0 ; WHILE len > 0 DO
  1758. JLE endL
  1759. FLD QWORD [EBX] ; S.GET(ladr, x)
  1760. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  1761. FLD QWORD [ECX] ; S.GET(ladr, y)
  1762. FMULP ; x := x*y
  1763. FSTP QWORD [EDX]
  1764. ADD EDX, [EBP+dinc] ; INC(radr, incr)
  1765. DEC EAX ; DEC(len)
  1766. JMP start ;
  1767. endL:
  1768. FWAIT ;
  1769. END MulAXSXLoopA;
  1770. PROCEDURE MulARSRLoopA( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  1771. CODE {SYSTEM.i386, SYSTEM.FPU}
  1772. MOV EAX, [EBP+len] ; eax := len
  1773. MOV EBX, [EBP+ladr] ; ebx := ladr
  1774. MOV ECX, [EBP+radr] ; ecx := radr
  1775. MOV EDX, [EBP+dadr] ; edx := dadr
  1776. start:
  1777. CMP EAX, 0 ; WHILE len > 0 DO
  1778. JLE endL
  1779. FLD DWORD [EBX] ; S.GET(ladr, x)
  1780. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  1781. FLD DWORD [ECX] ; S.GET(ladr, y)
  1782. FMULP ; x := x*y
  1783. FSTP DWORD [EDX]
  1784. ADD EDX, [EBP+dinc] ; INC(radr, incr)
  1785. DEC EAX ; DEC(len)
  1786. JMP start ;
  1787. endL:
  1788. FWAIT ;
  1789. END MulARSRLoopA;
  1790. PROCEDURE IncMulAXSXLoopA( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  1791. CODE {SYSTEM.i386, SYSTEM.FPU}
  1792. MOV EAX, [EBP+len] ; eax := len
  1793. MOV EBX, [EBP+ladr] ; ebx := ladr
  1794. MOV ECX, [EBP+radr] ; ecx := radr
  1795. MOV EDX, [EBP+dadr] ; edx := dadr
  1796. start:
  1797. CMP EAX, 0 ; WHILE len > 0 DO
  1798. JLE endL
  1799. FLD QWORD [EBX] ; S.GET(ladr, x)
  1800. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  1801. FLD QWORD [ECX] ; S.GET(ladr, y)
  1802. FMULP ; x := x*y
1803. FLD QWORD [EDX] ; current destination element
  1804. FADDP ;
  1805. FSTP QWORD [EDX]
  1806. ADD EDX, [EBP+dinc] ; INC(radr, incr)
  1807. DEC EAX ; DEC(len)
  1808. JMP start ;
  1809. endL:
  1810. FWAIT ;
  1811. END IncMulAXSXLoopA;
  1812. PROCEDURE IncMulARSRLoopA( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  1813. CODE {SYSTEM.i386, SYSTEM.FPU}
  1814. MOV EAX, [EBP+len] ; eax := len
  1815. MOV EBX, [EBP+ladr] ; ebx := ladr
  1816. MOV ECX, [EBP+radr] ; ecx := radr
  1817. MOV EDX, [EBP+dadr] ; edx := dadr
  1818. start:
  1819. CMP EAX, 0 ; WHILE len > 0 DO
  1820. JLE endL
  1821. FLD DWORD [EBX] ; S.GET(ladr, x)
  1822. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  1823. FLD DWORD [ECX] ; S.GET(ladr, y)
  1824. FMULP ; x := x*y
1825. FLD DWORD [EDX] ; current destination element
  1826. FADDP ;
  1827. FSTP DWORD [EDX]
  1828. ADD EDX, [EBP+dinc] ; INC(radr, incr)
  1829. DEC EAX ; DEC(len)
  1830. JMP start ;
  1831. endL:
  1832. FWAIT ;
  1833. END IncMulARSRLoopA;
  1834. PROCEDURE MulAXSXLoopSSE( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  1835. (* simple version, does not yet check for alignment, no parallel executions yet: use full 8 registers! *)
  1836. (*
  1837. 1.) check for same alignment of ladr and dadr (ladr MOD 128 = dadr MOD 128)
  1838. 2.) process starting unaligned data ( using single instructions)
  1839. 3.) process aligned data
  1840. 4.) process remaining unaligned data (using single instructions)
  1841. *)
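(* In scalar terms MulAXSXLoopSSE computes dest := left * scalar, with the scalar taken from radr and broadcast into both halves of XMM0; a minimal equivalent (sketch only, s, x: LONGREAL):
	SYSTEM.GET( radr, s );
	WHILE len > 0 DO
		SYSTEM.GET( ladr, x ); SYSTEM.PUT( dadr, x*s );
		INC( ladr, linc ); INC( dadr, dinc ); DEC( len )
	END;
*)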
  1842. CODE {SYSTEM.i386, SYSTEM.SSE2}
  1843. ; register initialization
1844. MOV EAX, [EBP+len] ; EAX reserved FOR length
  1845. CMP EAX, 0 ;
  1846. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  1847. MOV EBX, [EBP+ladr] ; EBX reserved FOR ladr
  1848. MOV EDX, [EBP+dadr] ; EDX reserved FOR dadr
  1849. MOV ECX, [EBP+radr] ;
  1850. MOVSD XMM0, [ECX] ;
  1851. SHUFPD XMM0, XMM0, 0 ; high bits := low bits
  1852. ; check IF data are contiguous IN memory
1853. CMP [EBP+linc], 8 ; check left FOR continuity
  1854. JNE single ; not continuous- > simplest method
  1855. CMP [EBP+dinc], 8 ; check dest FOR continuity
  1856. JNE single ; not continuous- > simplest method
  1857. ; check FOR alignment
  1858. MOV ECX, EBX ;
  1859. AND ECX, 7 ; ladr MOD 8
  1860. CMP ECX, 0 ; ECX = 0- > 64 Bit alignment
  1861. JNE unaligned ; not 64 bit aligned
  1862. MOV ECX, EDX ;
  1863. AND ECX, 7 ; dadr MOD 8
  1864. CMP ECX, 0 ; ECX = 0- > 64 Bit alignment
  1865. JNE unaligned ; not 64 bit aligned
  1866. MOV ESI, EBX ;
  1867. AND ESI, 8 ; 16 byte alignment
  1868. MOV EDI, EDX ;
  1869. AND EDI, 8 ; 16 byte alignment
  1870. CMP ESI, EDI ;
  1871. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  1872. CMP ESI, 8 ;
  1873. JNE aligned ; ladr and dadr already 128 bit aligned
1874. ; one single element processing TO achieve 128 bit alignment
  1875. MOVSD XMM1, [EBX] ;
  1876. MULSD XMM1, XMM0 ;
  1877. MOVSD [EDX], XMM1 ;
  1878. ADD EBX, 8 ; now EBX IS 16 byte aligned
  1879. ADD EDX, 8 ; now EDX IS 16 byte aligned ;
  1880. DEC EAX ; one element has been processed
  1881. ; LOOP FOR 8 pieces aligned(no better performance with 14 pieces!)
  1882. aligned:
  1883. MOV ECX, alignedC ;
  1884. INC ECX ;
  1885. MOV alignedC, ECX ;
  1886. aligned8:
  1887. CMP EAX, 8 ;
1888. JL aligned2 ; len < 8 -> continue WITH 2 pieces
  1889. MOVAPD XMM1, [EBX] ;
  1890. MOVAPD XMM2, [EBX+16] ;
  1891. MOVAPD XMM3, [EBX+32] ;
  1892. MOVAPD XMM4, [EBX+48] ;
  1893. ADD EBX, 64 ;
  1894. MULPD XMM1, XMM0 ;
  1895. MULPD XMM2, XMM0 ;
  1896. MULPD XMM3, XMM0 ;
  1897. MULPD XMM4, XMM0 ;
  1898. MOVAPD [EDX], XMM1 ;
  1899. MOVAPD [EDX+16], XMM2 ;
  1900. MOVAPD [EDX+32], XMM3 ;
  1901. MOVAPD [EDX+48], XMM4 ;
  1902. ADD EDX, 64 ;
  1903. SUB EAX, 8 ;
  1904. JMP aligned8 ;
  1905. ; LOOP FOR 2 pieces aligned
  1906. aligned2: ;
  1907. CMP EAX, 2 ;
  1908. JL singlepieces ; len < 2- > EXIT TO singlepieces
  1909. MOVAPD XMM1, [EBX] ;
  1910. ADD EBX, 16 ;
  1911. MULPD XMM1, XMM0 ;
  1912. MOVAPD [EDX], XMM1 ;
  1913. ADD EDX, 16 ;
  1914. SUB EAX, 2 ;
  1915. JMP aligned2 ;
  1916. ; LOOP FOR 8 unaligned pieces(14 pieces not better!)
  1917. unaligned: ;
  1918. MOV ECX, unalignedC ;
  1919. INC ECX ;
  1920. MOV unalignedC, ECX ;
  1921. unaligned8: ;
  1922. CMP EAX, 8 ;
1923. JL unaligned2 ; len < 8 -> continue WITH 2 pieces
  1924. MOVUPD XMM1, [EBX] ;
  1925. MOVUPD XMM2, [EBX+16] ;
  1926. MOVUPD XMM3, [EBX+32] ;
  1927. MOVUPD XMM4, [EBX+48] ;
  1928. ADD EBX, 64
  1929. MULPD XMM1, XMM0 ;
  1930. MULPD XMM2, XMM0 ;
  1931. MULPD XMM3, XMM0 ;
  1932. MULPD XMM4, XMM0 ;
  1933. MOVUPD [EDX], XMM1 ;
  1934. MOVUPD [EDX+16], XMM2 ;
  1935. MOVUPD [EDX+32], XMM3 ;
  1936. MOVUPD [EDX+48], XMM4 ;
  1937. ADD EDX, 64 ;
  1938. SUB EAX, 8 ;
  1939. JMP unaligned8 ;
  1940. ; LOOP FOR 2 pieces unaligned
  1941. unaligned2: ;
  1942. CMP EAX, 2 ;
  1943. JL singlepieces ; len < 2- > EXIT
  1944. MOVUPD XMM1, [EBX] ;
  1945. ADD EBX, 16 ;
  1946. MULPD XMM1, XMM0 ;
  1947. MOVUPD [EDX], XMM1 ;
  1948. ADD EDX, 16 ;
  1949. SUB EAX, 2 ;
  1950. JMP unaligned2 ;
  1951. ; one piece left OR non-contiguous data
  1952. single:
  1953. MOV ECX, singleC ;
  1954. INC ECX ;
  1955. MOV singleC, ECX ;
  1956. singlepieces: ;
  1957. CMP EAX, 0 ;
  1958. JLE endL ; len <= 0- > EXIT
  1959. MOVSD XMM1, [EBX]
  1960. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  1961. MULSD XMM1, XMM0
  1962. MOVSD [EDX], XMM1
  1963. ADD EDX, [EBP+dinc] ; INC(radr, incr)
  1964. DEC EAX ; DEC(len)
  1965. JMP singlepieces ;
  1966. endL:
  1967. END MulAXSXLoopSSE;
  1968. PROCEDURE MulARSRLoopSSE( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  1969. (* simple version, does not yet check for alignment, no parallel executions yet: use full 8 registers! *)
  1970. (*
  1971. 1.) check for same alignment of ladr and dadr (ladr MOD 128 = dadr MOD 128)
  1972. 2.) process starting unaligned data ( using single instructions)
  1973. 3.) process aligned data
  1974. 4.) process remaining unaligned data (using single instructions)
  1975. *)
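(* Same operation as MulAXSXLoopSSE, but on 32 bit REALs: the scalar is broadcast into all four lanes of XMM0 (SHUFPS ..., 0), and, since up to three leading elements may lie before a 16 byte boundary, the "align" loop below processes single elements until alignment is reached. *)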
  1976. CODE {SYSTEM.i386, SYSTEM.SSE}
  1977. ; register initialization
1978. MOV EAX, [EBP+len] ; EAX reserved FOR length
  1979. CMP EAX, 0 ;
  1980. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  1981. MOV EBX, [EBP+ladr] ; EBX reserved FOR ladr
  1982. MOV EDX, [EBP+dadr] ; EDX reserved FOR dadr
  1983. MOV ECX, [EBP+radr] ;
  1984. MOVSS XMM0, [ECX] ;
1985. SHUFPS XMM0, XMM0, 0 ; all positions now carry the same value
  1986. ; check IF data are contiguous IN memory
1987. CMP [EBP+linc], 4 ; check left FOR continuity
  1988. JNE single ; not continuous- > simplest method
  1989. CMP [EBP+dinc], 4 ; check dest FOR continuity
  1990. JNE single ; not continuous- > simplest method
  1991. ; check FOR alignment
  1992. MOV ECX, EBX ;
  1993. AND ECX, 3 ; ladr MOD 4
  1994. CMP ECX, 0 ; ECX = 0- > 32 Bit alignment
  1995. JNE unaligned ; not 32 bit aligned
  1996. MOV ECX, EDX ;
  1997. AND ECX, 3 ; dadr MOD 4
  1998. CMP ECX, 0 ; ECX = 0- > 32 Bit alignment
1999. JNE unaligned ; not 32 bit aligned
  2000. MOV ESI, EBX ;
  2001. AND ESI, 8+4 ; 16 byte alignment
  2002. MOV EDI, EDX ;
  2003. AND EDI, 8+4 ; 16 byte alignment
  2004. CMP ESI, EDI ;
  2005. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  2006. CMP ESI, 0 ;
  2007. JE aligned ; already aligned
  2008. align:
2009. ; one single element processing UNTIL 128 bit alignment achieved
  2010. MOVSS XMM1, [EBX] ;
  2011. MULSS XMM1, XMM0 ;
  2012. MOVSS [EDX], XMM1 ;
  2013. ADD EBX, 4 ;
  2014. ADD EDX, 4 ;
  2015. DEC EAX ; one element has been processed ;
  2016. CMP EAX, 0 ; all elements already processed?
  2017. JLE single
  2018. MOV ESI, EBX ;
  2019. AND ESI, 8+4 ;
  2020. CMP ESI, 0 ;
  2021. JNE align ;
  2022. aligned:
  2023. MOV ECX, alignedC ;
  2024. INC ECX ;
  2025. MOV alignedC, ECX ;
  2026. aligned16:
  2027. CMP EAX, 16 ;
2028. JL aligned4 ; len < 16 -> continue WITH 4 pieces
  2029. MOVAPS XMM1, [EBX] ;
  2030. MOVAPS XMM2, [EBX+16] ;
  2031. MOVAPS XMM3, [EBX+32] ;
  2032. MOVAPS XMM4, [EBX+48] ;
  2033. ADD EBX, 64 ;
  2034. MULPS XMM1, XMM0 ;
  2035. MULPS XMM2, XMM0 ;
  2036. MULPS XMM3, XMM0 ;
  2037. MULPS XMM4, XMM0 ;
  2038. MOVAPS [EDX], XMM1 ;
  2039. MOVAPS [EDX+16], XMM2 ;
  2040. MOVAPS [EDX+32], XMM3 ;
  2041. MOVAPS [EDX+48], XMM4 ;
  2042. ADD EDX, 64 ;
  2043. SUB EAX, 16 ;
  2044. JMP aligned16 ;
2045. ; LOOP FOR 4 pieces aligned
2046. aligned4: ;
2047. CMP EAX, 4 ;
2048. JL singlepieces ; len < 4 -> EXIT TO singlepieces
  2049. MOVAPS XMM1, [EBX] ;
  2050. ADD EBX, 16 ;
  2051. MULPS XMM1, XMM0 ;
  2052. MOVAPS [EDX], XMM1 ;
  2053. ADD EDX, 16 ;
  2054. SUB EAX, 4 ;
  2055. JMP aligned4 ;
  2056. ; LOOP FOR 16 unaligned pieces(20 pieces not better!)
  2057. unaligned: ;
  2058. MOV ECX, unalignedC ;
  2059. INC ECX ;
  2060. MOV unalignedC, ECX ;
  2061. unaligned16: ;
  2062. CMP EAX, 16 ;
2063. JL unaligned4 ; len < 16 -> continue WITH 4 pieces
  2064. MOVUPS XMM1, [EBX] ;
  2065. MOVUPS XMM2, [EBX+16] ;
  2066. MOVUPS XMM3, [EBX+32] ;
  2067. MOVUPS XMM4, [EBX+48] ;
  2068. ADD EBX, 64
  2069. MULPS XMM1, XMM0 ;
  2070. MULPS XMM2, XMM0 ;
  2071. MULPS XMM3, XMM0 ;
  2072. MULPS XMM4, XMM0 ;
  2073. MOVUPS [EDX], XMM1 ;
  2074. MOVUPS [EDX+16], XMM2 ;
  2075. MOVUPS [EDX+32], XMM3 ;
  2076. MOVUPS [EDX+48], XMM4 ;
  2077. ADD EDX, 64 ;
  2078. SUB EAX, 16 ;
  2079. JMP unaligned16 ;
2080. ; LOOP FOR 4 pieces unaligned
2081. unaligned4: ;
2082. CMP EAX, 4 ;
2083. JL singlepieces ; len < 4 -> EXIT
  2084. MOVUPS XMM1, [EBX] ;
  2085. ADD EBX, 16 ;
  2086. MULPS XMM1, XMM0 ;
  2087. MOVUPS [EDX], XMM1 ;
  2088. ADD EDX, 16 ;
  2089. SUB EAX, 4 ;
  2090. JMP unaligned4 ;
  2091. ; one piece left OR non-contiguous data
  2092. single:
  2093. MOV ECX, singleC ;
  2094. INC ECX ;
  2095. MOV singleC, ECX ;
  2096. singlepieces: ;
  2097. CMP EAX, 0 ;
  2098. JLE endL ; len <= 0- > EXIT
  2099. MOVSS XMM1, [EBX]
  2100. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  2101. MULSS XMM1, XMM0
  2102. MOVSS [EDX], XMM1
  2103. ADD EDX, [EBP+dinc] ; INC(radr, incr)
  2104. DEC EAX ; DEC(len)
  2105. JMP singlepieces ;
  2106. endL:
  2107. END MulARSRLoopSSE;
  2108. PROCEDURE IncMulAXSXLoopSSE( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2109. (* simple version, does not yet check for alignment, no parallel executions yet: use full 8 registers! *)
  2110. (*
  2111. 1.) check for same alignment of ladr and dadr (ladr MOD 128 = dadr MOD 128)
  2112. 2.) process starting unaligned data ( using single instructions)
  2113. 3.) process aligned data
  2114. 4.) process remaining unaligned data (using single instructions)
  2115. *)
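(* "IncMul" accumulates instead of overwriting, i.e. dest := dest + left * scalar; a plain equivalent (sketch only, s, x, d: LONGREAL):
	SYSTEM.GET( radr, s );
	WHILE len > 0 DO
		SYSTEM.GET( ladr, x ); SYSTEM.GET( dadr, d ); SYSTEM.PUT( dadr, d + x*s );
		INC( ladr, linc ); INC( dadr, dinc ); DEC( len )
	END;
*)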
  2116. CODE {SYSTEM.i386, SYSTEM.SSE2}
  2117. ; register initialization
2118. MOV EAX, [EBP+len] ; EAX reserved FOR length
  2119. CMP EAX, 0 ;
  2120. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  2121. MOV EBX, [EBP+ladr] ; EBX reserved FOR ladr
  2122. MOV EDX, [EBP+dadr] ; EDX reserved FOR dadr
  2123. MOV ECX, [EBP+radr] ;
  2124. MOVSD XMM0, [ECX] ;
  2125. SHUFPD XMM0, XMM0, 0 ; high bits := low bits
  2126. ; check IF data are contiguous IN memory
2127. CMP [EBP+linc], 8 ; check left FOR continuity
  2128. JNE single ; not continuous- > simplest method
  2129. CMP [EBP+dinc], 8 ; check dest FOR continuity
  2130. JNE single ; not continuous- > simplest method
  2131. ; check FOR alignment
  2132. MOV ECX, EBX ;
  2133. AND ECX, 7 ; ladr MOD 8
  2134. CMP ECX, 0 ; ECX = 0- > 64 Bit alignment
  2135. JNE unaligned ; not 64 bit aligned
  2136. MOV ECX, EDX ;
  2137. AND ECX, 7 ; dadr MOD 8
  2138. CMP ECX, 0 ; ECX = 0- > 64 Bit alignment
  2139. JNE unaligned ; not 64 bit aligned
  2140. MOV ESI, EBX ;
  2141. AND ESI, 8 ; 16 byte alignment
  2142. MOV EDI, EDX ;
  2143. AND EDI, 8 ; 16 byte alignment
  2144. CMP ESI, EDI ;
  2145. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  2146. CMP ESI, 8 ;
  2147. JNE aligned ; ladr and dadr already 128 bit aligned
2148. ; one single element processing TO achieve 128 bit alignment
  2149. MOVSD XMM1, [EBX] ;
  2150. MULSD XMM1, XMM0 ;
  2151. MOVSD XMM2, [EDX] ;
  2152. ADDSD XMM1, XMM2 ;
  2153. MOVSD [EDX], XMM1 ;
  2154. ADD EBX, 8 ; now EBX IS 16 byte aligned
  2155. ADD EDX, 8 ; now EDX IS 16 byte aligned ;
  2156. DEC EAX ; one element has been processed
  2157. ; LOOP FOR 8 pieces aligned(no better performance with 14 pieces!)
  2158. aligned:
  2159. MOV ECX, alignedC ;
  2160. INC ECX ;
  2161. MOV alignedC, ECX ;
  2162. aligned8:
  2163. CMP EAX, 8 ;
2164. JL aligned2 ; len < 8 -> continue WITH 2 pieces
  2165. MOVAPD XMM1, [EBX] ;
  2166. MOVAPD XMM2, [EBX+16] ;
  2167. MOVAPD XMM3, [EBX+32] ;
  2168. MOVAPD XMM4, [EBX+48] ;
  2169. ADD EBX, 64 ;
  2170. MULPD XMM1, XMM0 ;
  2171. MULPD XMM2, XMM0 ;
  2172. MULPD XMM3, XMM0 ;
  2173. MULPD XMM4, XMM0 ;
  2174. MOVAPD XMM5, [EDX] ;
  2175. ADDPD XMM1, XMM5
  2176. MOVAPD [EDX], XMM1 ;
  2177. MOVAPD XMM6, [EDX+16] ;
  2178. ADDPD XMM2, XMM6
  2179. MOVAPD [EDX+16], XMM2 ;
  2180. MOVAPD XMM7, [EDX+32] ;
  2181. ADDPD XMM3, XMM7
  2182. MOVAPD [EDX+32], XMM3 ;
  2183. MOVAPD XMM5, [EDX+48] ;
  2184. ADDPD XMM4, XMM5
  2185. MOVAPD [EDX+48], XMM4 ;
  2186. ADD EDX, 64 ;
  2187. SUB EAX, 8 ;
  2188. JMP aligned8 ;
  2189. ; LOOP FOR 2 pieces aligned
  2190. aligned2: ;
  2191. CMP EAX, 2 ;
  2192. JL singlepieces ; len < 2- > EXIT TO singlepieces
  2193. MOVAPD XMM1, [EBX] ;
  2194. ADD EBX, 16 ;
  2195. MULPD XMM1, XMM0 ;
  2196. MOVAPD XMM2, [EDX] ;
  2197. ADDPD XMM1, XMM2
  2198. MOVAPD [EDX], XMM1 ;
  2199. ADD EDX, 16 ;
  2200. SUB EAX, 2 ;
  2201. JMP aligned2 ;
  2202. ; LOOP FOR 8 unaligned pieces(14 pieces not better!)
  2203. unaligned: ;
  2204. MOV ECX, unalignedC ;
  2205. INC ECX ;
  2206. MOV unalignedC, ECX ;
  2207. unaligned8: ;
  2208. CMP EAX, 8 ;
2209. JL unaligned2 ; len < 8 -> continue WITH 2 pieces
  2210. MOVUPD XMM1, [EBX] ;
  2211. MOVUPD XMM2, [EBX+16] ;
  2212. MOVUPD XMM3, [EBX+32] ;
  2213. MOVUPD XMM4, [EBX+48] ;
  2214. ADD EBX, 64
  2215. MULPD XMM1, XMM0 ;
  2216. MULPD XMM2, XMM0 ;
  2217. MULPD XMM3, XMM0 ;
  2218. MULPD XMM4, XMM0 ;
  2219. MOVUPD XMM5, [EDX] ;
  2220. ADDPD XMM1, XMM5
  2221. MOVUPD [EDX], XMM1 ;
  2222. MOVUPD XMM6, [EDX+16] ;
  2223. ADDPD XMM2, XMM6
  2224. MOVUPD [EDX+16], XMM2 ;
  2225. MOVUPD XMM7, [EDX+32] ;
  2226. ADDPD XMM3, XMM7
  2227. MOVUPD [EDX+32], XMM3 ;
  2228. MOVUPD XMM5, [EDX+48] ;
  2229. ADDPD XMM4, XMM5
  2230. MOVUPD [EDX+48], XMM4 ;
  2231. ADD EDX, 64 ;
  2232. SUB EAX, 8 ;
  2233. JMP unaligned8 ;
  2234. ; LOOP FOR 2 pieces unaligned
  2235. unaligned2: ;
  2236. CMP EAX, 2 ;
  2237. JL singlepieces ; len < 2- > EXIT
  2238. MOVUPD XMM1, [EBX] ;
  2239. ADD EBX, 16 ;
  2240. MULPD XMM1, XMM0 ;
  2241. MOVUPD XMM2, [EDX] ;
  2242. ADDPD XMM1, XMM2
  2243. MOVUPD [EDX], XMM1 ;
  2244. ADD EDX, 16 ;
  2245. SUB EAX, 2 ;
  2246. JMP unaligned2 ;
  2247. ; one piece left OR non-contiguous data
  2248. single:
  2249. MOV ECX, singleC ;
  2250. INC ECX ;
  2251. MOV singleC, ECX ;
  2252. singlepieces: ;
  2253. CMP EAX, 0 ;
  2254. JLE endL ; len <= 0- > EXIT
  2255. MOVSD XMM1, [EBX]
  2256. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  2257. MULSD XMM1, XMM0
  2258. MOVSD XMM2, [EDX] ;
  2259. ADDSD XMM1, XMM2
  2260. MOVSD [EDX], XMM1
  2261. ADD EDX, [EBP+dinc] ; INC(radr, incr)
  2262. DEC EAX ; DEC(len)
  2263. JMP singlepieces ;
  2264. endL:
  2265. END IncMulAXSXLoopSSE;
  2266. PROCEDURE IncMulARSRLoopSSE( ladr, radr, dadr: ADDRESS; linc, dinc, len: SIZE );
  2267. (* simple version, does not yet check for alignment, no parallel executions yet: use full 8 registers! *)
  2268. (*
  2269. 1.) check for same alignment of ladr and dadr (ladr MOD 128 = dadr MOD 128)
  2270. 2.) process starting unaligned data ( using single instructions)
  2271. 3.) process aligned data
  2272. 4.) process remaining unaligned data (using single instructions)
  2273. *)
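(* 32 bit REAL counterpart of IncMulAXSXLoopSSE (dest := dest + left * scalar); because the destination is read, updated and written back, every packed store below is preceded by a load of the same address. *)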
  2274. CODE {SYSTEM.i386, SYSTEM.SSE}
  2275. ; register initialization
2276. MOV EAX, [EBP+len] ; EAX reserved FOR length
  2277. CMP EAX, 0 ;
  2278. JLE endL ; nothing TO be done, EAX > 0 guaranteed from here on
  2279. MOV EBX, [EBP+ladr] ; EBX reserved FOR ladr
  2280. MOV EDX, [EBP+dadr] ; EDX reserved FOR dadr
  2281. MOV ECX, [EBP+radr] ;
  2282. MOVSS XMM0, [ECX] ;
2283. SHUFPS XMM0, XMM0, 0 ; all positions now carry the same value
  2284. ; check IF data are contiguous IN memory
2285. CMP [EBP+linc], 4 ; check left FOR continuity
  2286. JNE single ; not continuous- > simplest method
  2287. CMP [EBP+dinc], 4 ; check dest FOR continuity
  2288. JNE single ; not continuous- > simplest method
  2289. ; check FOR alignment
  2290. MOV ECX, EBX ;
  2291. AND ECX, 3 ; ladr MOD 4
  2292. CMP ECX, 0 ; ECX = 0- > 32 Bit alignment
  2293. JNE unaligned ; not 32 bit aligned
  2294. MOV ECX, EDX ;
  2295. AND ECX, 3 ; dadr MOD 4
  2296. CMP ECX, 0 ; ECX = 0- > 32 Bit alignment
2297. JNE unaligned ; not 32 bit aligned
  2298. MOV ESI, EBX ;
  2299. AND ESI, 8+4 ; 16 byte alignment
  2300. MOV EDI, EDX ;
  2301. AND EDI, 8+4 ; 16 byte alignment
  2302. CMP ESI, EDI ;
  2303. JNE unaligned ; different 16 byte = 128 bit alignment OF ladr and dadr
  2304. CMP ESI, 0 ;
  2305. JE aligned ; already aligned
  2306. align:
2307. ; one single element processing UNTIL 128 bit alignment achieved
  2308. MOVSS XMM1, [EBX] ;
  2309. MULSS XMM1, XMM0 ;
  2310. MOVSS XMM2, [EDX] ;
  2311. ADDSS XMM1, XMM2 ;
  2312. MOVSS [EDX], XMM1 ;
  2313. ADD EBX, 4 ;
  2314. ADD EDX, 4 ;
  2315. DEC EAX ; one element has been processed ;
  2316. CMP EAX, 0 ; all elements already processed?
  2317. JLE single
  2318. MOV ESI, EBX ;
  2319. AND ESI, 8+4 ;
  2320. CMP ESI, 0 ;
  2321. JNE align ;
  2322. aligned:
  2323. MOV ECX, alignedC ;
  2324. INC ECX ;
  2325. MOV alignedC, ECX ;
  2326. aligned16:
  2327. CMP EAX, 16 ;
2328. JL aligned4 ; len < 16 -> continue WITH 4 pieces
  2329. MOVAPS XMM1, [EBX] ;
  2330. MOVAPS XMM2, [EBX+16] ;
  2331. MOVAPS XMM3, [EBX+32] ;
  2332. MOVAPS XMM4, [EBX+48] ;
  2333. ADD EBX, 64 ;
  2334. MULPS XMM1, XMM0 ;
  2335. MULPS XMM2, XMM0 ;
  2336. MULPS XMM3, XMM0 ;
  2337. MULPS XMM4, XMM0 ;
  2338. MOVAPS XMM5, [EDX] ;
  2339. ADDPS XMM1, XMM5 ;
  2340. MOVAPS [EDX], XMM1 ;
  2341. MOVAPS XMM6, [EDX+16] ;
  2342. ADDPS XMM2, XMM6 ;
  2343. MOVAPS [EDX+16], XMM2 ;
  2344. MOVAPS XMM7, [EDX+32] ;
  2345. ADDPS XMM3, XMM7 ;
  2346. MOVAPS [EDX+32], XMM3 ;
  2347. MOVAPS XMM5, [EDX+48] ;
  2348. ADDPS XMM4, XMM5 ;
  2349. MOVAPS [EDX+48], XMM4 ;
  2350. ADD EDX, 64 ;
  2351. SUB EAX, 16 ;
  2352. JMP aligned16 ;
2353. ; LOOP FOR 4 pieces aligned
2354. aligned4: ;
2355. CMP EAX, 4 ;
2356. JL singlepieces ; len < 4 -> EXIT TO singlepieces
  2357. MOVAPS XMM1, [EBX] ;
  2358. ADD EBX, 16 ;
  2359. MULPS XMM1, XMM0 ;
  2360. MOVAPS XMM2, [EDX] ;
  2361. ADDPS XMM1, XMM2 ;
  2362. MOVAPS [EDX], XMM1 ;
  2363. ADD EDX, 16 ;
  2364. SUB EAX, 4 ;
  2365. JMP aligned4 ;
  2366. ; LOOP FOR 16 unaligned pieces(20 pieces not better!)
  2367. unaligned: ;
  2368. MOV ECX, unalignedC ;
  2369. INC ECX ;
  2370. MOV unalignedC, ECX ;
  2371. unaligned16: ;
  2372. CMP EAX, 16 ;
2373. JL unaligned4 ; len < 16 -> continue WITH 4 pieces
  2374. MOVUPS XMM1, [EBX] ;
  2375. MOVUPS XMM2, [EBX+16] ;
  2376. MOVUPS XMM3, [EBX+32] ;
  2377. MOVUPS XMM4, [EBX+48] ;
  2378. ADD EBX, 64
  2379. MULPS XMM1, XMM0 ;
  2380. MULPS XMM2, XMM0 ;
  2381. MULPS XMM3, XMM0 ;
  2382. MULPS XMM4, XMM0 ;
  2383. MOVUPS XMM5, [EDX] ;
  2384. ADDPS XMM1, XMM5 ;
  2385. MOVUPS [EDX], XMM1 ;
  2386. MOVUPS XMM6, [EDX+16] ;
  2387. ADDPS XMM2, XMM6 ;
  2388. MOVUPS [EDX+16], XMM2 ;
  2389. MOVUPS XMM7, [EDX+32] ;
  2390. ADDPS XMM3, XMM7 ;
  2391. MOVUPS [EDX+32], XMM3 ;
  2392. MOVUPS XMM5, [EDX+48] ;
  2393. ADDPS XMM4, XMM5 ;
  2394. MOVUPS [EDX+48], XMM4 ;
  2395. ADD EDX, 64 ;
  2396. SUB EAX, 16 ;
  2397. JMP unaligned16 ;
2398. ; LOOP FOR 4 pieces unaligned
2399. unaligned4: ;
2400. CMP EAX, 4 ;
2401. JL singlepieces ; len < 4 -> EXIT
  2402. MOVUPS XMM1, [EBX] ;
  2403. ADD EBX, 16 ;
  2404. MULPS XMM1, XMM0 ;
  2405. MOVUPS XMM2, [EDX] ;
  2406. ADDPS XMM1, XMM2 ;
  2407. MOVUPS [EDX], XMM1 ;
  2408. ADD EDX, 16 ;
  2409. SUB EAX, 4 ;
  2410. JMP unaligned4 ;
  2411. ; one piece left OR non-contiguous data
  2412. single:
  2413. MOV ECX, singleC ;
  2414. INC ECX ;
  2415. MOV singleC, ECX ;
  2416. singlepieces: ;
  2417. CMP EAX, 0 ;
  2418. JLE endL ; len <= 0- > EXIT
  2419. MOVSS XMM1, [EBX]
  2420. ADD EBX, [EBP+linc] ; INC(ladr, incl)
  2421. MULSS XMM1, XMM0
  2422. MOVSS XMM2, [EDX] ;
  2423. ADDSS XMM1, XMM2 ;
  2424. MOVSS [EDX], XMM1
  2425. ADD EDX, [EBP+dinc] ; INC(radr, incr)
  2426. DEC EAX ; DEC(len)
  2427. JMP singlepieces ;
  2428. endL:
  2429. END IncMulARSRLoopSSE;
  2430. (*
  2431. PROCEDURE AlignedSPXSSE5( ladr, radr, dadr, stride, incd, len: LONGINT );
  2432. CODE {SYSTEM.i386, SYSTEM.SSE2}
  2433. ; ; register initialization
  2434. MOV EDX, [EBP+dadr] ; EDX reserved for dadr
  2435. MOV EBX, [EBP+ladr] ; EBX reserved for ladr
  2436. MOV ESI, [EBP+radr] ; ESI reserved for radr
  2437. MOV EAX, [EBP+len] ; EAX reserverd for length
  2438. MOV ECX, [EBP+stride] ; ECX reserved for stride
  2439. XORPD XMM2, XMM2 ;
  2440. XORPD XMM3, XMM3 ;
  2441. XORPD XMM4, XMM4 ;
  2442. XORPD XMM5, XMM5 ;
  2443. XORPD XMM6, XMM6 ;
  2444. XOR EDI, EDI ;
  2445. aligned4:
  2446. CMP EAX, 4 ;
  2447. JL aligned2 ; ; len < 4- > exit to singlepieces
  2448. MOV ESI, [EBP+radr] ;
  2449. ADD ESI, EDI ;
  2450. MOVAPD XMM7, [EBX] ;
  2451. MOVAPD XMM0, [ESI] ;
  2452. ADD ESI, ECX ;
  2453. MOVAPD XMM1, [ESI] ;
  2454. MULPD XMM0, XMM7 ;
  2455. ADDPD XMM2, XMM0 ;
  2456. ADD ESI, ECX ;
  2457. MOVAPD XMM0, [ESI] ;
  2458. MULPD XMM1, XMM7 ;
  2459. ADDPD XMM3, XMM1 ;
  2460. ADD ESI, ECX ;
  2461. MOVAPD XMM1, [ESI] ;
  2462. MULPD XMM0, XMM7 ;
  2463. ADDPD XMM4, XMM0 ;
  2464. ADD ESI, ECX ;
  2465. MOVAPD XMM0, [ESI] ;
  2466. MULPD XMM1, XMM7 ;
  2467. ADDPD XMM5, XMM1 ;
  2468. MULPD XMM0, XMM7 ;
  2469. ADDPD XMM6, XMM0 ;
  2470. ADD EBX, 16 ;
  2471. ADD EDI, 16 ;
  2472. MOV ESI, [EBP+radr] ;
  2473. ADD ESI, EDI ;
  2474. MOVAPD XMM7, [EBX] ;
  2475. MOVAPD XMM0, [ESI] ;
  2476. ADD ESI, ECX ;
  2477. MOVAPD XMM1, [ESI] ;
  2478. MULPD XMM0, XMM7 ;
  2479. ADDPD XMM2, XMM0 ;
  2480. ADD ESI, ECX ;
  2481. MOVAPD XMM0, [ESI] ;
  2482. MULPD XMM1, XMM7 ;
  2483. ADDPD XMM3, XMM1 ;
  2484. ADD ESI, ECX ;
  2485. MOVAPD XMM1, [ESI] ;
  2486. MULPD XMM0, XMM7 ;
  2487. ADDPD XMM4, XMM0 ;
  2488. ADD ESI, ECX ;
  2489. MOVAPD XMM0, [ESI] ;
  2490. MULPD XMM1, XMM7 ;
  2491. ADDPD XMM5, XMM1 ;
  2492. MULPD XMM0, XMM7 ;
  2493. ADDPD XMM6, XMM0 ;
  2494. ADD EBX, 16 ;
  2495. ADD EDI, 16 ;
  2496. SUB EAX, 4 ;
  2497. JMP aligned4 ;
  2498. aligned2:
  2499. CMP EAX, 2 ;
  2500. JL horizontaladd ; ; len < 4- > exit to singlepieces
  2501. MOV ESI, [EBP+radr] ;
  2502. ADD ESI, EDI ;
  2503. MOVAPD XMM7, [EBX] ;
  2504. MOVAPD XMM0, [ESI] ;
  2505. ADD ESI, ECX ;
  2506. MOVAPD XMM1, [ESI] ;
  2507. MULPD XMM0, XMM7 ;
  2508. ADDPD XMM2, XMM0 ;
  2509. ADD ESI, ECX ;
  2510. MOVAPD XMM0, [ESI] ;
  2511. MULPD XMM1, XMM7 ;
  2512. ADDPD XMM3, XMM1 ;
  2513. ADD ESI, ECX ;
  2514. MOVAPD XMM1, [ESI] ;
  2515. MULPD XMM0, XMM7 ;
  2516. ADDPD XMM4, XMM0 ;
  2517. ADD ESI, ECX ;
  2518. MOVAPD XMM0, [ESI] ;
  2519. MULPD XMM1, XMM7 ;
  2520. ADDPD XMM5, XMM1 ;
  2521. MULPD XMM0, XMM7 ;
  2522. ADDPD XMM6, XMM0 ;
  2523. ADD EBX, 16 ;
  2524. ADD EDI, 16 ;
  2525. SUB EAX, 2 ;
  2526. JMP aligned2 ;
  2527. horizontaladd: ;
  2528. MOVAPD XMM1, XMM2 ;
  2529. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  2530. ADDPD XMM2, XMM1 ;
  2531. MOVAPD XMM1, XMM3 ;
  2532. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  2533. ADDPD XMM3, XMM1 ;
  2534. MOVAPD XMM1, XMM4 ;
  2535. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  2536. ADDPD XMM4, XMM1 ;
  2537. MOVAPD XMM1, XMM5 ;
  2538. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  2539. ADDPD XMM5, XMM1 ;
  2540. MOVAPD XMM1, XMM6 ;
  2541. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  2542. ADDPD XMM6, XMM1 ;
  2543. singlepieces: ;
  2544. CMP EAX, 0 ;
  2545. JLE store ; len <= 0- > exit
  2546. MOV ESI, [EBP+radr] ;
  2547. MOVSD XMM7, [EBX] ;
  2548. MOVSD XMM0, [ESI+EDI] ;
  2549. ADD ESI, ECX ;
  2550. MOVSD XMM1, [ESI+EDI] ;
  2551. MULSD XMM0, XMM7 ;
  2552. ADDSD XMM2, XMM0 ;
  2553. ADD ESI, ECX ;
  2554. MOVSD XMM0, [ESI+EDI] ;
  2555. MULSD XMM1, XMM7 ;
  2556. ADDSD XMM3, XMM1 ;
  2557. ADD ESI, ECX ;
  2558. MOVSD XMM1, [ESI+EDI] ;
  2559. MULSD XMM0, XMM7 ;
  2560. ADDSD XMM4, XMM0 ;
  2561. ADD ESI, ECX ;
  2562. MOVSD XMM1, [ESI+EDI] ;
  2563. MULSD XMM0, XMM7 ;
  2564. ADDSD XMM4, XMM0 ;
  2565. ADD ESI, ECX ;
  2566. MOVSD XMM0, [ESI+EDI] ;
  2567. MULSD XMM1, XMM7 ;
  2568. ADDSD XMM5, XMM1 ;
  2569. MULSD XMM0, XMM7 ;
  2570. ADDSD XMM6, XMM0 ;
  2571. ADD EBX, 4 (* INC(ladr,incl) *)
  2572. ADD EDI, 4 (* INC(radr,incr) *)
  2573. DEC EAX ; DEC(len)
  2574. JMP singlepieces ;
  2575. store:
  2576. MOVSD [EDX], XMM2 ;
  2577. ADD EDX, [EBP+incd] ;
  2578. MOVSD [EDX], XMM3 ;
  2579. ADD EDX, [EBP+incd] ;
  2580. MOVSD [EDX], XMM4 ;
  2581. ADD EDX, [EBP+incd] ;
  2582. MOVSD [EDX], XMM5 ;
  2583. ADD EDX, [EBP+incd] ;
  2584. MOVSD [EDX], XMM6 ;
  2585. end:
  2586. END AlignedSPXSSE5;
  2587. *)
  2588. (* sse version of scalar product *)
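(* The Aligned* scalar products drop the run time checks of the general loops: they presuppose contiguous operands (increment 8 bytes for LONGREAL, 4 bytes for REAL) that are 16 byte aligned, since MOVAPD/MOVAPS is used directly. Semantically (sketch only, using array notation):
	IF add THEN SYSTEM.GET( dadr, z ) ELSE z := 0 END;
	FOR i := 0 TO len-1 DO z := z + l[i]*r[i] END;
	SYSTEM.PUT( dadr, z );
*)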
  2589. PROCEDURE AlignedSPXSSE( ladr, radr, dadr: ADDRESS; len: SIZE;
  2590. add: BOOLEAN );
  2591. CODE {SYSTEM.i386, SYSTEM.SSE2}
  2592. ; register initialization
2593. MOV EAX, [EBP+len] ; EAX reserved FOR length
  2594. MOV EBX, [EBP+ladr] ; EBX reserved FOR ladr
  2595. MOV ECX, [EBP+radr] ; ECX reserved FOR radr
  2596. MOV EDX, [EBP+dadr] ; EDX reserved FOR dadr
  2597. XORPD XMM0, XMM0 ;
  2598. CMP [EBP+add], 0 ; add?
  2599. JE aligned8 ; no add
  2600. MOVSD XMM0, [EDX] ;
  2601. aligned8:
  2602. CMP EAX, 8 ;
2603. JL aligned2 ; len < 8 -> continue WITH 2 pieces
  2604. MOVAPD XMM1, [EBX] ;
  2605. MOVAPD XMM2, [EBX+16] ;
  2606. MOVAPD XMM3, [EBX+32] ;
  2607. MOVAPD XMM4, [ECX] ;
  2608. MOVAPD XMM5, [ECX+16] ;
  2609. MOVAPD XMM6, [ECX+32] ;
  2610. MULPD XMM1, XMM4 ;
  2611. ADDPD XMM0, XMM1 ;
  2612. MULPD XMM2, XMM5 ;
  2613. ADDPD XMM0, XMM2 ;
  2614. MULPD XMM3, XMM6 ;
  2615. ADDPD XMM0, XMM3 ;
  2616. MOVAPD XMM7, [EBX+48] ;
  2617. MOVAPD XMM1, [ECX+48] ;
  2618. MULPD XMM1, XMM7 ;
  2619. ADDPD XMM0, XMM1 ;
  2620. ADD EBX, 64 ;
  2621. ADD ECX, 64 ;
  2622. SUB EAX, 8 ;
  2623. JMP aligned8 ;
2624. ; LOOP FOR 4 pieces aligned
  2625. aligned4:
  2626. CMP EAX, 4 ;
  2627. JL aligned2 ; ; len < 4- > EXIT TO singlepieces
  2628. MOVAPD XMM1, [EBX] ;
  2629. MOVAPD XMM2, [ECX] ;
  2630. MOVAPD XMM3, [EBX+16] ;
  2631. MOVAPD XMM4, [ECX+16] ;
  2632. MULPD XMM1, XMM2 ;
  2633. ADDPD XMM0, XMM1 ;
  2634. MULPD XMM3, XMM4 ;
  2635. ADDPD XMM0, XMM3 ;
  2636. ADD EBX, 32 ;
  2637. ADD ECX, 32 ;
  2638. SUB EAX, 4 ;
  2639. JMP aligned4 ;
  2640. aligned2:
  2641. CMP EAX, 2 ;
2642. JL horizontaladd ; len < 2 -> horizontal add
  2643. MOVAPD XMM1, [EBX] ;
  2644. MOVAPD XMM2, [ECX] ;
  2645. MULPD XMM1, XMM2 ;
  2646. ADDPD XMM0, XMM1 ;
  2647. ADD EBX, 16 ;
  2648. ADD ECX, 16 ;
  2649. SUB EAX, 2 ;
  2650. JMP aligned2 ;
  2651. horizontaladd: ;
  2652. MOVAPD XMM1, XMM0 ;
  2653. SHUFPD XMM1, XMM1, 1 ; low bits < -high bits
  2654. ADDPD XMM0, XMM1 ;
  2655. singlepieces: ;
  2656. CMP EAX, 0 ;
  2657. JLE store ; len <= 0- > EXIT
  2658. MOVSD XMM1, [EBX]
  2659. MOVSD XMM2, [ECX]
  2660. MULSD XMM1, XMM2
  2661. ADDSD XMM0, XMM1
  2662. ADD EBX, 8 ; INC(ladr, incl)
  2663. ADD ECX, 8 ; INC(radr, incr)
  2664. DEC EAX ; DEC(len)
  2665. JMP singlepieces ;
  2666. store:
  2667. MOVSD [EDX], XMM0 ;
  2668. endL:
  2669. END AlignedSPXSSE;
  2670. (*
  2671. PROCEDURE AlignedSPRSSE5( ladr, radr, dadr, stride, incd, len: LONGINT );
  2672. CODE {SYSTEM.i386, SYSTEM.SSE}
  2673. ; register initialization
  2674. MOV EDX, [EBP+dadr] ; EDX reserved for dadr
  2675. MOV EBX, [EBP+ladr] ; EBX reserved for ladr
  2676. MOV ESI, [EBP+radr] ; ECX reserved for radr
  2677. MOV EAX, [EBP+len] ; EAX reserverd for length
  2678. MOV ECX, [EBP+stride] ;
  2679. XORPS XMM2, XMM2 ;
  2680. XORPS XMM3, XMM3 ;
  2681. XORPS XMM4, XMM4 ;
  2682. XORPS XMM5, XMM5 ;
  2683. XORPS XMM6, XMM6 ;
  2684. XOR EDI, EDI ;
  2685. aligned8:
  2686. CMP EAX, 8 ;
  2687. JL aligned4 ; ; len < 4- > exit to singlepieces
  2688. PREFETCH0 24[EBX] ;
  2689. ; PREFETCH0[ESI] ;
  2690. MOV ESI, [EBP+radr] ;
  2691. ADD ESI, EDI ;
  2692. MOVAPS XMM7, [EBX] ;
  2693. MOVAPS XMM0, [ESI] ;
  2694. ADD ESI, ECX ;
  2695. MOVAPS XMM1, [ESI] ;
  2696. MULPS XMM0, XMM7 ;
  2697. ADDPS XMM2, XMM0 ;
  2698. ADD ESI, ECX ;
  2699. MOVAPS XMM0, [ESI] ;
  2700. MULPS XMM1, XMM7 ;
  2701. ADDPS XMM3, XMM1 ;
  2702. ADD ESI, ECX ;
  2703. MOVAPS XMM1, [ESI] ;
  2704. MULPS XMM0, XMM7 ;
  2705. ADDPS XMM4, XMM0 ;
  2706. ADD ESI, ECX ;
  2707. MOVAPS XMM0, [ESI] ;
  2708. MULPS XMM1, XMM7 ;
  2709. ADDPS XMM5, XMM1 ;
  2710. MULPS XMM0, XMM7 ;
  2711. ADDPS XMM6, XMM0 ;
  2712. ADD EBX, 16 ;
  2713. ADD EDI, 16 ;
  2714. MOV ESI, [EBP+radr] ;
  2715. ADD ESI, EDI ;
  2716. MOVAPS XMM7, [EBX] ;
  2717. MOVAPS XMM0, [ESI] ;
  2718. ADD ESI, ECX ;
  2719. MOVAPS XMM1, [ESI] ;
  2720. MULPS XMM0, XMM7 ;
  2721. ADDPS XMM2, XMM0 ;
  2722. ADD ESI, ECX ;
  2723. MOVAPS XMM0, [ESI] ;
  2724. MULPS XMM1, XMM7 ;
  2725. ADDPS XMM3, XMM1 ;
  2726. ADD ESI, ECX ;
  2727. MOVAPS XMM1, [ESI] ;
  2728. MULPS XMM0, XMM7 ;
  2729. ADDPS XMM4, XMM0 ;
  2730. ADD ESI, ECX ;
  2731. MOVAPS XMM0, [ESI] ;
  2732. MULPS XMM1, XMM7 ;
  2733. ADDPS XMM5, XMM1 ;
  2734. MULPS XMM0, XMM7 ;
  2735. ADDPS XMM6, XMM0 ;
  2736. ADD EBX, 16 ;
  2737. ADD EDI, 16 ;
  2738. SUB EAX, 8 ;
  2739. JMP aligned8 ;
  2740. aligned4:
  2741. CMP EAX, 4 ;
  2742. JL horizontaladd ; ; len < 4- > exit to singlepieces
  2743. MOV ESI, [EBP+radr] ;
  2744. ADD ESI, EDI ;
  2745. MOVAPS XMM7, [EBX] ;
  2746. MOVAPS XMM0, [ESI] ;
  2747. ADD ESI, ECX ;
  2748. MOVAPS XMM1, [ESI] ;
  2749. MULPS XMM0, XMM7 ;
  2750. ADDPS XMM2, XMM0 ;
  2751. ADD ESI, ECX ;
  2752. MOVAPS XMM0, [ESI] ;
  2753. MULPS XMM1, XMM7 ;
  2754. ADDPS XMM3, XMM1 ;
  2755. ADD ESI, ECX ;
  2756. MOVAPS XMM1, [ESI] ;
  2757. MULPS XMM0, XMM7 ;
  2758. ADDPS XMM4, XMM0 ;
  2759. ADD ESI, ECX ;
  2760. MOVAPS XMM0, [ESI] ;
  2761. MULPS XMM1, XMM7 ;
  2762. ADDPS XMM5, XMM1 ;
  2763. MULPS XMM0, XMM7 ;
  2764. ADDPS XMM6, XMM0 ;
  2765. ADD EBX, 16 ;
  2766. ADD EDI, 16 ;
  2767. SUB EAX, 4 ;
  2768. JMP aligned4 ;
  2769. horizontaladd: ;
  2770. MOVLHPS XMM1, XMM2 ;
  2771. ADDPS XMM1, XMM2 ;
  2772. SHUFPS XMM2, XMM1, 48 ;
  2773. ADDPS XMM2, XMM1 ;
  2774. MOVHLPS XMM2, XMM2 ;
  2775. MOVLHPS XMM1, XMM3 ;
  2776. ADDPS XMM1, XMM3 ;
  2777. SHUFPS XMM3, XMM1, 48 ;
  2778. ADDPS XMM3, XMM1 ;
  2779. MOVHLPS XMM3, XMM3 ;
  2780. MOVLHPS XMM1, XMM4 ;
  2781. ADDPS XMM1, XMM4 ;
  2782. SHUFPS XMM4, XMM1, 48 ;
  2783. ADDPS XMM4, XMM1 ;
  2784. MOVHLPS XMM4, XMM4 ;
  2785. MOVLHPS XMM1, XMM5 ;
  2786. ADDPS XMM1, XMM5 ;
  2787. SHUFPS XMM5, XMM1, 48 ;
  2788. ADDPS XMM5, XMM1 ;
  2789. MOVHLPS XMM5, XMM5 ;
  2790. MOVLHPS XMM1, XMM6 ;
  2791. ADDPS XMM1, XMM6 ;
  2792. SHUFPS XMM6, XMM1, 48 ;
  2793. ADDPS XMM6, XMM1 ;
  2794. MOVHLPS XMM6, XMM6 ;
  2795. singlepieces: ;
  2796. CMP EAX, 0 ;
  2797. JLE store ; len <= 0- > exit
  2798. MOV ESI, [EBP+radr] ;
  2799. MOVSS XMM7, [EBX] ;
  2800. MOVSS XMM0, [ESI+EDI] ;
  2801. ADD ESI, ECX ;
  2802. MOVSS XMM1, [ESI+EDI] ;
  2803. MULSS XMM0, XMM7 ;
  2804. ADDSS XMM2, XMM0 ;
  2805. ADD ESI, ECX ;
  2806. MOVSS XMM0, [ESI+EDI] ;
  2807. MULSS XMM1, XMM7 ;
  2808. ADDSS XMM3, XMM1 ;
  2809. ADD ESI, ECX ;
  2810. MOVSS XMM1, [ESI+EDI] ;
  2811. MULSS XMM0, XMM7 ;
  2812. ADDSS XMM4, XMM0 ;
  2813. ADD ESI, ECX ;
  2814. MOVSS XMM0, [ESI+EDI] ;
  2815. MULSS XMM1, XMM7 ;
  2816. ADDSS XMM5, XMM1 ;
  2817. MULSS XMM0, XMM7 ;
  2818. ADDSS XMM6, XMM0 ;
  2819. ADD EBX, 4 (* INC(ladr,incl) *)
  2820. ADD EDI, 4 (* INC(radr,incr) *)
  2821. DEC EAX ; DEC(len)
  2822. JMP singlepieces ;
  2823. store:
  2824. MOVSS [EDX], XMM2 ;
  2825. ADD EDX, [EBP+incd] ;
  2826. MOVSS [EDX], XMM3 ;
  2827. ADD EDX, [EBP+incd] ;
  2828. MOVSS [EDX], XMM4 ;
  2829. ADD EDX, [EBP+incd] ;
  2830. MOVSS [EDX], XMM5 ;
  2831. ADD EDX, [EBP+incd] ;
  2832. MOVSS [EDX], XMM6 ;
  2833. end:
  2834. END AlignedSPRSSE5;
  2835. *)
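(* AlignedSPRSSE is the single precision analogue of AlignedSPXSSE: it consumes 16, 8 or 4 REALs per iteration and finishes with the SHUFPS based reduction of the four partial sums kept in XMM0 (the immediate encoding is spelled out in the comment inside the code). *)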
  2836. PROCEDURE AlignedSPRSSE( ladr, radr, dadr: ADDRESS; len: SIZE;
  2837. add: BOOLEAN );
  2838. CODE {SYSTEM.i386, SYSTEM.SSE}
  2839. ; register initialization
  2840. MOV EDX, [EBP+dadr] ; EDX reserved FOR dadr
  2841. MOV EBX, [EBP+ladr] ; EBX reserved FOR ladr
  2842. MOV ECX, [EBP+radr] ; ECX reserved FOR radr
2843. MOV EAX, [EBP+len] ; EAX reserved FOR length
  2844. XORPS XMM0, XMM0 ;
  2845. CMP [EBP+add], 0 ; add?
  2846. JE aligned16 ; no add
  2847. MOVSS XMM0, [EDX] ;
  2848. aligned16:
  2849. CMP EAX, 16 ;
2850. JL aligned8 ; len < 16 -> continue at aligned8
  2851. MOVAPS XMM1, [EBX] ;
  2852. MOVAPS XMM4, [ECX] ;
  2853. MOVAPS XMM2, [EBX+16] ;
  2854. MOVAPS XMM5, [ECX+16] ;
  2855. MULPS XMM1, XMM4 ;
  2856. ADDPS XMM0, XMM1 ;
  2857. MOVAPS XMM3, [EBX+32] ;
  2858. MOVAPS XMM6, [ECX+32] ;
  2859. MULPS XMM2, XMM5 ;
  2860. ADDPS XMM0, XMM2 ;
  2861. MOVAPS XMM7, [EBX+48] ;
  2862. MOVAPS XMM1, [ECX+48] ;
  2863. MULPS XMM3, XMM6 ;
  2864. ADDPS XMM0, XMM3 ;
  2865. MULPS XMM1, XMM7 ;
  2866. ADDPS XMM0, XMM1 ;
  2867. ADD EBX, 64 ;
  2868. ADD ECX, 64 ;
  2869. SUB EAX, 16 ;
  2870. JMP aligned16 ;
  2871. ; LOOP FOR 8 pieces aligned
  2872. aligned8:
  2873. CMP EAX, 8 ;
2874. JL aligned4 ; len < 8 -> continue at aligned4
  2875. MOVAPS XMM1, [EBX] ;
  2876. MOVAPS XMM4, [ECX] ;
  2877. MOVAPS XMM2, [EBX+16] ;
  2878. MOVAPS XMM5, [ECX+16] ;
  2879. MULPS XMM1, XMM4 ;
  2880. ADDPS XMM0, XMM1 ;
  2881. MULPS XMM2, XMM5 ;
  2882. ADDPS XMM0, XMM2 ;
  2883. ADD EBX, 32 ;
  2884. ADD ECX, 32 ;
  2885. SUB EAX, 8 ;
  2886. JMP aligned8 ;
  2887. aligned4:
  2888. CMP EAX, 4 ;
2889. JL horizontaladd ; len < 4 -> continue at horizontaladd
  2890. MOVAPS XMM1, [EBX] ;
  2891. MOVAPS XMM2, [ECX] ;
  2892. MULPS XMM1, XMM2 ;
  2893. ADDPS XMM0, XMM1 ;
  2894. ADD EBX, 16 ;
  2895. ADD ECX, 16 ;
  2896. SUB EAX, 4 ;
  2897. JMP aligned4 ;
  2898. horizontaladd: ;
  2899. MOVAPS XMM1, XMM0 ;
  2900. ; 1*0 (* dest 0 -> dest 0 *) + 4*1 (* dest 1 -> dest 1 *) +16*0 (* src 0 -> dest 2 *) +64*1 (* src 1 -> dest 3 *) ;
  2901. SHUFPS XMM1, XMM1, 1*0 +4*1 +16*0 +64*1
  2902. ADDPS XMM1, XMM0 ;
  2903. MOVAPS XMM0, XMM1
2904. SHUFPS XMM0, XMM0, 16*3 ; src 3 -> dest 2
  2905. ADDPS XMM0, XMM1 ;
2906. SHUFPS XMM0, XMM0, 1*2 ; dest 2 -> dest 0
  2907. singlepieces: ;
  2908. CMP EAX, 0 ;
2909. JLE store ; len <= 0 -> EXIT
  2910. MOVSS XMM1, [EBX]
  2911. MOVSS XMM2, [ECX]
  2912. MULSS XMM1, XMM2
  2913. ADDSS XMM0, XMM1
  2914. ADD EBX, 4 ; INC(ladr, incl)
  2915. ADD ECX, 4 ; INC(radr, incr)
  2916. DEC EAX ; DEC(len)
  2917. JMP singlepieces ;
  2918. store:
  2919. MOVSS [EDX], XMM0 ;
  2920. endL:
  2921. END AlignedSPRSSE;
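(* For reference: a minimal scalar sketch (not part of the module) of what AlignedSPRSSE computes,
	assuming contiguous REAL data of length len at ladr and radr:
PROCEDURE ScalarSPR( ladr, radr, dadr: ADDRESS; len: SIZE; add: BOOLEAN );
VAR i: SIZE; x, y, sum: REAL;
BEGIN
	IF add THEN SYSTEM.GET( dadr, sum ) ELSE sum := 0 END;
	FOR i := 0 TO len - 1 DO
		SYSTEM.GET( ladr + i * SIZEOF( REAL ), x );
		SYSTEM.GET( radr + i * SIZEOF( REAL ), y );
		sum := sum + x * y;
	END;
	SYSTEM.PUT( dadr, sum );
END ScalarSPR;
The SSE version above unrolls this loop 16/8/4-fold with packed multiplies and reduces the
four partial sums at the label horizontaladd. *)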
  2922. (*
  2923. (* sse version of scalar product *)
  2924. PROCEDURE AlignedSPRSSE( ladr, radr, dadr, rows, stride, dinc, len: LONGINT );
  2925. CODE {SYSTEM.i386, SYSTEM.SSE2}
  2926. ; register initialization
  2927. MOV EDI, [EBP+radr] ; radr start
  2928. MOV EDX, [EBP+dadr] ; EDX reserved for dadr
  2929. MOV ESI, [EBP+rows] ; outer loop counter
  2930. outerloop:
  2931. CMP ESI, 0 ;
  2932. JLE end ;
  2933. MOV EBX, [EBP+ladr] ; EBX reserved for ladr
  2934. MOV ECX, EDI ; ECX reserved for radr
2935. MOV EAX, [EBP+len] ; EAX reserved for length
  2936. XORPS XMM0, XMM0 ;
  2937. aligned16:
  2938. CMP EAX, 16 ;
2939. JL aligned8 ; len < 16 -> continue at aligned8
  2940. MOVAPS XMM1, [EBX] ;
  2941. MOVAPS XMM2, [EBX+16] ;
  2942. MOVAPS XMM3, [EBX+32] ;
  2943. MOVAPS XMM4, [ECX] ;
  2944. MOVAPS XMM5, [ECX+16] ;
  2945. MOVAPS XMM6, [ECX+32] ;
  2946. MULPS XMM1, XMM4 ;
  2947. ADDPS XMM0, XMM1 ;
  2948. MULPS XMM2, XMM5 ;
  2949. ADDPS XMM0, XMM2 ;
  2950. MULPS XMM3, XMM6 ;
  2951. ADDPS XMM0, XMM3 ;
  2952. MOVAPS XMM7, [EBX+48] ;
  2953. MOVAPS XMM1, [ECX+48] ;
  2954. MULPS XMM1, XMM7 ;
  2955. ADDPS XMM0, XMM1 ;
  2956. ADD EBX, 64 ;
  2957. ADD ECX, 64 ;
  2958. SUB EAX, 16 ;
  2959. JMP aligned16 ;
  2960. ; loop for 8 pieces aligned
  2961. aligned8:
  2962. CMP EAX, 8 ;
2963. JL aligned4 ; len < 8 -> continue at aligned4
  2964. MOVAPS XMM1, [EBX] ;
  2965. MOVAPS XMM2, [EBX+16] ;
  2966. MOVAPS XMM4, [ECX] ;
  2967. MOVAPS XMM5, [ECX+16] ;
  2968. MULPS XMM1, XMM4 ;
  2969. ADDPS XMM0, XMM1 ;
  2970. MULPS XMM2, XMM5 ;
  2971. ADDPS XMM0, XMM2 ;
  2972. ADD EBX, 32 ;
  2973. ADD ECX, 32 ;
  2974. SUB EAX, 8 ;
  2975. JMP aligned8 ;
  2976. aligned4:
  2977. CMP EAX, 4 ;
2978. JL horizontaladd ; len < 4 -> continue at horizontaladd
  2979. MOVAPS XMM1, [EBX] ;
  2980. MOVAPS XMM2, [ECX] ;
  2981. MULPS XMM1, XMM2 ;
  2982. ADDPS XMM0, XMM1 ;
  2983. ADD EBX, 16 ;
  2984. ADD ECX, 16 ;
  2985. SUB EAX, 4 ;
  2986. JMP aligned4 ;
  2987. horizontaladd: ;
  2988. MOVAPS XMM1, XMM0 ;
  2989. SHUFPS XMM1, XMM1, 1*0 (* dest 0 -> dest 0 *) +4*1 (* dest 1 -> dest 1 *) +16*0 (* src 0 -> dest 2 *) +64*1 (* src 1 -> dest 3 *) ;
  2990. ADDPS XMM1, XMM0 ;
  2991. MOVAPS XMM0, XMM1
  2992. SHUFPS XMM0, XMM0, 16*3 ; (* src 3-> dest 2 *)
  2993. ADDPS XMM0, XMM1 ;
  2994. SHUFPS XMM0, XMM0, 1*2 ; (* dest 2 -> dest 0 *)
  2995. singlepieces: ;
  2996. CMP EAX, 0 ;
2997. JLE store ; len <= 0 -> exit
  2998. MOVSS XMM1, [EBX]
  2999. MOVSS XMM2, [ECX]
  3000. MULSS XMM1, XMM2
  3001. ADDSS XMM0, XMM1
  3002. ADD EBX, 4 (* INC(ladr,incl) *)
  3003. ADD ECX, 4 (* INC(radr,incr) *)
  3004. DEC EAX ; DEC(len)
  3005. JMP singlepieces ;
  3006. store:
  3007. MOVSS [EDX], XMM0 ;
  3008. ADD EDX, [EBP+dinc] ;
  3009. ADD EDI, [EBP+stride] ;
  3010. DEC ESI ;
  3011. JMP outerloop ;
  3012. end:
  3013. END AlignedSPRSSE;
  3014. *)
  3015. PROCEDURE Copy4( ladr, dadr: ADDRESS; linc, dinc, len: SIZE);
  3016. CODE {SYSTEM.i386}
3017. MOV ESI, [EBP+ladr] ; ESI := ladr
3018. MOV EDI, [EBP+dadr] ; EDI := dadr
3019. MOV ECX, [EBP+len] ; ECX := len
  3020. MOV EAX, [EBP+linc] ;
  3021. CMP EAX, 4 ;
  3022. JNE loopL ;
  3023. MOV EAX, [EBP+dinc] ;
  3024. CMP EAX, 4 ;
  3025. JNE loopL ;
  3026. fastmove:
  3027. CLD ; incremental
  3028. REP ;
3029. MOVSD ; copy len dwords (4 bytes each) in one go
  3030. JMP endL ;
  3031. loopL:
  3032. CMP ECX, 0 ;
  3033. JLE endL ; WHILE ECX > 0 DO
  3034. MOV EAX, [ESI] ; EAX := SYSTEM.GET32(ESI)
3035. MOV [EDI], EAX ; SYSTEM.PUT32(EDI, EAX)
  3036. ADD ESI, [EBP+linc] ; INC(ESI, linc)
3037. ADD EDI, [EBP+dinc] ; INC(EDI, dinc)
  3038. DEC ECX ; DEC(ECX)
  3039. JMP loopL
  3040. endL:
  3041. END Copy4;
  3042. PROCEDURE Copy8( ladr, dadr: ADDRESS; linc, dinc, len: SIZE );
  3043. CODE {SYSTEM.i386}
3044. MOV ESI, [EBP+ladr] ; ESI := ladr
3045. MOV EDI, [EBP+dadr] ; EDI := dadr
3046. MOV ECX, [EBP+len] ; ECX := len
  3047. MOV EAX, [EBP+linc] ;
  3048. CMP EAX, 8 ;
  3049. JNE loopL ;
  3050. MOV EAX, [EBP+dinc] ;
  3051. CMP EAX, 8 ;
  3052. JNE loopL ;
  3053. fastmove:
3054. SHL ECX, 1 ; ECX := 2*len: each 8-byte element is two dwords for REP MOVSD
  3055. CLD ; incremental
  3056. REP ;
3057. MOVSD ; copy 2*len dwords in one go
  3058. JMP endL ;
  3059. loopL:
  3060. CMP ECX, 0 ;
3061. JLE endL ; WHILE ECX > 0 DO
3062. MOV EAX, [ESI] ; EAX := SYSTEM.GET32(ESI)
3063. MOV [EDI], EAX ; SYSTEM.PUT32(EDI, EAX)
3064. MOV EAX, [ESI+4] ; EAX := SYSTEM.GET32(ESI+4)
3065. MOV [EDI+4], EAX ; SYSTEM.PUT32(EDI+4, EAX)
3066. ADD ESI, [EBP+linc] ; INC(ESI, linc)
3067. ADD EDI, [EBP+dinc] ; INC(EDI, dinc)
3068. DEC ECX ; DEC(ECX)
  3069. JMP loopL
  3070. endL:
  3071. END Copy8;
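(* Note (added, illustration only): when linc = dinc = element size the data is contiguous and
	Copy4/Copy8 take the fastmove path, which is equivalent to
		SYSTEM.MOVE( ladr, dadr, len * 4 );	(* Copy4 *)
		SYSTEM.MOVE( ladr, dadr, len * 8 );	(* Copy8 *)
	since REP MOVSD copies 4 bytes per iteration (hence the SHL ECX, 1 in Copy8). Otherwise the
	generic element-wise loop with the given increments is used. *)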
  3072. PROCEDURE Transpose4A( ladr, dadr: ADDRESS; lstride, linc, dstride, dinc, rows, cols: SIZE );
  3073. CODE {SYSTEM.i386}
  3074. startrows:
  3075. MOV EAX, [EBP+rows] ;
  3076. startouter:
  3077. CMP EAX, 0 ;
  3078. JLE endL ;
  3079. MOV ESI, [EBP+ladr] ;
  3080. MOV EDI, [EBP+dadr] ;
  3081. MOV EBX, [EBP+linc] ;
  3082. MOV ECX, [EBP+dstride] ;
  3083. MOV EAX, [EBP+cols] ;
  3084. startinner:
  3085. CMP EAX, 0 ;
  3086. JLE endinner ;
  3087. MOV EDX, [ESI] ;
  3088. MOV [EDI], EDX ;
  3089. ADD ESI, EBX ;
  3090. ADD EDI, ECX ;
  3091. DEC EAX ;
  3092. JMP startinner ;
  3093. endinner:
  3094. MOV ESI, [EBP+ladr] ;
  3095. ADD ESI, [EBP+lstride] ;
  3096. MOV [EBP+ladr], ESI
  3097. MOV EDI, [EBP+dadr] ;
  3098. ADD EDI, [EBP+dinc] ;
  3099. MOV [EBP+dadr], EDI ;
  3100. MOV EAX, [EBP+rows] ;
  3101. DEC EAX ;
  3102. MOV [EBP+rows], EAX ;
  3103. JMP startouter ;
  3104. endL:
  3105. END Transpose4A;
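(* For reference: a minimal scalar sketch (not part of the module) of the copy performed by
	Transpose4A for 4-byte elements (REAL used here as the payload); parameters as above, all
	increments in bytes:
PROCEDURE Transpose4Ref( ladr, dadr: ADDRESS; lstride, linc, dstride, dinc, rows, cols: SIZE );
VAR i, j: SIZE; v: REAL;
BEGIN
	FOR i := 0 TO rows - 1 DO
		FOR j := 0 TO cols - 1 DO
			SYSTEM.GET( ladr + i * lstride + j * linc, v );	(* element [i, j] of the source *)
			SYSTEM.PUT( dadr + i * dinc + j * dstride, v );	(* stored as element [j, i] of the destination *)
		END;
	END;
END Transpose4Ref;
Transpose8A below does the same for 8-byte elements. *)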
  3106. PROCEDURE Transpose4( ladr, dadr: ADDRESS; lstride, linc, dstride, dinc, rows, cols: SIZE );
  3107. VAR l, d, c: SIZE; BlockSize: SIZE;
  3108. BEGIN
  3109. BlockSize :=
3110. MIN( L2BlockSize DIV lstride, L2BlockSize DIV dstride );
  3111. BlockSize := MIN( BlockSize, L2BlockSize DIV linc );
  3112. BlockSize := MIN( BlockSize, L2BlockSize DIV dinc );
  3113. BlockSize := MAX( 8, BlockSize );
  3114. (* KernelLog.String("Blocksize = "); KernelLog.Int(BlockSize,10); KernelLog.Ln; *)
  3115. WHILE (rows >= BlockSize) DO
  3116. c := cols; l := ladr; d := dadr;
  3117. WHILE (c >= BlockSize) DO
  3118. Transpose4A( l, d, lstride, linc, dstride, dinc, BlockSize,
  3119. BlockSize );
  3120. DEC( c, BlockSize ); INC( l, BlockSize * linc );
  3121. INC( d, BlockSize * dstride );
  3122. END;
  3123. IF c > 0 THEN
  3124. Transpose4A( l, d, lstride, linc, dstride, dinc, BlockSize, c );
  3125. END;
  3126. DEC( rows, BlockSize ); INC( ladr, BlockSize * lstride );
  3127. INC( dadr, BlockSize * dinc );
  3128. END;
  3129. IF (rows > 0) THEN
  3130. c := cols; l := ladr; d := dadr;
  3131. WHILE (c >= BlockSize) DO
  3132. Transpose4A( l, d, lstride, linc, dstride, dinc, rows,
  3133. BlockSize );
  3134. DEC( c, BlockSize ); INC( l, BlockSize * linc );
  3135. INC( d, BlockSize * dstride );
  3136. END;
  3137. IF c > 0 THEN
  3138. Transpose4A( l, d, lstride, linc, dstride, dinc, rows, c );
  3139. END;
  3140. END;
  3141. END Transpose4;
  3142. PROCEDURE Transpose8( ladr, dadr: ADDRESS; lstride, linc, dstride, dinc, rows, cols: SIZE );
  3143. VAR l, d, c: SIZE; BlockSize: SIZE;
  3144. BEGIN
  3145. BlockSize :=
3146. MIN( L2BlockSize DIV lstride, L2BlockSize DIV dstride );
  3147. BlockSize := MIN( BlockSize, L2BlockSize DIV linc );
  3148. BlockSize := MIN( BlockSize, L2BlockSize DIV dinc );
  3149. BlockSize := MAX( 8, BlockSize );
  3150. (* KernelLog.String("Blocksize = "); KernelLog.Int(BlockSize,10); KernelLog.Ln; *)
  3151. WHILE (rows >= BlockSize) DO
  3152. c := cols; l := ladr; d := dadr;
  3153. WHILE (c >= BlockSize) DO
  3154. Transpose8A( l, d, lstride, linc, dstride, dinc, BlockSize,
  3155. BlockSize );
  3156. DEC( c, BlockSize ); INC( l, BlockSize * linc );
  3157. INC( d, BlockSize * dstride );
  3158. END;
  3159. IF c > 0 THEN
  3160. Transpose8A( l, d, lstride, linc, dstride, dinc, BlockSize, c );
  3161. END;
  3162. DEC( rows, BlockSize ); INC( ladr, lstride * BlockSize );
  3163. INC( dadr, dinc * BlockSize );
  3164. END;
  3165. IF (rows > 0) THEN
  3166. c := cols; l := ladr; d := dadr;
  3167. WHILE (c >= BlockSize) DO
  3168. Transpose8A( l, d, lstride, linc, dstride, dinc, rows,
  3169. BlockSize );
  3170. DEC( c, BlockSize ); INC( l, BlockSize * linc );
  3171. INC( d, BlockSize * dstride );
  3172. END;
  3173. IF c > 0 THEN
  3174. Transpose8A( l, d, lstride, linc, dstride, dinc, rows, c );
  3175. END;
  3176. END;
  3177. END Transpose8;
  3178. PROCEDURE Transpose8A( ladr, dadr: ADDRESS; lstride, linc, dstride, dinc, rows, cols: SIZE );
  3179. CODE {SYSTEM.i386}
  3180. startrows:
  3181. MOV EAX, [EBP+rows] ;
  3182. startouter:
  3183. CMP EAX, 0 ;
  3184. JLE endL ;
  3185. MOV ESI, [EBP+ladr] ;
  3186. MOV EDI, [EBP+dadr] ;
  3187. MOV EBX, [EBP+linc] ;
  3188. MOV ECX, [EBP+dstride] ;
  3189. MOV EAX, [EBP+cols] ;
  3190. startinner:
  3191. CMP EAX, 0 ;
  3192. JLE endinner ;
  3193. MOV EDX, [ESI] ;
  3194. MOV [EDI], EDX ;
  3195. MOV EDX, [ESI+4] ;
  3196. MOV [EDI+4], EDX ;
  3197. ADD ESI, EBX ;
  3198. ADD EDI, ECX ;
  3199. DEC EAX ;
  3200. JMP startinner ;
  3201. endinner:
  3202. MOV ESI, [EBP+ladr] ;
  3203. ADD ESI, [EBP+lstride] ;
  3204. MOV [EBP+ladr], ESI
  3205. MOV EDI, [EBP+dadr] ;
  3206. ADD EDI, [EBP+dinc] ;
  3207. MOV [EBP+dadr], EDI ;
  3208. MOV EAX, [EBP+rows] ;
  3209. DEC EAX ;
  3210. MOV [EBP+rows], EAX ;
  3211. JMP startouter ;
  3212. endL:
  3213. END Transpose8A;
  3214. PROCEDURE SSEMul24BlockR( VAR CbFirst: SIZE;
  3215. StrideA, StrideB, StrideC, Ca, Ra, Cb, Rb: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3216. add: BOOLEAN );
  3217. CODE {SYSTEM.i386, SYSTEM.SSE}
  3218. MatrixOfResultsSetup:
  3219. MOV ECX, 0 ; counter FOR rows IN A-Ra
  3220. RowOfResultsLoop:
  3221. MOV EBX, 0 ; counter FOR columns IN B-Cb
  3222. DotProductSetup:
  3223. MOV ESI, [EBP+matrixA] ; matrixA
  3224. MOV EDI, [EBP+matrixB] ; matrixB
  3225. LEA EDI, [EDI+EBX*4] ; current position IN matrixB
  3226. XORPS XMM2, XMM2
  3227. XORPS XMM3, XMM3
  3228. XORPS XMM4, XMM4
  3229. XORPS XMM5, XMM5
  3230. XORPS XMM6, XMM6
  3231. XORPS XMM7, XMM7
  3232. MOV EAX, 0 ;
  3233. MOV AL, [EBP+add] ;
  3234. CMP AL, 0 ; add?
  3235. JE DotProductLoop ;
  3236. MOV EAX, [EBP+matrixC] ; matrixC
  3237. LEA EAX, [EAX+EBX*4] ; adjust POINTER horizontally TO correct batch OF 24
  3238. MOVUPS XMM2, [EAX]
  3239. MOVUPS XMM3, [EAX+16]
  3240. MOVUPS XMM4, [EAX+32]
  3241. MOVUPS XMM5, [EAX+48]
  3242. MOVUPS XMM6, [EAX+64]
  3243. MOVUPS XMM7, [EAX+80]
  3244. MOV EAX, 0
  3245. DotProductLoop:
  3246. MOV EDX, [ESI+EAX*4]
  3247. SHL EDX, 1
  3248. CMP EDX, 0
  3249. JE SparseEntryEscape
  3250. MOVSS XMM0, [ESI+EAX*4]
  3251. SHUFPS XMM0, XMM0, 0H
  3252. MOVUPS XMM1, [EDI]
  3253. MULPS XMM1, XMM0
  3254. ADDPS XMM2, XMM1
  3255. MOVUPS XMM1, [EDI+16]
  3256. MULPS XMM1, XMM0
  3257. ADDPS XMM3, XMM1
  3258. MOVUPS XMM1, [EDI+32]
  3259. MULPS XMM1, XMM0
  3260. ADDPS XMM4, XMM1
  3261. MOVUPS XMM1, [EDI+48]
  3262. MULPS XMM1, XMM0
  3263. ADDPS XMM5, XMM1
  3264. MOVUPS XMM1, [EDI+64]
  3265. MULPS XMM1, XMM0
  3266. ADDPS XMM6, XMM1
  3267. MOVUPS XMM1, [EDI+80]
  3268. MULPS XMM1, XMM0
  3269. ADDPS XMM7, XMM1
  3270. SparseEntryEscape:
  3271. ADD EDI, [EBP+StrideB] ; StrideB
  3272. INC EAX
  3273. CMP EAX, [EBP+Ca] ; Ca, could also compare TO Rb since they must be equal
  3274. JL DotProductLoop
3275. ; end DotProductLoop
  3276. MOV EAX, [EBP+matrixC] ; matrixC
  3277. LEA EAX, [EAX+EBX*4] ; adjust POINTER horizontally TO correct batch OF 24
  3278. MOVUPS [EAX], XMM2
  3279. MOVUPS [EAX+16], XMM3
  3280. MOVUPS [EAX+32], XMM4
  3281. MOVUPS [EAX+48], XMM5
  3282. MOVUPS [EAX+64], XMM6
  3283. MOVUPS [EAX+80], XMM7
  3284. ADD EBX, 24 ; move over TO next batch OF 24
  3285. MOV EDX, EBX
  3286. ADD EDX, 24
  3287. CMP EDX, [EBP+Cb] ; Cb, check TO see IF row IS complete
  3288. JLE DotProductSetup
  3289. ; endL RowOfResultsLoop
  3290. MOV EAX, [EBP+matrixA] ; matrixA
  3291. ADD EAX, [EBP+StrideA] ; StrideA
  3292. MOV [EBP+matrixA], EAX ; matrixA
  3293. MOV EAX, [EBP+matrixC] ; matrixC
  3294. ADD EAX, [EBP+StrideC] ; StrideC
  3295. MOV [EBP+matrixC], EAX ; matrixC
  3296. INC ECX
  3297. CMP ECX, [EBP+Ra] ; Ra
  3298. JL RowOfResultsLoop
  3299. Done:
  3300. MOV EAX, [EBP+CbFirst] ; CbFirst
  3301. MOV [EAX], EBX ;
  3302. END SSEMul24BlockR;
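(* For reference: a minimal scalar sketch (not part of the module) of the update performed by
	SSEMul24BlockR; strides are in bytes, element size 4 (REAL). The assembler kernel only handles
	full 24-column batches of B/C and returns the first unprocessed column in CbFirst; the
	SparseEntryEscape test skips a term whenever A[i, k] is +0 or -0, i.e. when its raw bit pattern
	with the sign bit masked off (e.g. SYSTEM.VAL( SET, bits ) - {31}) is empty.
PROCEDURE MulBlockRRef( matrixA, matrixB, matrixC: ADDRESS;
	StrideA, StrideB, StrideC, Ca, Ra, Cb: SIZE; add: BOOLEAN );
VAR i, j, k: SIZE; a, b, c: REAL;
BEGIN
	FOR i := 0 TO Ra - 1 DO
		FOR j := 0 TO Cb - 1 DO
			IF add THEN SYSTEM.GET( matrixC + i * StrideC + j * 4, c ) ELSE c := 0 END;
			FOR k := 0 TO Ca - 1 DO
				SYSTEM.GET( matrixA + i * StrideA + k * 4, a );
				SYSTEM.GET( matrixB + k * StrideB + j * 4, b );
				c := c + a * b;
			END;
			SYSTEM.PUT( matrixC + i * StrideC + j * 4, c );
		END;
	END;
END MulBlockRRef;
*)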
3303. (*! might be better to make a 10Block operation and utilize 2 registers for temporary calculations, see the article about Emmerald *)
  3304. PROCEDURE SSEMul12BlockX( VAR CbFirst: SIZE;
  3305. StrideA, StrideB, StrideC, Ca, Ra, Cb, Rb: SIZE; matrixA, matrixB, matrixC :ADDRESS;
  3306. add: BOOLEAN );
  3307. CODE {SYSTEM.i386, SYSTEM.SSE2}
  3308. MatrixOfResultsSetup:
  3309. MOV ECX, 0 ; counter FOR rows IN A-Ra
  3310. RowOfResultsLoop:
  3311. MOV EBX, 0 ; counter FOR columns IN B-Cb
  3312. DotProductSetup:
  3313. MOV ESI, [EBP+matrixA] ; matrixA
  3314. MOV EDI, [EBP+matrixB] ; matrixB
  3315. LEA EDI, [EDI+EBX*8]
  3316. XORPD XMM2, XMM2
  3317. XORPD XMM3, XMM3
  3318. XORPD XMM4, XMM4
  3319. XORPD XMM5, XMM5
  3320. XORPD XMM6, XMM6
  3321. XORPD XMM7, XMM7
  3322. MOV EAX, 0 ;
  3323. MOV AL, [EBP+add] ;
  3324. CMP AL, 0 ; add?
  3325. JE DotProductLoop ;
  3326. MOV EAX, [EBP+matrixC] ; matrixC
  3327. LEA EAX, [EAX+EBX*8] ; adjust POINTER horizontally TO correct batch OF 12
  3328. MOVUPD XMM2, [EAX]
  3329. MOVUPD XMM3, [EAX+16]
  3330. MOVUPD XMM4, [EAX+32]
  3331. MOVUPD XMM5, [EAX+48]
  3332. MOVUPD XMM6, [EAX+64]
  3333. MOVUPD XMM7, [EAX+80]
  3334. MOV EAX, 0
  3335. DotProductLoop:
  3336. ; MOV EDX, [ESI+EAX*8]
  3337. ; SHL EDX, 1
  3338. ; CMP EDX, 0
  3339. ; JE SparseEntryEscape
  3340. MOVSD XMM0, [ESI+EAX*8]
  3341. SHUFPD XMM0, XMM0, 0H
  3342. MOVUPD XMM1, [EDI]
  3343. MULPD XMM1, XMM0
  3344. ADDPD XMM2, XMM1
  3345. MOVUPD XMM1, [EDI+16]
  3346. MULPD XMM1, XMM0
  3347. ADDPD XMM3, XMM1
  3348. MOVUPD XMM1, [EDI+32]
  3349. MULPD XMM1, XMM0
  3350. ADDPD XMM4, XMM1
  3351. MOVUPD XMM1, [EDI+48]
  3352. MULPD XMM1, XMM0
  3353. ADDPD XMM5, XMM1
  3354. MOVUPD XMM1, [EDI+64]
  3355. MULPD XMM1, XMM0
  3356. ADDPD XMM6, XMM1
  3357. MOVUPD XMM1, [EDI+80]
  3358. MULPD XMM1, XMM0
  3359. ADDPD XMM7, XMM1
  3360. SparseEntryEscape:
  3361. ADD EDI, [EBP+StrideB] ; StrideB
  3362. INC EAX
  3363. CMP EAX, [EBP+Ca] ; Ca, could also compare TO Rb since they must be equal
3364. JL DotProductLoop ; end DotProductLoop
  3365. MOV EAX , [EBP+matrixC] ; matrixC
3366. LEA EAX, [EAX+EBX*8] ; adjust POINTER horizontally TO correct batch OF 12
  3367. MOVUPD [EAX], XMM2
  3368. MOVUPD [EAX+16], XMM3
  3369. MOVUPD [EAX+32], XMM4
  3370. MOVUPD [EAX+48], XMM5
  3371. MOVUPD [EAX+64], XMM6
  3372. MOVUPD [EAX+80], XMM7
  3373. ADD EBX, 12 ; move over TO next batch OF 12
  3374. MOV EDX, EBX
  3375. ADD EDX, 12
  3376. CMP EDX, [EBP+Cb] ; Cb, check TO see IF row IS complete
  3377. JLE DotProductSetup ; end RowOfResultsLoop
  3378. MOV EAX , [EBP+matrixA] ; matrixA
  3379. ADD EAX, [EBP+StrideA] ; StrideA
  3380. MOV [EBP+matrixA], EAX ; matrixA
  3381. MOV EAX, [EBP+matrixC] ; matrixC
  3382. ADD EAX, [EBP+StrideC] ; StrideC
  3383. MOV [EBP+matrixC], EAX ; matrixC
  3384. INC ECX
  3385. CMP ECX, [EBP+Ra] ; Ra
  3386. JL RowOfResultsLoop
  3387. Done:
  3388. MOV EAX, [EBP+CbFirst] ; CbFirst
  3389. MOV [EAX], EBX ;
  3390. END SSEMul12BlockX;
  3391. PROCEDURE SSEMul16BlockR( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3392. add: BOOLEAN );
  3393. CODE {SYSTEM.i386, SYSTEM.SSE}
  3394. MOV ECX, 0 ; counter FOR rows IN A-Ra
  3395. DotProductSetup:
  3396. MOV ESI, [EBP+matrixA] ; matrixA
  3397. MOV EDI, [EBP+matrixB] ; matrixB
  3398. MOV EDX, [EBP+CbFrom] ; CbFrom
  3399. LEA EDI, [EDI+EDX*4]
  3400. XORPS XMM2, XMM2
  3401. XORPS XMM3, XMM3
  3402. XORPS XMM4, XMM4
  3403. XORPS XMM5, XMM5
  3404. MOV EAX, 0 ;
  3405. MOV AL, [EBP+add] ;
  3406. CMP AL, 0 ; add?
  3407. JE DotProductLoop ;
  3408. MOV EAX, [EBP+matrixC] ; matrixC
  3409. LEA EAX, [EAX+EDX*4] ; adjust POINTER horizontally
  3410. MOVUPS XMM2, [EAX]
  3411. MOVUPS XMM3, [EAX+16]
  3412. MOVUPS XMM4, [EAX+32]
  3413. MOVUPS XMM5, [EAX+48]
  3414. MOV EAX, 0
  3415. DotProductLoop:
  3416. MOV EDX, [ESI+EAX*4]
  3417. SHL EDX, 1
  3418. CMP EDX, 0
  3419. JE SparseEntryEscape
  3420. MOVSS XMM0, [ESI+EAX*4]
  3421. SHUFPS XMM0, XMM0, 0H
  3422. MOVUPS XMM1, [EDI]
  3423. MULPS XMM1, XMM0
  3424. ADDPS XMM2, XMM1
  3425. MOVUPS XMM1, [EDI+16]
  3426. MULPS XMM1, XMM0
  3427. ADDPS XMM3, XMM1
  3428. MOVUPS XMM1, [EDI+32]
  3429. MULPS XMM1, XMM0
  3430. ADDPS XMM4, XMM1
  3431. MOVUPS XMM1, [EDI+48]
  3432. MULPS XMM1, XMM0
  3433. ADDPS XMM5, XMM1
  3434. SparseEntryEscape:
  3435. ADD EDI, [EBP+StrideB] ; StrideB
  3436. INC EAX
  3437. CMP EAX, [EBP+Ca] ; Ca
  3438. JL DotProductLoop ; end DotProductLoop
  3439. MOV EAX , [EBP+matrixC] ; matrixC
3440. MOV EDX, [EBP+CbFrom] ; CbFrom
3441. LEA EAX, [EAX+EDX*4] ; adjust POINTER horizontally TO correct batch OF 16
  3442. MOVUPS [EAX], XMM2
  3443. MOVUPS [EAX+16], XMM3
  3444. MOVUPS [EAX+32], XMM4
  3445. MOVUPS [EAX+48], XMM5
  3446. MOV EAX, [EBP+matrixA] ; matrixA
  3447. ADD EAX, [EBP+StrideA] ; StrideA
  3448. MOV [EBP+matrixA], EAX ; matrixA
  3449. MOV EAX, [EBP+matrixC] ; matrixC
  3450. ADD EAX, [EBP+StrideC] ; StrideC
  3451. MOV [EBP+matrixC], EAX ; matrixC
  3452. INC ECX
  3453. CMP ECX, [EBP+Ra] ; Ra
  3454. JL DotProductSetup ;
  3455. END SSEMul16BlockR;
  3456. PROCEDURE SSEMul8BlockX( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3457. add: BOOLEAN );
  3458. CODE {SYSTEM.i386, SYSTEM.SSE2}
  3459. MOV ECX, 0 ; counter FOR rows IN A-Ra
  3460. DotProductSetup:
  3461. MOV ESI, [EBP+matrixA] ; matrixA
  3462. MOV EDI, [EBP+matrixB] ; matrixB
  3463. MOV EDX, [EBP+CbFrom] ; CbFrom
  3464. LEA EDI, [EDI+EDX*8]
  3465. XORPD XMM2, XMM2
  3466. XORPD XMM3, XMM3
  3467. XORPD XMM4, XMM4
  3468. XORPD XMM5, XMM5
  3469. MOV EAX, 0 ;
  3470. MOV AL, [EBP+add] ;
  3471. CMP AL, 0 ; add?
  3472. JE DotProductLoop ;
  3473. MOV EAX, [EBP+matrixC] ; matrixC
3474. LEA EAX, [EAX+EDX*8] ; adjust POINTER horizontally TO correct batch OF 8
  3475. MOVUPD XMM2, [EAX]
  3476. MOVUPD XMM3, [EAX+16]
  3477. MOVUPD XMM4, [EAX+32]
  3478. MOVUPD XMM5, [EAX+48]
  3479. MOV EAX, 0
  3480. DotProductLoop:
  3481. ; MOV EDX, [ESI+EAX*8]
  3482. ; SHL EDX, 1
  3483. ; CMP EDX, 0
  3484. ; JE SparseEntryEscape
  3485. MOVSD XMM0, [ESI+EAX*8]
  3486. SHUFPD XMM0, XMM0, 0H
  3487. MOVUPD XMM1, [EDI]
  3488. MULPD XMM1, XMM0
  3489. ADDPD XMM2, XMM1
  3490. MOVUPD XMM1, [EDI+16]
  3491. MULPD XMM1, XMM0
  3492. ADDPD XMM3, XMM1
  3493. MOVUPD XMM1, [EDI+32]
  3494. MULPD XMM1, XMM0
  3495. ADDPD XMM4, XMM1
  3496. MOVUPD XMM1, [EDI+48]
  3497. MULPD XMM1, XMM0
  3498. ADDPD XMM5, XMM1
  3499. SparseEntryEscape:
  3500. ADD EDI, [EBP+StrideB] ; StrideB
  3501. INC EAX
  3502. CMP EAX, [EBP+Ca] ; Ca
  3503. JL DotProductLoop ; end DotProductLoop
  3504. MOV EAX , [EBP+matrixC] ; matrixC
3505. MOV EDX, [EBP+CbFrom] ; CbFrom
3506. LEA EAX, [EAX+EDX*8] ; adjust POINTER horizontally TO correct batch OF 8
  3507. MOVUPD [EAX], XMM2
  3508. MOVUPD [EAX+16], XMM3
  3509. MOVUPD [EAX+32], XMM4
  3510. MOVUPD [EAX+48], XMM5
  3511. MOV EAX, [EBP+matrixA] ; matrixA
  3512. ADD EAX, [EBP+StrideA] ; StrideA
  3513. MOV [EBP+matrixA], EAX ; matrixA
  3514. MOV EAX, [EBP+matrixC] ; matrixC
  3515. ADD EAX, [EBP+StrideC] ; StrideC
  3516. MOV [EBP+matrixC], EAX ; matrixC
  3517. INC ECX
  3518. CMP ECX, [EBP+Ra] ; Ra
  3519. JL DotProductSetup ;
  3520. END SSEMul8BlockX;
  3521. PROCEDURE SSEMul8BlockR( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3522. add: BOOLEAN );
  3523. CODE {SYSTEM.i386, SYSTEM.SSE}
  3524. MOV ECX, 0 ; counter FOR rows IN A-Ra
  3525. DotProductSetup:
  3526. MOV ESI, [EBP+matrixA] ; matrixA
  3527. MOV EDI, [EBP+matrixB] ; matrixB
  3528. MOV EDX, [EBP+CbFrom] ; CbFrom
  3529. LEA EDI, [EDI+EDX*4]
  3530. XORPS XMM2, XMM2
  3531. XORPS XMM3, XMM3
  3532. MOV EAX, 0 ;
  3533. MOV AL, [EBP+add] ;
  3534. CMP AL, 0 ; add?
  3535. JE DotProductLoop ;
  3536. MOV EAX, [EBP+matrixC] ; matrixC
  3537. LEA EAX, [EAX+EDX*4] ; adjust POINTER horizontally TO correct batch OF 8
  3538. MOVUPS XMM2, [EAX]
  3539. MOVUPS XMM3, [EAX+16]
  3540. MOV EAX, 0
  3541. DotProductLoop:
  3542. MOV EDX, [ESI+EAX*4]
  3543. SHL EDX, 1
  3544. CMP EDX, 0
  3545. JE SparseEntryEscape
  3546. MOVSS XMM0, [ESI+EAX*4]
  3547. SHUFPS XMM0, XMM0, 0H
  3548. MOVUPS XMM1, [EDI]
  3549. MULPS XMM1, XMM0
  3550. ADDPS XMM2, XMM1
  3551. MOVUPS XMM1, [EDI+16]
  3552. MULPS XMM1, XMM0
  3553. ADDPS XMM3, XMM1
  3554. SparseEntryEscape:
  3555. ADD EDI, [EBP+StrideB] ; StrideB
  3556. INC EAX
  3557. CMP EAX, [EBP+Ca] ; Ca
  3558. JL DotProductLoop ; end DotProductLoop
  3559. MOV EAX , [EBP+matrixC] ; matrixC
  3560. MOV EDX, [EBP+CbFrom] ; CbFrom
  3561. LEA EAX, [EAX+EDX*4] ; adjust POINTER horizontally TO correct batch OF 8
  3562. MOVUPS [EAX], XMM2
  3563. MOVUPS [EAX+16], XMM3
  3564. MOV EAX, [EBP+matrixA] ; matrixA
  3565. ADD EAX, [EBP+StrideA] ; StrideA
  3566. MOV [EBP+matrixA], EAX ; matrixA
  3567. MOV EAX, [EBP+matrixC] ; matrixC
  3568. ADD EAX, [EBP+StrideC] ; StrideC
  3569. MOV [EBP+matrixC], EAX ; matrixC
  3570. INC ECX
  3571. CMP ECX, [EBP+Ra] ; Ra
  3572. JL DotProductSetup ;
  3573. END SSEMul8BlockR;
  3574. PROCEDURE SSEMul4BlockX( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3575. add: BOOLEAN );
  3576. CODE {SYSTEM.i386, SYSTEM.SSE2}
  3577. MOV ECX, 0 ; counter FOR rows IN A-Ra
  3578. DotProductSetup:
  3579. MOV EAX, 0 ; cols IN A
  3580. MOV ESI, [EBP+matrixA] ; matrixA
  3581. MOV EDI, [EBP+matrixB] ; matrixB
  3582. MOV EDX, [EBP+CbFrom] ; CbFrom
  3583. LEA EDI, [EDI+EDX*8]
  3584. XORPS XMM2, XMM2
  3585. XORPS XMM3, XMM3
  3586. MOV EAX, 0 ;
  3587. MOV AL, [EBP+add] ;
  3588. CMP AL, 0 ; add?
  3589. JE DotProductLoop ;
  3590. MOV EAX, [EBP+matrixC] ; matrixC
3591. LEA EAX, [EAX+EDX*8] ; adjust POINTER horizontally TO correct batch OF 4
  3592. MOVUPD XMM2, [EAX]
  3593. MOVUPD XMM3, [EAX+16]
  3594. MOV EAX, 0
  3595. DotProductLoop:
  3596. ; MOV EDX, [ESI+EAX*8]
  3597. ; SHL EDX, 1
  3598. ; CMP EDX, 0
  3599. ; JE SparseEntryEscape
  3600. MOVSD XMM0, [ESI+EAX*8]
  3601. SHUFPD XMM0, XMM0, 0H
  3602. MOVUPD XMM1, [EDI]
  3603. MULPD XMM1, XMM0
  3604. ADDPD XMM2, XMM1
  3605. MOVUPD XMM1, [EDI+16]
  3606. MULPD XMM1, XMM0
  3607. ADDPD XMM3, XMM1
  3608. SparseEntryEscape:
  3609. ADD EDI, [EBP+StrideB] ; StrideB
  3610. INC EAX
  3611. CMP EAX, [EBP+Ca] ; Ca
  3612. JL DotProductLoop ; end DotProductLoop
  3613. MOV EAX , [EBP+matrixC] ; matrixC
  3614. MOV EDX, [EBP+CbFrom] ; CbFrom
3615. LEA EAX, [EAX+EDX*8] ; adjust POINTER horizontally TO correct batch OF 4
  3616. MOVUPD [EAX], XMM2
  3617. MOVUPD [EAX+16], XMM3
  3618. MOV EAX, [EBP+matrixA] ; matrixA
  3619. ADD EAX, [EBP+StrideA] ; StrideA
  3620. MOV [EBP+matrixA], EAX ; matrixA
  3621. MOV EAX, [EBP+matrixC] ; matrixC
  3622. ADD EAX, [EBP+StrideC] ; StrideC
  3623. MOV [EBP+matrixC], EAX ; matrixC
  3624. INC ECX
  3625. CMP ECX, [EBP+Ra] ; Ra
  3626. JL DotProductSetup ;
  3627. END SSEMul4BlockX;
  3628. PROCEDURE SSEMul4BlockR( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3629. add: BOOLEAN );
  3630. CODE {SYSTEM.i386, SYSTEM.SSE}
  3631. MOV ECX, 0 ; counter FOR rows IN A-Ra
  3632. DotProductSetup:
  3633. MOV EAX, 0 ; cols IN A
  3634. MOV ESI, [EBP+matrixA] ; matrixA
  3635. MOV EDI, [EBP+matrixB] ; matrixB
  3636. MOV EDX, [EBP+CbFrom] ; CbFrom
  3637. LEA EDI, [EDI+EDX*4]
  3638. XORPS XMM2, XMM2
  3639. MOV EAX, 0 ;
  3640. MOV AL, [EBP+add] ;
  3641. CMP AL, 0 ; add?
  3642. JE DotProductLoop ;
  3643. MOV EAX, [EBP+matrixC] ; matrixC
  3644. LEA EAX, [EAX+EDX*4] ; adjust POINTER horizontally TO correct batch OF 4
  3645. MOVUPS XMM2, [EAX]
  3646. MOV EAX, 0
  3647. DotProductLoop:
  3648. MOV EDX, [ESI+EAX*4]
  3649. SHL EDX, 1
  3650. CMP EDX, 0
  3651. JE SparseEntryEscape
  3652. MOVSS XMM0, [ESI+EAX*4]
  3653. SHUFPS XMM0, XMM0, 0H
  3654. MOVUPS XMM1, [EDI]
  3655. MULPS XMM1, XMM0
  3656. ADDPS XMM2, XMM1
  3657. SparseEntryEscape:
  3658. ADD EDI, [EBP+StrideB] ; StrideB
  3659. INC EAX
  3660. CMP EAX, [EBP+Ca] ; Ca
3661. JL DotProductLoop ; end DotProductLoop
  3662. MOV EAX, [EBP+matrixC] ; matrixC
  3663. MOV EDX, [EBP+CbFrom] ; CbFrom
  3664. LEA EAX, [EAX+EDX*4] ; adjust POINTER horizontally TO correct batch OF 4
  3665. MOVUPS [EAX], XMM2
  3666. MOV EAX, [EBP+matrixA] ; matrixA
  3667. ADD EAX, [EBP+StrideA] ; StrideA
  3668. MOV [EBP+matrixA], EAX ; matrixA
  3669. MOV EAX, [EBP+matrixC] ; matrixC
  3670. ADD EAX, [EBP+StrideC] ; StrideC
  3671. MOV [EBP+matrixC], EAX ; matrixC
  3672. INC ECX
  3673. CMP ECX, [EBP+Ra] ; Ra
  3674. JL DotProductSetup ;
  3675. END SSEMul4BlockR;
  3676. PROCEDURE SSEMul2BlockX( StrideA, StrideB, StrideC, Ca, Ra, CbFrom: SIZE; matrixA, matrixB, matrixC: ADDRESS;
  3677. add: BOOLEAN );
  3678. CODE {SYSTEM.i386, SYSTEM.SSE2}
  3679. MOV ECX, 0 ; counter FOR rows IN A-Ra
  3680. DotProductSetup:
  3681. MOV EAX, 0 ; cols IN A
  3682. MOV ESI, [EBP+matrixA] ; matrixA
  3683. MOV EDI, [EBP+matrixB] ; matrixB
  3684. MOV EDX, [EBP+CbFrom] ; CbFrom
  3685. LEA EDI, [EDI+EDX*8]
  3686. XORPD XMM2, XMM2
  3687. MOV EAX, 0 ;
  3688. MOV AL, [EBP+add] ;
  3689. CMP AL, 0 ; add?
  3690. JE DotProductLoop ;
  3691. MOV EAX, [EBP+matrixC] ; matrixC
3692. LEA EAX, [EAX+EDX*8] ; adjust POINTER horizontally TO correct batch OF 2
  3693. MOVUPD XMM2, [EAX]
  3694. MOV EAX, 0
  3695. DotProductLoop:
  3696. ; MOV EDX, [ESI+EAX*4] ;
  3697. ; SHL EDX, 1 ;
  3698. ; CMP EDX, 0
  3699. ; JE SparseEntryEscape
  3700. MOVSD XMM0, [ESI+EAX*8]
  3701. SHUFPD XMM0, XMM0, 0H
  3702. MOVUPD XMM1, [EDI]
  3703. MULPD XMM1, XMM0
  3704. ADDPD XMM2, XMM1
  3705. SparseEntryEscape:
  3706. ADD EDI, [EBP+StrideB] ; StrideB
  3707. INC EAX
  3708. CMP EAX, [EBP+Ca] ; Ca
  3709. JL DotProductLoop ; end DotProductLoop
  3710. MOV EAX , [EBP+matrixC] ; matrixC
  3711. MOV EDX, [EBP+CbFrom] ; CbFrom
3712. LEA EAX, [EAX+EDX*8] ; adjust POINTER horizontally TO correct batch OF 2
  3713. MOVUPD [EAX], XMM2
  3714. MOV EAX, [EBP+matrixA] ; matrixA
  3715. ADD EAX, [EBP+StrideA] ; StrideA
  3716. MOV [EBP+matrixA], EAX ; matrixA
  3717. MOV EAX, [EBP+matrixC] ; matrixC
  3718. ADD EAX, [EBP+StrideC] ; StrideC
  3719. MOV [EBP+matrixC], EAX ; matrixC
  3720. INC ECX
  3721. CMP ECX, [EBP+Ra] ; Ra
  3722. JL DotProductSetup ;
  3723. END SSEMul2BlockX;
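(* Overview (added note): the SSEMulNBlock kernels above differ only in element type and batch
	width. The ...R variants work on REAL data (MOVUPS/MULPS/ADDPS), the ...X variants on LONGREAL
	data (MOVUPD/MULPD/ADDPD). The batch width equals the number of XMM accumulators times the
	number of elements per register: 24/16/8/4 columns per pass for R (6/4/2/1 registers) and
	12/8/4/2 for X. SSEMul24BlockR and SSEMul12BlockX additionally report the first unprocessed
	column in CbFirst so the caller can finish the remaining columns with a narrower kernel. *)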
  3724. (****** blocking matrix multiplication with copy of data ******)
  3725. PROCEDURE MagicBlockR( M, N, K: SIZE;
  3726. VAR L2BlockM, L2BlockN, L2BlockK: SIZE );
  3727. BEGIN
  3728. K := (K DIV L0BlockKR) * L0BlockKR;
  3729. N := (N DIV L1BlockN) * L1BlockN;
  3730. IF M = 0 THEN M := 1 END;
  3731. IF N = 0 THEN N := 1 END;
  3732. IF K = 0 THEN K := 1 END;
  3733. L2BlockK :=
  3734. K DIV ((K + L1MaxBlockKR - 1) DIV L1MaxBlockKR);
  3735. (* Round up to next multiple of 16 *)
  3736. L2BlockK := L2BlockK + (-L2BlockK) MOD 16;
  3737. L2BlockN :=
  3738. L2BlockSize DIV SIZEOF( REAL ) DIV
  3739. (L2BlockK * (L2BARatio + 1));
  3740. IF L2BlockN > N THEN L2BlockN := N
  3741. ELSIF L2BlockN < 1 THEN L2BlockN := 1;
  3742. END;
  3743. L2BlockM :=
  3744. (L2BlockSize DIV SIZEOF( REAL ) - L2BlockN * L2BlockK) DIV
  3745. L2BlockK;
3746. (* clamp L2BlockM to the range [1..M] *)
  3747. IF L2BlockM > M THEN L2BlockM := M
  3748. ELSIF L2BlockM < 1 THEN L2BlockM := 1
  3749. END;
  3750. L2BlockN := L2BlockN + (-L2BlockN) MOD L1BlockN;
  3751. END MagicBlockR;
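(* Worked example (illustration only; the actual constants are declared at the top of the module
	and may differ): with M = N = K = 1000, L0BlockKR = 4, L1BlockN = 5, L1MaxBlockKR = 256,
	L2BlockSize = 512 * 1024 and L2BARatio = 1:
	K stays 1000, N stays 1000;
	L2BlockK = 1000 DIV ((1000 + 255) DIV 256) = 250, rounded up to a multiple of 16 = 256;
	L2BlockN = 512 * 1024 DIV 4 DIV (256 * 2) = 256;
	L2BlockM = (512 * 1024 DIV 4 - 256 * 256) DIV 256 = 256;
	finally L2BlockN is rounded up to a multiple of L1BlockN: 256 + (-256) MOD 5 = 260. *)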
  3752. PROCEDURE MagicBlockX( M, N, K: SIZE;
  3753. VAR L2BlockM, L2BlockN, L2BlockK:SIZE );
  3754. BEGIN
  3755. K := (K DIV L0BlockKX) * L0BlockKX;
  3756. N := (N DIV L1BlockN) * L1BlockN;
  3757. IF M = 0 THEN M := 1 END;
  3758. IF N = 0 THEN N := 1 END;
  3759. IF K = 0 THEN K := 1 END;
  3760. L2BlockK :=
  3761. K DIV ((K + L1MaxBlockKX - 1) DIV L1MaxBlockKX);
  3762. (* Round up to next multiple of 16 *)
  3763. L2BlockK := L2BlockK + (-L2BlockK) MOD 16;
  3764. L2BlockN :=
  3765. L2BlockSize DIV SIZEOF( LONGREAL ) DIV
  3766. (L2BlockK * (L2BARatio + 1));
  3767. IF L2BlockN > N THEN L2BlockN := N END;
  3768. L2BlockM :=
  3769. (L2BlockSize DIV SIZEOF( LONGREAL ) - L2BlockN * L2BlockK) DIV
  3770. L2BlockK;
3771. (* clamp L2BlockM to the range [1..M] *)
  3772. IF L2BlockM > M THEN L2BlockM := M
  3773. ELSIF L2BlockM < 1 THEN L2BlockM := 1
  3774. END;
  3775. L2BlockN := L2BlockN + (-L2BlockN) MOD L1BlockN;
  3776. END MagicBlockX;
  3777. (*
  3778. PROCEDURE L1Block1X( adrA, adrB, adrC, K: LONGINT ); (* condition: K MOD 4 = 0 *)
  3779. VAR reg: ARRAY 8 OF ARRAY 2 OF LONGREAL;
  3780. PROCEDURE null( i: LONGINT );
  3781. BEGIN
  3782. reg[i, 0] := 0; reg[i, 1] := 0;
  3783. END null;
  3784. PROCEDURE get1( adr, i: LONGINT );
  3785. BEGIN
  3786. SYSTEM.GET( adr, reg[i, 0] );
  3787. IF debug THEN KernelLog.String( "get 1: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 ); KernelLog.Ln; END;
  3788. END get1;
  3789. PROCEDURE get2( adr, i: LONGINT );
  3790. BEGIN
  3791. SYSTEM.GET( adr, reg[i, 0] ); SYSTEM.GET( adr + 8, reg[i, 1] );
  3792. IF debug THEN
  3793. KernelLog.String( "get 2: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 );
  3794. KernelLog.Int( ENTIER( reg[i, 1] + 0.5 ), 5 ); KernelLog.Ln;
  3795. END;
  3796. END get2;
  3797. PROCEDURE mul2( i, j: LONGINT );
  3798. BEGIN
  3799. reg[i, 0] := reg[i, 0] * reg[j, 0]; reg[i, 1] := reg[i, 1] * reg[j, 1];
  3800. END mul2;
  3801. PROCEDURE add2( i, j: LONGINT );
  3802. BEGIN
  3803. reg[i, 0] := reg[i, 0] + reg[j, 0]; reg[i, 1] := reg[i, 1] + reg[j, 1];
  3804. END add2;
  3805. PROCEDURE put1( adr, i: LONGINT );
  3806. BEGIN
  3807. SYSTEM.PUT( adr, reg[i, 0] );
  3808. END put1;
  3809. PROCEDURE horadd( i: LONGINT );
  3810. BEGIN
  3811. reg[i, 0] := reg[i, 0] + reg[i, 1];
  3812. END horadd;
  3813. BEGIN
  3814. IF debug THEN KernelLog.String( "L1Block1" ); KernelLog.Ln; END;
  3815. null( 2 ); get1( adrC, 2 );
  3816. WHILE (K > 0) DO (* padding guaranteed *)
  3817. get2( adrA, 7 ); get2( adrB, 0 ); mul2( 0, 7 ); add2( 2, 0 ); INC( adrB, 16 );
  3818. INC( adrA, 16 ); DEC( K, 2 );
  3819. END;
  3820. horadd( 2 ); put1( adrC, 2 );
  3821. END L1Block1X;
  3822. PROCEDURE L1Block5X( adrA, adrB, adrC, IncC, K: LONGINT ); (* condition: K MOD 4 = 0 *)
  3823. VAR reg: ARRAY 8 OF ARRAY 2 OF LONGREAL;
  3824. PROCEDURE null( i: LONGINT );
  3825. BEGIN
  3826. reg[i, 0] := 0; reg[i, 1] := 0;
  3827. END null;
  3828. PROCEDURE get1( adr, i: LONGINT );
  3829. BEGIN
  3830. SYSTEM.GET( adr, reg[i, 0] );
  3831. IF debug THEN KernelLog.String( "get 1: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 ); KernelLog.Ln; END;
  3832. END get1;
  3833. PROCEDURE get2( adr, i: LONGINT );
  3834. BEGIN
  3835. SYSTEM.GET( adr, reg[i, 0] ); SYSTEM.GET( adr + 8, reg[i, 1] );
  3836. IF debug THEN
  3837. KernelLog.String( "get 2: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 );
  3838. KernelLog.Int( ENTIER( reg[i, 1] + 0.5 ), 5 ); KernelLog.Ln;
  3839. END;
  3840. END get2;
  3841. PROCEDURE mul2( i, j: LONGINT );
  3842. BEGIN
  3843. reg[i, 0] := reg[i, 0] * reg[j, 0]; reg[i, 1] := reg[i, 1] * reg[j, 1];
  3844. END mul2;
  3845. PROCEDURE add2( i, j: LONGINT );
  3846. BEGIN
  3847. reg[i, 0] := reg[i, 0] + reg[j, 0]; reg[i, 1] := reg[i, 1] + reg[j, 1];
  3848. END add2;
  3849. PROCEDURE put1( adr, i: LONGINT );
  3850. BEGIN
  3851. SYSTEM.PUT( adr, reg[i, 0] );
  3852. END put1;
  3853. PROCEDURE horadd( i: LONGINT );
  3854. BEGIN
  3855. reg[i, 0] := reg[i, 0] + reg[i, 1];
  3856. END horadd;
  3857. BEGIN
  3858. IF debug THEN KernelLog.String( "L1Block5" ); KernelLog.Ln; END;
  3859. null( 2 ); null( 3 ); null( 4 ); null( 5 ); null( 6 ); get1( adrC, 2 );
  3860. get1( adrC + IncC, 3 ); get1( adrC + 2 * IncC, 4 ); get1( adrC + 3 * IncC, 5 );
  3861. get1( adrC + 4 * IncC, 6 );
  3862. WHILE (K > 0) DO (* padding guaranteed *)
  3863. get2( adrA, 7 ); get2( adrB, 0 ); mul2( 0, 7 ); add2( 2, 0 );
  3864. get2( adrB + 16, 0 ); mul2( 0, 7 ); add2( 3, 0 ); get2( adrB + 32, 0 );
  3865. mul2( 0, 7 ); add2( 4, 0 ); get2( adrB + 48, 0 ); mul2( 0, 7 );
  3866. add2( 5, 0 ); get2( adrB + 64, 0 ); mul2( 0, 7 ); add2( 6, 0 ); INC( adrB, 80 );
  3867. INC( adrA, 16 ); DEC( K, 2 );
  3868. END;
  3869. horadd( 2 ); horadd( 3 ); horadd( 4 ); horadd( 5 ); horadd( 6 );
  3870. put1( adrC, 2 ); put1( adrC + IncC, 3 ); put1( adrC + 2 * IncC, 4 );
  3871. put1( adrC + 3 * IncC, 5 ); put1( adrC + 4 * IncC, 6 );
  3872. END L1Block5X;
  3873. PROCEDURE L1Block1R( adrA, adrB, adrC, K: LONGINT ); (* condition: K MOD 4 = 0 *)
  3874. VAR reg: ARRAY 8 OF ARRAY 4 OF REAL;
  3875. PROCEDURE null( i: LONGINT );
  3876. BEGIN
  3877. reg[i, 0] := 0; reg[i, 1] := 0; reg[i, 2] := 0; reg[i, 3] := 0;
  3878. END null;
  3879. PROCEDURE get1( adr, i: LONGINT );
  3880. BEGIN
  3881. SYSTEM.GET( adr, reg[i, 0] );
  3882. IF debug THEN KernelLog.String( "get 1: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 ); KernelLog.Ln; END;
  3883. END get1;
  3884. PROCEDURE get4( adr, i: LONGINT );
  3885. BEGIN
  3886. SYSTEM.GET( adr, reg[i, 0] ); SYSTEM.GET( adr + 4, reg[i, 1] );
  3887. SYSTEM.GET( adr + 8, reg[i, 2] ); SYSTEM.GET( adr + 12, reg[i, 3] );
  3888. IF debug THEN
  3889. KernelLog.String( "get 4: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 );
  3890. KernelLog.Int( ENTIER( reg[i, 1] + 0.5 ), 5 ); KernelLog.Int( ENTIER( reg[i, 2] + 0.5 ), 5 );
  3891. KernelLog.Int( ENTIER( reg[i, 3] + 0.5 ), 5 ); KernelLog.Ln;
  3892. END;
  3893. END get4;
  3894. PROCEDURE mul4( i, j: LONGINT );
  3895. BEGIN
  3896. reg[i, 0] := reg[i, 0] * reg[j, 0]; reg[i, 1] := reg[i, 1] * reg[j, 1];
  3897. reg[i, 2] := reg[i, 2] * reg[j, 2]; reg[i, 3] := reg[i, 3] * reg[j, 3];
  3898. END mul4;
  3899. PROCEDURE add4( i, j: LONGINT );
  3900. BEGIN
  3901. reg[i, 0] := reg[i, 0] + reg[j, 0]; reg[i, 1] := reg[i, 1] + reg[j, 1];
  3902. reg[i, 2] := reg[i, 2] + reg[j, 2]; reg[i, 3] := reg[i, 3] + reg[j, 3];
  3903. END add4;
  3904. PROCEDURE put1( adr, i: LONGINT );
  3905. BEGIN
  3906. SYSTEM.PUT( adr, reg[i, 0] );
  3907. END put1;
  3908. PROCEDURE horadd( i: LONGINT );
  3909. BEGIN
  3910. reg[i, 0] := reg[i, 0] + reg[i, 1] + reg[i, 2] + reg[i, 3];
  3911. END horadd;
  3912. BEGIN
  3913. IF debug THEN KernelLog.String( "L1Block1" ); KernelLog.Ln; END;
  3914. null( 2 ); get1( adrC, 2 );
  3915. WHILE (K > 0) DO (* padding guaranteed *)
  3916. get4( adrA, 7 ); get4( adrB, 0 ); mul4( 0, 7 ); add4( 2, 0 ); INC( adrB, 16 );
  3917. INC( adrA, 16 ); DEC( K, 4 );
  3918. END;
  3919. horadd( 2 ); put1( adrC, 2 );
  3920. END L1Block1R;
  3921. PROCEDURE L1Block5R( adrA, adrB, adrC, IncC, K: LONGINT ); (* condition: K MOD 4 = 0 *)
  3922. VAR reg: ARRAY 8 OF ARRAY 4 OF REAL;
  3923. PROCEDURE null( i: LONGINT );
  3924. BEGIN
  3925. reg[i, 0] := 0; reg[i, 1] := 0; reg[i, 2] := 0; reg[i, 3] := 0;
  3926. END null;
  3927. PROCEDURE get1( adr, i: LONGINT );
  3928. BEGIN
  3929. SYSTEM.GET( adr, reg[i, 0] );
  3930. IF debug THEN KernelLog.String( "get 1: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 ); KernelLog.Ln; END;
  3931. END get1;
  3932. PROCEDURE get4( adr, i: LONGINT );
  3933. BEGIN
  3934. SYSTEM.GET( adr, reg[i, 0] ); SYSTEM.GET( adr + 4, reg[i, 1] );
  3935. SYSTEM.GET( adr + 8, reg[i, 2] ); SYSTEM.GET( adr + 12, reg[i, 3] );
  3936. IF debug THEN
  3937. KernelLog.String( "get 4: " ); KernelLog.Int( ENTIER( reg[i, 0] + 0.5 ), 5 );
  3938. KernelLog.Int( ENTIER( reg[i, 1] + 0.5 ), 5 ); KernelLog.Int( ENTIER( reg[i, 2] + 0.5 ), 5 );
  3939. KernelLog.Int( ENTIER( reg[i, 3] + 0.5 ), 5 ); KernelLog.Ln;
  3940. END;
  3941. END get4;
  3942. PROCEDURE mul4( i, j: LONGINT );
  3943. BEGIN
  3944. reg[i, 0] := reg[i, 0] * reg[j, 0]; reg[i, 1] := reg[i, 1] * reg[j, 1];
  3945. reg[i, 2] := reg[i, 2] * reg[j, 2]; reg[i, 3] := reg[i, 3] * reg[j, 3];
  3946. END mul4;
  3947. PROCEDURE add4( i, j: LONGINT );
  3948. BEGIN
  3949. reg[i, 0] := reg[i, 0] + reg[j, 0]; reg[i, 1] := reg[i, 1] + reg[j, 1];
  3950. reg[i, 2] := reg[i, 2] + reg[j, 2]; reg[i, 3] := reg[i, 3] + reg[j, 3];
  3951. END add4;
  3952. PROCEDURE put1( adr, i: LONGINT );
  3953. BEGIN
  3954. SYSTEM.PUT( adr, reg[i, 0] );
  3955. END put1;
  3956. PROCEDURE horadd( i: LONGINT );
  3957. BEGIN
  3958. reg[i, 0] := reg[i, 0] + reg[i, 1] + reg[i, 2] + reg[i, 3];
  3959. END horadd;
  3960. BEGIN
  3961. IF debug THEN KernelLog.String( "L1Block5" ); KernelLog.Ln; END;
  3962. null( 2 ); null( 3 ); null( 4 ); null( 5 ); null( 6 ); get1( adrC, 2 );
  3963. get1( adrC + IncC, 3 ); get1( adrC + 2 * IncC, 4 ); get1( adrC + 3 * IncC, 5 );
  3964. get1( adrC + 4 * IncC, 6 );
  3965. WHILE (K > 0) DO (* padding guaranteed *)
  3966. get4( adrA, 7 ); get4( adrB, 0 ); mul4( 0, 7 ); add4( 2, 0 );
  3967. get4( adrB + 16, 0 ); mul4( 0, 7 ); add4( 3, 0 ); get4( adrB + 32, 0 );
  3968. mul4( 0, 7 ); add4( 4, 0 ); get4( adrB + 48, 0 ); mul4( 0, 7 );
  3969. add4( 5, 0 ); get4( adrB + 64, 0 ); mul4( 0, 7 ); add4( 6, 0 ); INC( adrB, 80 );
  3970. INC( adrA, 16 ); DEC( K, 4 );
  3971. END;
  3972. horadd( 2 ); horadd( 3 ); horadd( 4 ); horadd( 5 ); horadd( 6 );
  3973. put1( adrC, 2 ); put1( adrC + IncC, 3 ); put1( adrC + 2 * IncC, 4 );
  3974. put1( adrC + 3 * IncC, 5 ); put1( adrC + 4 * IncC, 6 );
  3975. END L1Block5R;
  3976. *)
  3977. PROCEDURE DispCR( adrM: ADDRESS;
  3978. inc, stride, M, N: SIZE );
  3979. VAR i, j: SIZE; adr: ADDRESS; val: REAL;
  3980. BEGIN
  3981. FOR i := 0 TO M - 1 DO
  3982. adr := adrM + i * stride;
  3983. FOR j := 0 TO N - 1 DO
  3984. SYSTEM.GET( adr, val );
  3985. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); INC( adr, inc );
  3986. END;
  3987. KernelLog.Ln;
  3988. END;
  3989. END DispCR;
  3990. PROCEDURE DispCX( adrM: ADDRESS;
  3991. inc, stride, M, N: SIZE );
  3992. VAR i, j: SIZE; adr: ADDRESS; val: LONGREAL;
  3993. BEGIN
  3994. FOR i := 0 TO M - 1 DO
  3995. adr := adrM + i * stride;
  3996. FOR j := 0 TO N - 1 DO
  3997. SYSTEM.GET( adr, val );
  3998. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); INC( adr, inc );
  3999. END;
  4000. KernelLog.Ln;
  4001. END;
  4002. END DispCX;
  4003. PROCEDURE L3BlockX( matrixA, matrixB, matrixC: ADDRESS;
  4004. M, N, K, incC, strideC, L2BlockM, L2BlockN, L2BlockK: SIZE );
4005. (*
4006. A is an M x K matrix, B is a K x N matrix; the product A * B is written into the M x N matrix C
4007. (incC = column increment, strideC = row stride of C, in bytes).
4012. *)
  4013. VAR m, n, k, a1, b1: SIZE; adrA, adrB, adrC: ADDRESS;
  4014. KAligned: SIZE;
  4015. CONST Size = SIZEOF( LONGREAL );
  4016. PROCEDURE L2Block( matrixA, matrixB, matrixC: ADDRESS; M, N, K: SIZE );
  4017. (* M,N and K arbitrary ! *)
  4018. VAR adrA, adrB, adrC: ADDRESS; aadrA, aadrB: ADDRESS;
  4019. m, k, KAligned: SIZE;
  4020. BEGIN
  4021. KAligned := Align2( K ) * 8;
  4022. IF debug THEN ASSERT( M > 0 ); ASSERT( K > 0 );
  4023. END;
  4024. adrB := matrixB;
  4025. WHILE (N >= L1BlockN) DO
  4026. IF debug THEN KernelLog.String( "LoopL2N" ); KernelLog.Ln END;
  4027. adrC := matrixC; adrA := matrixA; m := M;
  4028. WHILE (m > 0) DO
  4029. IF debug THEN KernelLog.String( "LoopL2M" ); KernelLog.Ln END;
  4030. IF SSE THEN
  4031. L1Block5XSSE( adrA, adrB, adrC, incC, K ); (* L1 Block *)
  4032. ELSE
  4033. aadrA := adrA; aadrB := adrB; k := K;
  4034. WHILE (k > 0) DO
  4035. L1Block1XA( aadrA, aadrB, adrC, 2 );
  4036. L1Block1XA( aadrA, aadrB + 16, adrC + incC, 2 );
  4037. L1Block1XA( aadrA, aadrB + 32, adrC + 2 * incC,
  4038. 2 );
  4039. L1Block1XA( aadrA, aadrB + 48, adrC + 3 * incC,
  4040. 2 );
  4041. L1Block1XA( aadrA, aadrB + 64, adrC + 4 * incC,
  4042. 2 );
  4043. DEC( k, 2 ); INC( aadrA, 16 );
  4044. INC( aadrB, 16 * L1BlockN );
  4045. END;
  4046. END;
  4047. IF debug THEN
  4048. DispCX( matrixC, incC, strideC, M, N );
  4049. END;
  4050. INC( adrA, KAligned ); INC( adrC, strideC );
  4051. DEC( m );
  4052. END;
  4053. INC( matrixC, L1BlockN * incC );
  4054. INC( adrB, L1BlockN * KAligned ); DEC( N, L1BlockN );
  4055. END;
  4056. WHILE (N > 0) DO
  4057. IF debug THEN KernelLog.String( "LoopL2N rest" ); KernelLog.Ln END;
  4058. adrC := matrixC; adrA := matrixA; m := M;
  4059. WHILE (m > 0) DO
  4060. IF debug THEN KernelLog.String( "LoopL2M" ); KernelLog.Ln END;
  4061. IF SSE THEN
  4062. L1Block1XSSE( adrA, adrB, adrC, K ); (* L1 Block *)
  4063. ELSE L1Block1XA( adrA, adrB, adrC, K );
  4064. END;
  4065. IF debug THEN
  4066. DispCX( matrixC, incC, strideC, M, N );
  4067. END;
  4068. INC( adrA, KAligned ); INC( adrC, strideC );
  4069. DEC( m );
  4070. END;
  4071. INC( matrixC, incC ); INC( adrB, KAligned ); DEC( N );
  4072. END;
  4073. END L2Block;
  4074. BEGIN
  4075. KAligned := Align2( K ) * 8;
  4076. ASSERT( L2BlockK MOD 2 = 0 );
  4077. IF SSE THEN ASSERT( L2BlockN MOD L1BlockN = 0 ); END;
  4078. m := M; n := N; k := K; a1 := matrixA; adrA := matrixA;
  4079. b1 := matrixB; adrB := matrixB; adrC := matrixC;
  4080. WHILE (n >= L2BlockN) DO
  4081. IF debug THEN KernelLog.String( "LoopL3N" ); KernelLog.Ln END;
  4082. a1 := matrixA; adrC := matrixC; m := M;
  4083. WHILE (m >= L2BlockM) DO
  4084. IF debug THEN KernelLog.String( "LoopL3M" ); KernelLog.Ln END;
  4085. adrA := a1; adrB := b1; k := K;
  4086. (* core: do matching level 2 cache Blocks *)
  4087. WHILE (k >= L2BlockK) DO
  4088. IF debug THEN KernelLog.String( "LoopL3K" ); KernelLog.Ln END;
  4089. L2Block( adrA, adrB, adrC, L2BlockM, L2BlockN,
  4090. L2BlockK );
  4091. INC( adrA, L2BlockK * L2BlockM * Size );
  4092. INC( adrB, L2BlockK * L2BlockN * Size ); (* no padding required *)
  4093. DEC( k, L2BlockK );
  4094. END;
  4095. (* core: do rest of k *)
  4096. IF k > 0 THEN
  4097. L2Block( adrA, adrB, adrC, L2BlockM, L2BlockN, k );
  4098. END;
  4099. INC( a1, KAligned * L2BlockM );
  4100. INC( adrC, L2BlockM * strideC ); DEC( m, L2BlockM );
  4101. END;
  4102. IF m > 0 THEN
  4103. (* clean up M *)
  4104. adrA := a1; adrB := b1; k := K;
  4105. WHILE (k >= L2BlockK) DO
  4106. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4107. L2Block( adrA, adrB, adrC, m, L2BlockN, L2BlockK );
  4108. INC( adrA, L2BlockK * Size * m );
  4109. INC( adrB, L2BlockK * L2BlockN * Size );
  4110. DEC( k, L2BlockK );
  4111. END;
  4112. IF debug THEN KernelLog.String( "LoopL3K rest k" ); KernelLog.Ln END;
  4113. IF k > 0 THEN
  4114. L2Block( adrA, adrB, adrC, m, L2BlockN, k );
  4115. END;
  4116. END;
  4117. INC( b1, L2BlockN * KAligned );
  4118. INC( matrixC, L2BlockN * incC ); DEC( n, L2BlockN );
  4119. END;
  4120. IF (n = 0) THEN RETURN
  4121. END;
  4122. a1 := matrixA; adrC := matrixC; m := M;
  4123. WHILE (m >= L2BlockM) DO
  4124. IF debug THEN KernelLog.String( "LoopL3M rest" ); KernelLog.Ln END;
  4125. adrA := a1; adrB := b1; k := K;
  4126. WHILE (k >= L2BlockK) DO
  4127. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4128. L2Block( adrA, adrB, adrC, L2BlockM, n, L2BlockK );
  4129. INC( adrA, L2BlockM * L2BlockK * Size );
  4130. INC( adrB, L2BlockK * n * Size ); DEC( k, L2BlockK );
  4131. END;
  4132. IF k > 0 THEN
  4133. L2Block( adrA, adrB, adrC, L2BlockM, n, k );
  4134. END;
  4135. INC( a1, L2BlockM * KAligned );
  4136. INC( adrC, L2BlockM * strideC ); DEC( m, L2BlockM );
  4137. END;
  4138. IF (m = 0) THEN RETURN
  4139. END;
  4140. adrA := a1; adrB := b1; k := K;
  4141. WHILE (k >= L2BlockK) DO
  4142. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4143. L2Block( adrA, adrB, adrC, m, n, L2BlockK );
  4144. INC( adrA, L2BlockK * m * Size );
  4145. INC( adrB, L2BlockK * n * Size ); DEC( k, L2BlockK );
  4146. END;
  4147. IF k > 0 THEN L2Block( adrA, adrB, adrC, m, n, k );
  4148. END;
  4149. END L3BlockX;
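(* Structure of the blocking (added sketch): L3BlockX walks C in L2BlockM x L2BlockN tiles and cuts
	K into L2BlockK slices, i.e. roughly
		FOR each L2BlockN-wide column panel of B and C DO
			FOR each L2BlockM-high row panel of A and C DO
				FOR each L2BlockK-deep slice of K DO L2Block END
			END
		END
	with explicit clean-up of the remainders in n, m and k. L2Block in turn walks its panel in
	L1BlockN-column strips and calls the L1 kernels (L1Block5XSSE / L1Block1XSSE, or the pure
	Oberon fallbacks when SSE is not available). A and B are expected in the packed layouts
	produced by CopyAX and CopyBX below; L3BlockR is the REAL counterpart. *)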
  4150. PROCEDURE L3BlockR( matrixA, matrixB, matrixC: ADDRESS;
  4151. M, N, K, incC, strideC, L2BlockM, L2BlockN, L2BlockK: SIZE );
4152. (*
4153. A is an M x K matrix, B is a K x N matrix; the product A * B is written into the M x N matrix C
4154. (incC = column increment, strideC = row stride of C, in bytes).
4159. *)
  4160. VAR m, n, k, a1, b1: SIZE; adrA, adrB, adrC: ADDRESS;
  4161. KAligned: SIZE;
  4162. CONST Size = SIZEOF( REAL );
  4163. PROCEDURE L2Block( matrixA, matrixB, matrixC: ADDRESS; M, N, K: SIZE );
  4164. (* M,N and K arbitrary ! *)
  4165. VAR adrA, adrB, adrC: ADDRESS; aadrA, aadrB: ADDRESS;
  4166. m, KAligned, k: SIZE;
  4167. BEGIN
  4168. KAligned := Align4( K ) * 4;
  4169. IF debug THEN ASSERT( M > 0 ); ASSERT( K > 0 );
  4170. END;
  4171. adrB := matrixB;
  4172. WHILE (N >= L1BlockN) DO
  4173. IF debug THEN KernelLog.String( "LoopL2N" ); KernelLog.Ln END;
  4174. adrC := matrixC; adrA := matrixA; m := M;
  4175. WHILE (m > 0) DO
  4176. IF debug THEN KernelLog.String( "LoopL2M" ); KernelLog.Ln END;
  4177. IF SSE THEN
  4178. L1Block5RSSE( adrA, adrB, adrC, incC, K ); (* L1 Block *)
  4179. ELSE
  4180. aadrA := adrA; aadrB := adrB; k := K;
  4181. WHILE (k > 0) DO
  4182. L1Block1RA( aadrA, aadrB, adrC, 4 );
  4183. L1Block1RA( aadrA, aadrB + 16, adrC + incC, 4 );
  4184. L1Block1RA( aadrA, aadrB + 32, adrC + 2 * incC,
  4185. 4 );
  4186. L1Block1RA( aadrA, aadrB + 48, adrC + 3 * incC,
  4187. 4 );
  4188. L1Block1RA( aadrA, aadrB + 64, adrC + 4 * incC,
  4189. 4 );
  4190. DEC( k, 4 ); INC( aadrA, 16 );
  4191. INC( aadrB, 16 * L1BlockN );
  4192. END;
  4193. END;
  4194. IF debug THEN
  4195. DispCR( matrixC, incC, strideC, M, N );
  4196. END;
  4197. INC( adrA, KAligned ); INC( adrC, strideC );
  4198. DEC( m );
  4199. END;
  4200. INC( matrixC, L1BlockN * incC );
  4201. INC( adrB, L1BlockN * KAligned ); DEC( N, L1BlockN );
  4202. END;
  4203. WHILE (N > 0) DO
  4204. IF debug THEN KernelLog.String( "LoopL2N rest" ); KernelLog.Ln END;
  4205. adrC := matrixC; adrA := matrixA; m := M;
  4206. WHILE (m > 0) DO
  4207. IF debug THEN KernelLog.String( "LoopL2M" ); KernelLog.Ln END;
  4208. IF SSE THEN
  4209. L1Block1RSSE( adrA, adrB, adrC, K ); (* L1 Block *)
  4210. ELSE L1Block1RA( adrA, adrB, adrC, K );
  4211. END;
  4212. IF debug THEN
  4213. DispCR( matrixC, incC, strideC, M, N );
  4214. END;
  4215. INC( adrA, KAligned ); INC( adrC, strideC );
  4216. DEC( m );
  4217. END;
  4218. INC( matrixC, incC ); INC( adrB, KAligned ); DEC( N );
  4219. END;
  4220. END L2Block;
  4221. BEGIN
  4222. KAligned := Align4( K ) * 4;
  4223. ASSERT( L2BlockK MOD 4 = 0 );
  4224. IF SSE THEN ASSERT( L2BlockN MOD L1BlockN = 0 ); END;
  4225. m := M; n := N; k := K; a1 := matrixA; adrA := matrixA;
  4226. b1 := matrixB; adrB := matrixB; adrC := matrixC;
  4227. WHILE (n >= L2BlockN) DO
  4228. IF debug THEN KernelLog.String( "LoopL3N" ); KernelLog.Ln END;
  4229. a1 := matrixA; adrC := matrixC; m := M;
  4230. WHILE (m >= L2BlockM) DO
  4231. IF debug THEN KernelLog.String( "LoopL3M" ); KernelLog.Ln END;
  4232. adrA := a1; adrB := b1; k := K;
  4233. (* core: do matching level 2 cache Blocks *)
  4234. WHILE (k >= L2BlockK) DO
  4235. IF debug THEN KernelLog.String( "LoopL3K" ); KernelLog.Ln END;
  4236. L2Block( adrA, adrB, adrC, L2BlockM, L2BlockN,
  4237. L2BlockK );
  4238. INC( adrA, L2BlockK * L2BlockM * Size );
  4239. INC( adrB, L2BlockK * L2BlockN * Size ); (* no padding required *)
  4240. DEC( k, L2BlockK );
  4241. END;
  4242. (* core: do rest of k *)
  4243. IF k > 0 THEN
  4244. L2Block( adrA, adrB, adrC, L2BlockM, L2BlockN, k );
  4245. END;
  4246. INC( a1, KAligned * L2BlockM );
  4247. INC( adrC, L2BlockM * strideC ); DEC( m, L2BlockM );
  4248. END;
  4249. IF m > 0 THEN
  4250. (* clean up M *)
  4251. adrA := a1; adrB := b1; k := K;
  4252. WHILE (k >= L2BlockK) DO
  4253. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4254. L2Block( adrA, adrB, adrC, m, L2BlockN, L2BlockK );
  4255. INC( adrA, L2BlockK * Size * m );
  4256. INC( adrB, L2BlockK * L2BlockN * Size );
  4257. DEC( k, L2BlockK );
  4258. END;
  4259. IF debug THEN KernelLog.String( "LoopL3K rest k" ); KernelLog.Ln END;
  4260. IF k > 0 THEN
  4261. L2Block( adrA, adrB, adrC, m, L2BlockN, k );
  4262. END;
  4263. END;
  4264. INC( b1, L2BlockN * KAligned );
  4265. INC( matrixC, L2BlockN * incC ); DEC( n, L2BlockN );
  4266. END;
  4267. IF (n = 0) THEN RETURN
  4268. END;
  4269. a1 := matrixA; adrC := matrixC; m := M;
  4270. WHILE (m >= L2BlockM) DO
  4271. IF debug THEN KernelLog.String( "LoopL3M rest" ); KernelLog.Ln END;
  4272. adrA := a1; adrB := b1; k := K;
  4273. WHILE (k >= L2BlockK) DO
  4274. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4275. L2Block( adrA, adrB, adrC, L2BlockM, n, L2BlockK );
  4276. INC( adrA, L2BlockM * L2BlockK * Size );
  4277. INC( adrB, L2BlockK * n * Size ); DEC( k, L2BlockK );
  4278. END;
  4279. IF k > 0 THEN
  4280. L2Block( adrA, adrB, adrC, L2BlockM, n, k );
  4281. END;
  4282. INC( a1, L2BlockM * KAligned );
  4283. INC( adrC, L2BlockM * strideC ); DEC( m, L2BlockM );
  4284. END;
  4285. IF (m = 0) THEN RETURN
  4286. END;
  4287. adrA := a1; adrB := b1; k := K;
  4288. WHILE (k >= L2BlockK) DO
  4289. IF debug THEN KernelLog.String( "LoopL3K rest" ); KernelLog.Ln END;
  4290. L2Block( adrA, adrB, adrC, m, n, L2BlockK );
  4291. INC( adrA, L2BlockK * m * Size );
  4292. INC( adrB, L2BlockK * n * Size ); DEC( k, L2BlockK );
  4293. END;
  4294. IF k > 0 THEN L2Block( adrA, adrB, adrC, m, n, k );
  4295. END;
  4296. END L3BlockR;
  4297. PROCEDURE Align( adr: ADDRESS; align: SIZE ): ADDRESS;
  4298. BEGIN
4299. RETURN adr + (-adr) MOD align; (* e.g. align = 16 for 128 bit = 16 byte alignment *)
  4300. END Align;
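(* Example: Align( adr, 16 ) returns the next 16-byte aligned address at or above adr,
	e.g. Align( 1003, 16 ) = 1003 + (-1003) MOD 16 = 1003 + 5 = 1008. *)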
  4301. PROCEDURE CopyAX( matrixA, dest: ADDRESS;
  4302. IncA, StrideA: SIZE;
  4303. K, M, L2BlockK, L2BlockM: SIZE );
  4304. VAR m, k: SIZE; adrA: ADDRESS; t: HUGEINT;
  4305. PROCEDURE CopyMK( matrixA: ADDRESS; M, K: SIZE );
  4306. VAR rest: SIZE;
  4307. BEGIN
  4308. IF debug THEN
  4309. KernelLog.String( "CopyMK:" ); KernelLog.Int( M, 10 ); KernelLog.Int( K, 10 );
  4310. KernelLog.Ln;
  4311. END;
  4312. rest := (-K) MOD 2;
  4313. WHILE (M > 0) DO
  4314. MovX( matrixA, dest, IncA, K ); INC( dest, K * 8 );
  4315. IF rest # 0 THEN
  4316. ZeroX( dest, rest ); INC( dest, 8 * rest );
  4317. END;
  4318. INC( matrixA, StrideA ); DEC( M );
  4319. END;
  4320. END CopyMK;
  4321. BEGIN
  4322. Tic( t ); m := M;
  4323. WHILE (m >= L2BlockM) DO
  4324. k := K; adrA := matrixA;
  4325. WHILE (k >= L2BlockK) DO
  4326. CopyMK( adrA, L2BlockM, L2BlockK );
  4327. INC( adrA, L2BlockK * IncA ); DEC( k, L2BlockK );
  4328. END;
  4329. IF k > 0 THEN CopyMK( adrA, L2BlockM, k ); END;
  4330. INC( matrixA, L2BlockM * StrideA ); DEC( m, L2BlockM );
  4331. END;
  4332. adrA := matrixA; k := K;
  4333. WHILE (k >= L2BlockK) DO
  4334. CopyMK( adrA, m, L2BlockK );
  4335. INC( adrA, L2BlockK * IncA ); DEC( k, L2BlockK );
  4336. END;
  4337. IF k > 0 THEN CopyMK( adrA, m, k ); END;
  4338. Toc( t, copyT );
  4339. END CopyAX;
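(* Packing note (added): CopyAX copies an M x K block of A row by row into the contiguous buffer at
	dest, padding every row with zeros up to an even number of LONGREALs (Align2) so that the
	2-wide SSE kernels may read past the logical end of a row. Example: K = 7 means each packed row
	occupies 8 LONGREALs = 64 bytes, the last of which is zero. CopyAR below is the REAL
	counterpart with 4-element padding (Align4). *)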
  4340. PROCEDURE CopyAR( matrixA, dest: ADDRESS;
  4341. IncA, StrideA: SIZE;
  4342. K, M, L2BlockK, L2BlockM: SIZE );
  4343. VAR m, k: SIZE; adrA: ADDRESS; t: HUGEINT;
  4344. PROCEDURE CopyMK( matrixA: ADDRESS; M, K: SIZE );
  4345. VAR rest: SIZE;
  4346. BEGIN
  4347. rest := (-K) MOD 4;
  4348. WHILE (M > 0) DO
  4349. MovR( matrixA, dest, IncA, K ); INC( dest, K * 4 );
  4350. IF rest # 0 THEN
  4351. ZeroR( dest, rest ); INC( dest, 4 * rest );
  4352. END;
  4353. INC( matrixA, StrideA ); DEC( M );
  4354. END;
  4355. END CopyMK;
  4356. BEGIN
  4357. Tic( t ); m := M;
  4358. WHILE (m >= L2BlockM) DO
  4359. k := K; adrA := matrixA;
  4360. WHILE (k >= L2BlockK) DO
  4361. CopyMK( adrA, L2BlockM, L2BlockK );
  4362. INC( adrA, L2BlockK * IncA ); DEC( k, L2BlockK );
  4363. END;
  4364. IF k > 0 THEN CopyMK( adrA, L2BlockM, k ); END;
  4365. INC( matrixA, L2BlockM * StrideA ); DEC( m, L2BlockM );
  4366. END;
  4367. adrA := matrixA; k := K;
  4368. WHILE (k >= L2BlockK) DO
  4369. CopyMK( adrA, m, L2BlockK );
  4370. INC( adrA, L2BlockK * IncA ); DEC( k, L2BlockK );
  4371. END;
  4372. IF k > 0 THEN CopyMK( adrA, m, k ); END;
  4373. Toc( t, copyT );
  4374. END CopyAR;
  4375. PROCEDURE CopyBX( matrixB, dest: ADDRESS;
  4376. IncB, StrideB: SIZE;
  4377. N, K, L2BlockN, L2BlockK: SIZE );
  4378. VAR n, k: SIZE; adrB: ADDRESS; t: HUGEINT;
  4379. PROCEDURE Copy5x2k( matrixB: ADDRESS; k: SIZE );
  4380. VAR i: SIZE; adrB: ADDRESS; rest: SIZE;
  4381. BEGIN
  4382. rest := (-k) MOD 2;
  4383. WHILE (k >= 2) DO (* store 5x4 Block in line *)
  4384. adrB := matrixB;
  4385. FOR i := 1 TO L1BlockN DO
  4386. MovX( adrB, dest, StrideB, 2 ); INC( dest, 16 );
  4387. INC( adrB, IncB );
  4388. END;
  4389. INC( matrixB, 2 * StrideB ); DEC( k, 2 );
  4390. END;
  4391. IF k > 0 THEN
  4392. adrB := matrixB;
  4393. FOR i := 1 TO L1BlockN DO
  4394. MovX( adrB, dest, StrideB, k ); INC( dest, 8 * k );
  4395. IF rest # 0 THEN
  4396. ZeroX( dest, rest ); INC( dest, rest * 8 );
  4397. END;
  4398. INC( adrB, IncB );
  4399. END;
  4400. END;
  4401. END Copy5x2k;
  4402. PROCEDURE Copy1( matrixB: ADDRESS; K, N:SIZE );
  4403. VAR n, rest: SIZE;
  4404. BEGIN
  4405. rest := (-K) MOD 2;
  4406. IF debug THEN
  4407. KernelLog.String( ">>Copy1" ); KernelLog.Int( K, 10 ); KernelLog.Int( N, 10 ); KernelLog.Ln;
  4408. END;
  4409. n := N;
  4410. WHILE (n >= L1BlockN) DO (* store Kx5 Blocks as one line *)
  4411. Copy5x2k( matrixB, K );
  4412. IF debug THEN ASSERT( dest MOD 16 = 0 ); END;
  4413. INC( matrixB, L1BlockN * IncB ); DEC( n, L1BlockN );
  4414. END;
  4415. IF debug THEN KernelLog.String( "Copy1, n=" ); KernelLog.Int( n, 10 ); KernelLog.Ln;
  4416. END;
  4417. WHILE (n > 0) DO (* store remaining rectangle in separate lines *)
  4418. MovX( matrixB, dest, StrideB, K ); INC( dest, K * 8 );
  4419. ZeroR( dest, rest ); INC( dest, rest * 8 );
  4420. INC( matrixB, IncB ); DEC( n );
  4421. END;
  4422. END Copy1;
  4423. BEGIN
  4424. Tic( t ); ASSERT( L2BlockN MOD L1BlockN = 0 );
  4425. ASSERT( L2BlockK MOD 2 = 0 ); n := N;
  4426. WHILE (n >= L2BlockN) DO
  4427. k := K; adrB := matrixB;
  4428. WHILE (k >= L2BlockK) DO
  4429. Copy1( adrB, L2BlockK, L2BlockN );
  4430. INC( adrB, L2BlockK * StrideB ); DEC( k, L2BlockK );
  4431. END;
  4432. IF k > 0 THEN Copy1( adrB, k, L2BlockN ) END;
  4433. INC( matrixB, L2BlockN * IncB ); DEC( n, L2BlockN );
  4434. END;
  4435. IF (n = 0) THEN RETURN
  4436. END;
  4437. k := K; adrB := matrixB;
  4438. WHILE (k >= L2BlockK) DO
  4439. Copy1( adrB, L2BlockK, n );
  4440. INC( adrB, L2BlockK * StrideB ); DEC( k, L2BlockK );
  4441. END;
  4442. Copy1( adrB, k, n ); Toc( t, copyT );
  4443. END CopyBX;
  4444. PROCEDURE CopyBR( matrixB, dest: ADDRESS;
  4445. IncB, StrideB: SIZE;
  4446. N, K, L2BlockN, L2BlockK: SIZE );
  4447. VAR n, k: SIZE; adrB: ADDRESS; t: HUGEINT;
  4448. PROCEDURE Copy5x4k( matrixB: ADDRESS; k: SIZE );
  4449. VAR i: SIZE; adrB: ADDRESS; rest, k4: SIZE;
  4450. BEGIN
  4451. k4 := k - k MOD 4; rest := (-k) MOD 4;
  4452. IF k4 > 0 THEN
  4453. MovR5( matrixB, IncB, StrideB, dest, k4 );
  4454. INC( matrixB, k4 * StrideB ); INC( dest, k4 * 80 DIV 4 );
  4455. DEC( k, k4 );
  4456. END;
  4457. (*
  4458. WHILE (k >= 4) DO (* store 5x4 Block in line *)
  4459. adrB := matrixB;
  4460. FOR i := 1 TO L1BlockN DO
  4461. MovR( adrB, dest, StrideB, 4 ); INC( dest, 16 ); INC( adrB, IncB );
  4462. END;
  4463. INC( matrixB, 4 * StrideB ); DEC( k, 4 );
  4464. END;
  4465. *)
  4466. IF k > 0 THEN
  4467. adrB := matrixB;
  4468. FOR i := 1 TO L1BlockN DO
  4469. MovR( adrB, dest, StrideB, k ); INC( dest, 4 * k );
  4470. IF rest # 0 THEN
  4471. ZeroR( dest, rest ); INC( dest, rest * 4 );
  4472. END;
  4473. INC( adrB, IncB );
  4474. END;
  4475. END;
  4476. END Copy5x4k;
  4477. PROCEDURE Copy1( matrixB: ADDRESS; K, N:SIZE );
  4478. VAR n, rest: SIZE;
  4479. BEGIN
  4480. rest := (-K) MOD 4;
  4481. IF debug THEN
  4482. KernelLog.String( ">>Copy1" ); KernelLog.Int( K, 10 ); KernelLog.Int( N, 10 ); KernelLog.Ln;
  4483. END;
  4484. n := N;
  4485. WHILE (n >= L1BlockN) DO (* store Kx5 Blocks as one line *)
  4486. Copy5x4k( matrixB, K );
  4487. IF debug THEN ASSERT( dest MOD 16 = 0 ); END;
  4488. INC( matrixB, L1BlockN * IncB ); DEC( n, L1BlockN );
  4489. END;
  4490. WHILE (n > 0) DO (* store remaining rectangle in separate lines *)
  4491. MovR( matrixB, dest, StrideB, K ); INC( dest, K * 4 );
  4492. ZeroR( dest, rest ); INC( dest, rest * 4 );
  4493. INC( matrixB, IncB ); DEC( n );
  4494. END;
  4495. END Copy1;
  4496. BEGIN
  4497. Tic( t ); ASSERT( L2BlockN MOD L1BlockN = 0 );
  4498. ASSERT( L2BlockK MOD 4 = 0 ); n := N;
  4499. WHILE (n >= L2BlockN) DO
  4500. k := K; adrB := matrixB;
  4501. WHILE (k >= L2BlockK) DO
  4502. Copy1( adrB, L2BlockK, L2BlockN );
  4503. INC( adrB, L2BlockK * StrideB ); DEC( k, L2BlockK );
  4504. END;
  4505. IF k > 0 THEN Copy1( adrB, k, L2BlockN ) END;
  4506. INC( matrixB, L2BlockN * IncB ); DEC( n, L2BlockN );
  4507. END;
  4508. IF (n = 0) THEN RETURN
  4509. END;
  4510. k := K; adrB := matrixB;
  4511. WHILE (k >= L2BlockK) DO
  4512. Copy1( adrB, L2BlockK, n );
  4513. INC( adrB, L2BlockK * StrideB ); DEC( k, L2BlockK );
  4514. END;
  4515. Copy1( adrB, k, n ); Toc( t, copyT );
  4516. END CopyBR;
  4517. (*
  4518. PROCEDURE FillMR( VAR A: ARRAY [ .. , .. ] OF REAL );
  4519. VAR i, j: LONGINT;
  4520. BEGIN
  4521. FOR i := 0 TO LEN( A, 0 ) - 1 DO
  4522. FOR j := 0 TO LEN( A, 1 ) - 1 DO
  4523. A[i, j] := ran.Dice( 10 );
  4524. IF debug THEN A[i, j] := 10 * i + j; END;
  4525. END;
  4526. END;
  4527. END FillMR;
  4528. PROCEDURE DispMR( VAR A: ARRAY [ .. , .. ] OF REAL );
  4529. VAR i, j: LONGINT;
  4530. BEGIN
  4531. FOR i := 0 TO LEN( A, 0 ) - 1 DO
  4532. FOR j := 0 TO LEN( A, 1 ) - 1 DO KernelLog.Int( ENTIER( A[i, j] + 0.5 ), 5 ); END;
  4533. KernelLog.Ln;
  4534. END;
  4535. END DispMR;
  4536. PROCEDURE FillMX( VAR A: ARRAY [ .. , .. ] OF LONGREAL );
  4537. VAR i, j: LONGINT;
  4538. BEGIN
  4539. FOR i := 0 TO LEN( A, 0 ) - 1 DO
  4540. FOR j := 0 TO LEN( A, 1 ) - 1 DO
  4541. A[i, j] := ran.Dice( 10 );
  4542. IF debug THEN A[i, j] := 10 * i + j; END;
  4543. END;
  4544. END;
  4545. END FillMX;
  4546. PROCEDURE DispMX( VAR A: ARRAY [ .. , .. ] OF LONGREAL );
  4547. VAR i, j: LONGINT;
  4548. BEGIN
  4549. FOR i := 0 TO LEN( A, 0 ) - 1 DO
  4550. FOR j := 0 TO LEN( A, 1 ) - 1 DO KernelLog.Int( ENTIER( A[i, j] + 0.5 ), 5 ); END;
  4551. KernelLog.Ln;
  4552. END;
  4553. END DispMX;
  4554. *)
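(* cycle counters used for profiling the copy/zero/compute phases: GetTimer serializes
   the pipeline with CPUID and reads the time stamp counter (RDTSC); Tic stores the
   current value, Toc adds the elapsed cycles to one of the global counters such as
   copyT, zeroT or compT and restarts the measurement. *)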
  4555. PROCEDURE -GetTimer( ): HUGEINT;
  4556. CODE {SYSTEM.Pentium}
  4557. CPUID ;
  4558. RDTSC
  4559. END GetTimer;
  4560. PROCEDURE Tic( VAR t: HUGEINT );
  4561. BEGIN
  4562. t := GetTimer();
  4563. END Tic;
  4564. PROCEDURE Toc( VAR t, addto: HUGEINT );
  4565. BEGIN
  4566. INC( addto, GetTimer() - t ); t := GetTimer();
  4567. END Toc;
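(* MultiplyX / MultiplyR: L2-blocked matrix multiplication C := A * B, or C := C + A * B
   when add is set. A and B are first repacked into a single cache buffer (CopyAX/CopyBX
   for LONGREAL, CopyAR/CopyBR for REAL), C is zeroed row by row unless add is set, and
   the actual computation is done by L3BlockX/L3BlockR. If parallel is enabled and
   M > L2BlockM, the rows of A are divided into nrProcesses slices that are processed by
   MultiplyObject workers. *)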
  4568. PROCEDURE MultiplyX( A, B, C: ADDRESS;
  4569. M, N, K, L2BlockM, L2BlockN, L2BlockK:SIZE;
  4570. IncA, StrideA, IncB, StrideB, IncC, StrideC: SIZE;
  4571. add: BOOLEAN );
  4572. VAR lenA, lenB: SIZE; adrA, adrB, adrC: ADDRESS; m: SIZE;
  4573. M1, M2, i: SIZE; val: LONGREAL; t: HUGEINT;
  4574. inc: SIZE;
  4575. obj: POINTER TO ARRAY OF MultiplyObjectX;
  4576. cache: Cache;
  4577. BEGIN
  4578. NEW(obj,nrProcesses+1);
  4579. lenA := M * Align2( K ) * 8; lenB := N * Align2( K ) * 8;
  4580. cache := cachePool.Acquire( lenA + lenB );
  4581. adrA := cache.adr; adrB := adrA + lenA;
  4582. CopyAX( A, adrA, IncA, StrideA, K, M, L2BlockK, L2BlockM );
  4583. CopyBX( B, adrB, IncB, StrideB, N, K, L2BlockN, L2BlockK );
  4584. Tic( t ); m := M; adrC := C;
  4585. IF ~add THEN
  4586. WHILE (m > 0) DO
  4587. ZeroXI( adrC, IncC, N ); INC( adrC, StrideC ); DEC( m );
  4588. END;
  4589. END;
  4590. Toc( t, zeroT );
  4591. IF debug THEN
  4592. KernelLog.String( "copy of A: " ); KernelLog.Ln;
  4593. FOR i := 0 TO M * Align2( K ) - 1 DO
  4594. SYSTEM.GET( adrA + i * 8, val );
  4595. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); KernelLog.Ln;
  4596. END;
  4597. END;
  4598. IF debug THEN
  4599. KernelLog.String( "copy of B: " ); KernelLog.Ln;
  4600. FOR i := 0 TO N * Align2( K ) - 1 DO
  4601. SYSTEM.GET( adrB + i * 8, val );
  4602. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); KernelLog.Ln;
  4603. END;
  4604. END;
  4605. IF parallel & (M > L2BlockM) THEN
  4606. inc := Align( M DIV nrProcesses, L2BlockM ); M1 := 0;
  4607. i := 0;
  4608. WHILE (M1 < M) DO
  4609. M2 := M1 + inc;
  4610. IF M2 > M THEN M2 := M END;
  4611. NEW( obj[i], adrA + M1 * Align2( K ) * 8, adrB,
  4612. C + StrideC * M1, M2 - M1, N, K, IncC, StrideC,
  4613. L2BlockM, L2BlockN, L2BlockK );
  4614. M1 := M2; INC( i );
  4615. END;
  4616. WHILE (i > 0) DO DEC( i ); obj[i].Wait; END;
  4617. ELSE
  4618. L3BlockX( adrA, adrB, C, M, N, K, IncC, StrideC, L2BlockM,
  4619. L2BlockN, L2BlockK );
  4620. END;
  4621. Toc( t, compT ); cachePool.Release( cache );
  4622. END MultiplyX;
  4623. PROCEDURE MultiplyR( A, B, C: ADDRESS;
  4624. M, N, K, L2BlockM, L2BlockN, L2BlockK: SIZE;
  4625. IncA, StrideA, IncB, StrideB, IncC, StrideC: SIZE;
  4626. add: BOOLEAN );
  4627. VAR lenA, lenB, len: SIZE; adrA, adrB, adrC: ADDRESS; m: SIZE;
  4628. M1, M2, i: SIZE; val: REAL; inc: SIZE;
  4629. obj: POINTER TO ARRAY OF MultiplyObjectR;
  4630. t: HUGEINT; cache: Cache;
  4631. BEGIN
  4632. NEW(obj,nrProcesses+1);
  4633. lenA := M * Align4( K ) * 4; lenB := N * Align4( K ) * 4;
  4634. cache := cachePool.Acquire( lenA + lenB );
  4635. adrA := cache.adr; adrB := adrA + lenA;
  4636. CopyAR( A, adrA, IncA, StrideA, K, M, L2BlockK, L2BlockM );
  4637. CopyBR( B, adrB, IncB, StrideB, N, K, L2BlockN, L2BlockK );
  4638. Tic( t ); m := M; adrC := C;
  4639. IF ~add THEN
  4640. WHILE (m > 0) DO
  4641. ZeroRI( adrC, IncC, N ); INC( adrC, StrideC );
  4642. DEC( m );
  4643. END;
  4644. END;
  4645. Toc( t, zeroT );
  4646. IF debug THEN
  4647. KernelLog.String( "copy of A: " ); KernelLog.Ln;
  4648. FOR i := 0 TO M * Align4( K ) - 1 DO
  4649. SYSTEM.GET( adrA + i * 4, val );
  4650. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); KernelLog.Ln;
  4651. END;
  4652. END;
  4653. IF debug THEN
  4654. KernelLog.String( "copy of B: " ); KernelLog.Ln;
  4655. FOR i := 0 TO N * Align4( K ) - 1 DO
  4656. SYSTEM.GET( adrB + i * 4, val );
  4657. KernelLog.Int( ENTIER( val + 0.5 ), 5 ); KernelLog.Ln;
  4658. END;
  4659. END;
  4660. IF parallel & (M > L2BlockM) THEN
  4661. inc := Align( M DIV nrProcesses, L2BlockM ); M1 := 0;
  4662. i := 0;
  4663. WHILE (M1 < M) DO
  4664. M2 := M1 + inc;
  4665. IF M2 > M THEN M2 := M END;
  4666. NEW( obj[i], adrA + M1 * Align4( K ) * 4, adrB,
  4667. C + StrideC * M1, M2 - M1, N, K, IncC, StrideC,
  4668. L2BlockM, L2BlockN, L2BlockK );
  4669. M1 := M2; INC( i );
  4670. END;
  4671. WHILE (i > 0) DO DEC( i ); obj[i].Wait; END;
  4672. ELSE
  4673. L3BlockR( adrA, adrB, C, M, N, K, IncC, StrideC, L2BlockM,
  4674. L2BlockN, L2BlockK );
  4675. END;
  4676. Toc( t, compT ); cachePool.Release( cache );
  4677. END MultiplyR;
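(* A minimal usage sketch for MultiplyR (illustrative only, not part of the module):
   increments and strides are given in bytes, and the L2 block sizes would normally be
   obtained from MagicBlockR. For a row-major M x K by K x N REAL product this amounts to:

PROCEDURE ExampleMultiplyR;
VAR A, B, C: POINTER TO ARRAY OF ARRAY OF REAL;
	M, N, K, L2M, L2N, L2K: SIZE;
BEGIN
	M := 64; N := 64; K := 64;
	NEW( A, M, K ); NEW( B, K, N ); NEW( C, M, N );
	MagicBlockR( M, N, K, L2M, L2N, L2K );
	MultiplyR( ADDRESSOF( A[0, 0] ), ADDRESSOF( B[0, 0] ), ADDRESSOF( C[0, 0] ),
		M, N, K, L2M, L2N, L2K,
		SIZEOF( REAL ), K * SIZEOF( REAL ), (* IncA, StrideA *)
		SIZEOF( REAL ), N * SIZEOF( REAL ), (* IncB, StrideB *)
		SIZEOF( REAL ), N * SIZEOF( REAL ), (* IncC, StrideC *)
		FALSE ); (* overwrite C instead of accumulating *)
END ExampleMultiplyR;
*)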
  4678. (*
  4679. PROCEDURE DoTestX( M, N, K, L2BlockM, L2BlockN, L2BlockK: LONGINT; check: BOOLEAN; iter: LONGINT );
  4680. VAR (* A, B, C, D: ARRAY [ .. , .. ] OF REAL; *)
  4681. A, B, C, D: ARRAY [ .. , .. ] OF LONGREAL;
  4682. p, q: ANY; l1, l2: LONGINT; adrA, adrB, len, lenA, lenB: LONGINT;
  4683. matrixA, matrixB, matrixC: LONGINT; i: LONGINT; val: LONGREAL; atime, time: LONGINT;
  4684. BEGIN
  4685. KernelLog.String( "LONGREAL" ); KernelLog.Ln; KernelLog.String( "M=" ); KernelLog.Int( M, 0 ); KernelLog.Ln;
  4686. KernelLog.String( "N=" ); KernelLog.Int( N, 0 ); KernelLog.Ln; KernelLog.String( "K=" ); KernelLog.Int( K, 0 );
  4687. KernelLog.Ln; KernelLog.String( "L2BlockM=" ); KernelLog.Int( L2BlockM, 0 ); KernelLog.Ln; KernelLog.String( "L2BlockN=" );
  4688. KernelLog.Int( L2BlockN, 0 ); KernelLog.Ln; KernelLog.String( "L2BlockK=" ); KernelLog.Int( L2BlockK, 0 );
  4689. KernelLog.Ln; NEW( A, M, K ); NEW( B, K, N ); NEW( C, M, N ); FillMX( A ); FillMX( B );
  4690. IF debug THEN DispMX( A ); KernelLog.String( "*" ); KernelLog.Ln; DispMX( B );
  4691. END;
  4692. atime := Input.Time(); (* C := 0; *)
  4693. WHILE (iter > 0) DO
  4694. MultiplyX( ADDRESSOF( A[0, 0] ), ADDRESSOF( B[0, 0] ),
  4695. ADDRESSOF( C[0, 0] ), M, N, K, L2BlockM, L2BlockN, L2BlockK,
  4696. (*
  4697. 8,
  4698. LEN( A, 1 ) * 8, 8, LEN( B, 1 ) * 8, 8, LEN( C, 1 ) * 8
  4699. *)
  4700. SYSTEM.INCR( A, 1 ), SYSTEM.INCR( A, 0 ), SYSTEM.INCR( B, 1 ),
  4701. SYSTEM.INCR( B, 0 ), SYSTEM.INCR( C, 1 ), SYSTEM.INCR( C, 0 )
  4702. );
  4703. DEC( iter );
  4704. END;
  4705. atime := Input.Time() - atime; KernelLog.String( "overall time: " ); KernelLog.Int( atime, 10 ); KernelLog.Ln;
  4706. IF debug THEN
  4707. DispMX( A ); KernelLog.String( " * " ); KernelLog.Ln; DispMX( B ); KernelLog.String( " = " );
  4708. KernelLog.Ln; DispMX( C ); KernelLog.String( " ------------" ); KernelLog.Ln;
  4709. END;
  4710. IF check THEN
  4711. (*
  4712. NEW(D,M,N);
  4713. MatMulAXAXNaiive(ADDRESSOF( A[0, 0] ), ADDRESSOF( B[0, 0] ),
  4714. ADDRESSOF( D[0, 0] ), M, N, K, SYSTEM.INCR( A, 1 ), SYSTEM.INCR( A, 0 ), SYSTEM.INCR( B, 1 ),
  4715. SYSTEM.INCR( B, 0 ), SYSTEM.INCR( D, 1 ), SYSTEM.INCR( D, 0 ));
  4716. *)
  4717. D := A * B;
  4718. ASSERT ( ENTIER( D + 0.5 ) = ENTIER( C + 0.5 ) );
  4719. END;
  4720. END DoTestX;
  4721. PROCEDURE DoTestR( M, N, K, L2BlockM, L2BlockN, L2BlockK: LONGINT; check: BOOLEAN; iter: LONGINT );
  4722. VAR (* A, B, C, D: ARRAY [ .. , .. ] OF REAL; *)
  4723. A, B, C, D: ARRAY [ .. , .. ] OF REAL;
  4724. p, q: ANY; l1, l2: LONGINT; adrA, adrB, len, lenA, lenB: LONGINT;
  4725. matrixA, matrixB, matrixC: LONGINT; i: LONGINT; val: REAL; atime, time: LONGINT;
  4726. BEGIN
  4727. KernelLog.String( "REAL" ); KernelLog.Ln; KernelLog.String( "M=" ); KernelLog.Int( M, 0 ); KernelLog.Ln;
  4728. KernelLog.String( "N=" ); KernelLog.Int( N, 0 ); KernelLog.Ln; KernelLog.String( "K=" ); KernelLog.Int( K, 0 );
  4729. KernelLog.Ln; KernelLog.String( "L2BlockM=" ); KernelLog.Int( L2BlockM, 0 ); KernelLog.Ln; KernelLog.String( "L2BlockN=" );
  4730. KernelLog.Int( L2BlockN, 0 ); KernelLog.Ln; KernelLog.String( "L2BlockK=" ); KernelLog.Int( L2BlockK, 0 );
  4731. KernelLog.Ln; NEW( A, M, K ); NEW( B, K, N ); NEW( C, M, N ); FillMR( A ); FillMR( B );
  4732. IF debug THEN DispMR( A ); KernelLog.String( "*" ); KernelLog.Ln; DispMR( B );
  4733. END;
  4734. atime := Input.Time(); (* C := 0; *)
  4735. FOR i := 1 TO iter DO
  4736. MultiplyR( ADDRESSOF( A[0, 0] ), ADDRESSOF( B[0, 0] ),
  4737. ADDRESSOF( C[0, 0] ), M, N, K, L2BlockM, L2BlockN, L2BlockK,
  4738. (* 4,
  4739. LEN( A, 1 ) * 4, 4, LEN( B, 1 ) * 4, 4, LEN( C, 1 ) * 4 *)
  4740. SYSTEM.INCR( A, 1 ), SYSTEM.INCR( A, 0 ), SYSTEM.INCR( B, 1 ),
  4741. SYSTEM.INCR( B, 0 ), SYSTEM.INCR( C, 1 ), SYSTEM.INCR( C, 0 )
  4742. );
  4743. END;
  4744. atime := Input.Time() - atime; KernelLog.String( "overall time: " ); KernelLog.Int( atime, 10 ); KernelLog.Ln;
  4745. IF debug THEN
  4746. DispMR( A ); KernelLog.String( " * " ); KernelLog.Ln; DispMR( B ); KernelLog.String( " = " );
  4747. KernelLog.Ln; DispMR( C ); KernelLog.String( " ------------" ); KernelLog.Ln;
  4748. END;
  4749. IF check THEN
  4750. (*
  4751. NEW(D,M,N);
  4752. MatMulARARNaiive(ADDRESSOF( A[0, 0] ), ADDRESSOF( B[0, 0] ),
  4753. ADDRESSOF( D[0, 0] ), M, N, K, SYSTEM.INCR( A, 1 ), SYSTEM.INCR( A, 0 ), SYSTEM.INCR( B, 1 ),
  4754. SYSTEM.INCR( B, 0 ), SYSTEM.INCR( D, 1 ), SYSTEM.INCR( D, 0 ));
  4755. *)
  4756. D := A * B;
  4757. ASSERT ( ENTIER( D + 0.5 ) = ENTIER( C + 0.5 ) );
  4758. END;
  4759. END DoTestR;
  4760. PROCEDURE RandTestR*;
  4761. VAR iter, i, time: LONGINT;
  4762. MinM, MaxM, MinN, MaxN, MinK, MaxK, M, N, K, BM, BN, BK: LONGINT;
  4763. PROCEDURE Ran( Min, Max: LONGINT ): LONGINT;
  4764. BEGIN
  4765. IF Min = Max THEN RETURN Min
  4766. ELSE RETURN ran.Dice( Max - Min ) + Min
  4767. END;
  4768. END Ran;
  4769. BEGIN
  4770. In.Open(); In.LongInt( iter );
  4771. In.LongInt( MinM ); In.LongInt( MaxM ); In.LongInt( MinN );
  4772. In.LongInt( MaxN ); In.LongInt( MinK ); In.LongInt( MaxK );
  4773. FOR i := 1 TO iter DO KernelLog.Int( i, 10 ); KernelLog.Ln; DEC( iter );
  4774. M := Ran( MinM, MaxM ); N := Ran( MinN, MaxN );
  4775. K := Ran( MinK, MaxK );
  4776. IF N < 5 THEN N := 5 END;
  4777. IF K < 4 THEN K := 4 END;
  4778. BM := ran.Dice( M ) + 1; BN := ran.Dice( N ) + 1; BK := ran.Dice( K ) + 1;
  4779. BN := Align( BN, 5 );
  4780. IF BN > N THEN DEC( BN, 5 ) END;
  4781. BK := Align( BK, 4 );
  4782. IF BK > K THEN DEC( BK, 4 ) END;
  4783. DoTestR( M, N, K, BM, BN, BK, TRUE , 1 );
  4784. END;
  4785. END RandTestR;
  4786. PROCEDURE RandTestX*;
  4787. VAR iter, i, time: LONGINT;
  4788. MinM, MaxM, MinN, MaxN, MinK, MaxK, M, N, K, BM, BN, BK: LONGINT;
  4789. PROCEDURE Ran( Min, Max: LONGINT ): LONGINT;
  4790. BEGIN
  4791. IF Min = Max THEN RETURN Min
  4792. ELSE RETURN ran.Dice( Max - Min ) + Min
  4793. END;
  4794. END Ran;
  4795. BEGIN
  4796. In.Open(); In.LongInt( iter );
  4797. In.LongInt( MinM ); In.LongInt( MaxM ); In.LongInt( MinN );
  4798. In.LongInt( MaxN ); In.LongInt( MinK ); In.LongInt( MaxK );
  4799. FOR i := 1 TO iter DO KernelLog.Int( i, 10 ); KernelLog.Ln; DEC( iter );
  4800. M := Ran( MinM, MaxM ); N := Ran( MinN, MaxN );
  4801. K := Ran( MinK, MaxK );
  4802. IF N < 5 THEN N := 5 END;
  4803. IF K < 4 THEN K := 4 END;
  4804. BM := ran.Dice( M ) + 1; BN := ran.Dice( N ) + 1; BK := ran.Dice( K ) + 1;
  4805. BN := Align( BN, 5 );
  4806. IF BN > N THEN DEC( BN, 5 ) END;
  4807. BK := Align( BK, 4 );
  4808. IF BK > K THEN DEC( BK, 4 ) END;
  4809. DoTestX( M, N, K, BM, BN, BK, TRUE , 1 );
  4810. END;
  4811. END RandTestX;
  4812. *)
  4813. (*
  4814. PROCEDURE Times*;
  4815. VAR all: HUGEINT;
  4816. BEGIN
  4817. all := allocT + copyT + zeroT + compT; KernelLog.String( "alloc=" );
  4818. KernelLog.LongRealFix( allocT / 1000000, 0, 20 ); KernelLog.String( "[" );
  4819. KernelLog.Int( ENTIER( 100 * allocT / all + 0.5 ), 5 ); KernelLog.String( "%]" );
  4820. KernelLog.Ln; KernelLog.String( "copy=" );
  4821. KernelLog.LongRealFix( copyT / 1000000, 0, 20 ); KernelLog.String( "[" );
  4822. KernelLog.Int( ENTIER( 100 * copyT / all + 0.5 ), 5 ); KernelLog.String( "%]" );
  4823. KernelLog.Ln; KernelLog.String( "zero=" );
  4824. KernelLog.LongRealFix( zeroT / 1000000, 0, 20 ); KernelLog.String( "[" );
  4825. KernelLog.Int( ENTIER( 100 * zeroT / all + 0.5 ), 5 ); KernelLog.String( "%]" );
  4826. KernelLog.Ln; KernelLog.String( "comp=" );
  4827. KernelLog.LongRealFix( compT / 1000000, 0, 20 ); KernelLog.String( "[" );
  4828. KernelLog.Int( ENTIER( 100 * compT / all + 0.5 ), 5 ); KernelLog.String( "%]" );
  4829. KernelLog.Ln;
  4830. END Times;
  4831. *)
  4832. (*
  4833. PROCEDURE TestRMM*;
  4834. VAR M, N, K, L2BlockM, L2BlockN, L2BlockK: LONGINT; matrixA, matrixB, matrixC: LONGINT;
  4835. check, iter: LONGINT;
  4836. BEGIN
  4837. In.Open; In.LongInt( M ); In.LongInt( N ); In.LongInt( K );
  4838. In.LongInt( L2BlockM ); In.LongInt( L2BlockN ); In.LongInt( L2BlockK );
  4839. In.LongInt( iter ); In.LongInt( check );
  4840. IF L2BlockM = 0 THEN
  4841. MagicBlockR( M DIV nrProcesses, N, K, L2BlockM, L2BlockN, L2BlockK );
  4842. END;
  4843. DoTestR( M, N, K, L2BlockM, L2BlockN, L2BlockK, check = 1, iter ); Times();
  4844. END TestRMM;
  4845. PROCEDURE TestXMM*;
  4846. VAR M, N, K, L2BlockM, L2BlockN, L2BlockK: LONGINT; matrixA, matrixB, matrixC: LONGINT;
  4847. iter, check: LONGINT;
  4848. BEGIN
  4849. In.Open; In.LongInt( M ); In.LongInt( N ); In.LongInt( K );
  4850. In.LongInt( L2BlockM ); In.LongInt( L2BlockN ); In.LongInt( L2BlockK );
  4851. In.LongInt( iter ); In.LongInt( check );
  4852. IF L2BlockM = 0 THEN
  4853. MagicBlockX( M DIV nrProcesses, N, K, L2BlockM, L2BlockN, L2BlockK );
  4854. END;
  4855. DoTestX( M, N, K, L2BlockM, L2BlockN, L2BlockK, check = 1, iter ); Times();
  4856. END TestXMM;
  4857. *)
  4858. (****** matrix multiplication using fast scalar product ******)
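(* Thin wrappers around the SSE scalar-product kernels (SPARARLoop*, SPAXAXLoop*):
   the MatMul...Loop variants first clear the destination element and therefore compute
   C[i,j] from scratch, while the MatMulInc...Loop variants accumulate into the value
   already stored at dadr. *)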
  4859. PROCEDURE MatMulAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  4860. BEGIN
  4861. SYSTEM.PUT( dadr, 0.0D0 ); (* initialization of scalar product to 0 *)
  4862. SPAXAXLoopA( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  4863. END MatMulAXAXLoopA;
  4864. PROCEDURE MatMulAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  4865. BEGIN
  4866. SYSTEM.PUT( dadr, 0.0D0 ); (* initialization of scalar product to 0 *)
  4867. SPAXAXLoopSSE( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  4868. END MatMulAXAXLoopSSE;
  4869. PROCEDURE MatMulARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  4870. BEGIN
  4871. SYSTEM.PUT( dadr, 0.0E0 ); (* initialization of scalar product to 0 *)
  4872. SPARARLoopA( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  4873. END MatMulARARLoopA;
  4874. PROCEDURE MatMulARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  4875. BEGIN
  4876. SYSTEM.PUT( dadr, 0.0E0 ); (* initialization of scalar product to 0 *)
  4877. SPARARLoopSSE( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  4878. END MatMulARARLoopSSE;
  4879. PROCEDURE MatMulIncAXAXLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  4880. BEGIN
  4881. SPAXAXLoopA( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  4882. END MatMulIncAXAXLoopA;
  4883. PROCEDURE MatMulIncAXAXLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  4884. BEGIN
  4885. SPAXAXLoopSSE( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  4886. END MatMulIncAXAXLoopSSE;
  4887. PROCEDURE MatMulIncARARLoopA( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  4888. BEGIN
  4889. SPARARLoopA( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  4890. END MatMulIncARARLoopA;
  4891. PROCEDURE MatMulIncARARLoopSSE( ladr, radr, dadr: ADDRESS; linc, rinc, len: SIZE );
  4892. BEGIN
  4893. SPARARLoopSSE( ladr, radr, dadr, linc, rinc, len ); (* apply scalar product *)
  4894. END MatMulIncARARLoopSSE;
4895. (****** matrix multiplication over rows with transposition of B ******)
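(* Approach of this section: C[i,j] is computed as an aligned scalar product of row i of A
   with row j of a transposed, 16-byte aligned copy of B. MatMulHBlockR/X walk A and B` in
   square blocks of BlockSize rows so that the operands of the inner scalar products stay
   in the L2 cache; BlockSize is derived from L2CacheSize unless cBlockSize overrides it. *)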
  4896. PROCEDURE MatMulHBlockR( MatrixA, MatrixB, MatrixC: ADDRESS;
  4897. (*inc=4*) Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
  4898. add: BOOLEAN );
  4899. VAR fromA, toA, fromB, toB: ADDRESS; BlockSize: SIZE;
  4900. (*
  4901. computation of C[i,j] = Sum{k=0..Cols-1} A[i,k]*B[j,k], i.e. A*B`
  4902. *)
  4903. PROCEDURE Block( fromA, toA, fromB, toB: ADDRESS );
  4904. VAR i, j: ADDRESS; adrA, adrB, adrC: ADDRESS;
  4905. BEGIN
  4906. FOR i := fromA TO toA - 1 DO
  4907. adrA := MatrixA + i * Stride;
  4908. FOR j := fromB TO toB - 1 DO
  4909. adrB := MatrixB + j * Stride;
  4910. adrC := MatrixC + i * StrideC + j * IncC;
  4911. AlignedSPRSSE( adrA, adrB, adrC, Cols, add );
  4912. END;
  4913. END;
  4914. END Block;
  4915. BEGIN
  4916. IF cBlockSize = 0 THEN
  4917. BlockSize := L2CacheSize DIV Stride DIV 4;
  4918. ELSE BlockSize := cBlockSize;
  4919. END;
  4920. lastUsedBlockSize := BlockSize;
  4921. fromA := 0;
  4922. REPEAT
  4923. toA := fromA + BlockSize;
  4924. IF toA > RowsA THEN toA := RowsA END;
  4925. fromB := 0;
  4926. REPEAT
  4927. toB := fromB + BlockSize;
  4928. IF toB > RowsB THEN toB := RowsB END;
  4929. Block( fromA, toA, fromB, toB ); fromB := toB;
  4930. UNTIL toB = RowsB;
  4931. fromA := toA;
  4932. UNTIL toA = RowsA;
  4933. END MatMulHBlockR;
  4934. PROCEDURE MatMulHBlockX( MatrixA, MatrixB, MatrixC: ADDRESS;
4935. (*inc=8*) Stride, IncC, StrideC, RowsA, RowsB, Cols: SIZE;
  4936. add: BOOLEAN );
  4937. VAR fromA, toA, fromB, toB: ADDRESS; BlockSize: SIZE;
  4938. (*
  4939. computation of C[i,j] = Sum{k=0..Cols-1} A[i,k]*B[j,k], i.e. A*B`
  4940. *)
  4941. PROCEDURE Block( fromA, toA, fromB, toB: ADDRESS );
  4942. VAR adrA, adrB, adrC: ADDRESS; i, j: ADDRESS;
  4943. BEGIN
  4944. FOR i := fromA TO toA - 1 DO
  4945. adrA := MatrixA + i * Stride;
  4946. FOR j := fromB TO toB - 1 DO
  4947. adrB := MatrixB + j * Stride;
  4948. adrC := MatrixC + i * StrideC + j * IncC;
  4949. AlignedSPXSSE( adrA, adrB, adrC, Cols, add );
  4950. END;
  4951. END;
  4952. END Block;
  4953. BEGIN
  4954. IF cBlockSize = 0 THEN
  4955. BlockSize := L2CacheSize DIV Stride DIV 8;
  4956. ELSE BlockSize := cBlockSize;
  4957. END;
  4958. lastUsedBlockSize := BlockSize;
  4959. fromA := 0;
  4960. REPEAT
  4961. toA := fromA + BlockSize;
  4962. IF toA > RowsA THEN toA := RowsA END;
  4963. fromB := 0;
  4964. REPEAT
  4965. toB := fromB + BlockSize;
  4966. IF toB > RowsB THEN toB := RowsB END;
  4967. Block( fromA, toA, fromB, toB ); fromB := toB;
  4968. UNTIL toB = RowsB;
  4969. fromA := toA;
  4970. UNTIL toA = RowsA;
  4971. END MatMulHBlockX;
  4972. PROCEDURE CopyDataR( src, dest: ADDRESS; incSrc, strideSrc, incDest, strideDest, rows, cols:SIZE ); (*! optimize *)
  4973. VAR i: SIZE; t: HUGEINT;
  4974. BEGIN
  4975. Tic( t );
  4976. FOR i := 0 TO rows - 1 DO
  4977. Copy4( src, dest, incSrc, incDest, cols );
  4978. INC( src, strideSrc ); INC( dest, strideDest );
  4979. END;
  4980. Toc( t, copyT );
  4981. END CopyDataR;
  4982. PROCEDURE CopyDataX( src, dest: ADDRESS; incSrc, strideSrc, incDest, strideDest, rows, cols:SIZE ); (*! optimize *)
  4983. VAR i: SIZE; t: HUGEINT;
  4984. BEGIN
  4985. Tic( t );
  4986. FOR i := 0 TO rows - 1 DO
  4987. Copy8( src, dest, incSrc, incDest, cols );
  4988. INC( src, strideSrc ); INC( dest, strideDest );
  4989. END;
  4990. Toc( t, copyT );
  4991. END CopyDataX;
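(* MatMulARARTransposed / MatMulAXAXTransposed prepare the operands for the block kernels
   above: A is copied only if its element increment, its row stride or its base address do
   not match the required dense, 16-byte aligned layout, and B is copied into transposed
   form under the analogous conditions. With more than one processor the columns of B
   (rows of B`) are split into nrProcesses contiguous ranges handled by MatMulHObj worker
   objects; otherwise MatMulHBlockR/X is called directly. *)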
  4992. PROCEDURE MatMulARARTransposed( matrixA, matrixB, matrixC: ADDRESS;
  4993. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  4994. add: BOOLEAN ): BOOLEAN;
  4995. VAR stride, adrB, adrC: ADDRESS;
  4996. proc: POINTER TO ARRAY OF MatMulHObjR;
  4997. from, to0, i: SIZE; cacheA, cacheB: Cache;
  4998. t: HUGEINT;
  4999. BEGIN
  5000. NEW(proc,nrProcesses);
  5001. ASSERT( ColsA = RowsB );
  5002. (* allocate 128 bit = 16 byte aligned matrix *)
  5003. stride := Align( ColsA * SIZEOF( REAL ), 16 );
  5004. IF (IncA # SIZEOF( REAL )) OR (StrideA # stride) OR
  5005. (matrixA MOD 16 # 0) THEN
  5006. cacheA := cachePool.Acquire( stride * RowsA );
  5007. CopyDataR( matrixA, cacheA.adr, IncA, StrideA,
  5008. SIZEOF( REAL ), stride, RowsA, ColsA ); (* copy to array *)
  5009. matrixA := cacheA.adr;
  5010. ELSE cacheA := NIL;
  5011. END;
  5012. IF (StrideB # SIZEOF( REAL )) OR (IncB # stride) OR
  5013. (matrixB MOD 16 # 0) THEN
  5014. cacheB := cachePool.Acquire( stride * ColsB );
  5015. CopyDataR( matrixB, cacheB.adr, StrideB, IncB,
  5016. SIZEOF( REAL ), stride, ColsB, RowsB ); (* copy to transposed array *)
  5017. matrixB := cacheB.adr;
  5018. ELSE cacheB := NIL;
  5019. END;
  5020. Tic( t );
5021. (*! needs a decision rule whether to split by rows or by columns *)
  5022. IF nrProcesses > 1 THEN
  5023. from := 0;
  5024. FOR i := 0 TO nrProcesses - 1 DO
  5025. (*
  5026. to := RowsA * (i + 1) DIV nrProcesses; adrA := matrixA + from * stride;
  5027. adrC := matrixC + from * StrideC;
  5028. *)
  5029. to0 := ColsB * (i + 1) DIV nrProcesses; (*! move to rows ? *)
  5030. adrB := matrixB + from * stride;
  5031. adrC := matrixC + from * IncC;
  5032. NEW( proc[i], matrixA, adrB, adrC, stride, IncC, StrideC,
  5033. RowsA, to0 - from, RowsB, add );
  5034. from := to0;
  5035. END;
  5036. FOR i := 0 TO nrProcesses - 1 DO proc[i].Wait(); END;
  5037. ELSE
  5038. MatMulHBlockR( matrixA, matrixB, matrixC, stride, IncC,
  5039. StrideC, RowsA, ColsB, RowsB, add );
  5040. END;
  5041. Toc( t, compT ); cachePool.Release( cacheA );
  5042. cachePool.Release( cacheB ); RETURN TRUE;
  5043. END MatMulARARTransposed;
  5044. PROCEDURE MatMulAXAXTransposed( matrixA, matrixB, matrixC: ADDRESS;
  5045. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  5046. add: BOOLEAN ): BOOLEAN;
  5047. VAR stride, adrB, adrC: ADDRESS;
  5048. proc: POINTER TO ARRAY OF MatMulHObjX;
  5049. from, to0, i: SIZE; cacheA, cacheB: Cache;
  5050. t: HUGEINT;
  5051. BEGIN
  5052. NEW(proc,nrProcesses);
  5053. ASSERT( ColsA = RowsB );
  5054. stride := Align( ColsA * SIZEOF( LONGREAL ), 16 );
  5055. IF (IncA # SIZEOF( LONGREAL )) OR (StrideA # stride) OR
  5056. (matrixA MOD 16 # 0) THEN
  5057. cacheA := cachePool.Acquire( stride * RowsA );
  5058. CopyDataX( matrixA, cacheA.adr, IncA, StrideA,
  5059. SIZEOF( LONGREAL ), stride, RowsA, ColsA ); (* copy to array *)
  5060. matrixA := cacheA.adr;
  5061. ELSE cacheA := NIL;
  5062. END;
  5063. IF (StrideB # SIZEOF( LONGREAL )) OR (IncB # stride) OR
  5064. (matrixB MOD 16 # 0) THEN
  5065. cacheB := cachePool.Acquire( stride * ColsB );
  5066. CopyDataX( matrixB, cacheB.adr, StrideB, IncB,
  5067. SIZEOF( LONGREAL ), stride, ColsB, RowsB ); (* copy to transposed array *)
  5068. matrixB := cacheB.adr;
  5069. ELSE cacheB := NIL;
  5070. END;
  5071. Tic( t );
  5072. IF nrProcesses > 1 THEN
  5073. from := 0;
  5074. FOR i := 0 TO nrProcesses - 1 DO
  5075. to0 := ColsB * (i + 1) DIV nrProcesses; (*! move to rows ? *)
  5076. adrB := matrixB + from * stride;
  5077. adrC := matrixC + from * IncC;
  5078. NEW( proc[i], matrixA, adrB, adrC, stride, IncC, StrideC,
  5079. RowsA, to0 - from, RowsB, add );
  5080. from := to0;
  5081. END;
  5082. FOR i := 0 TO nrProcesses - 1 DO proc[i].Wait(); END;
  5083. ELSE
  5084. MatMulHBlockX( matrixA, matrixB, matrixC, stride, IncC,
  5085. StrideC, RowsA, ColsB, RowsB, add );
  5086. END;
  5087. Toc( t, compT ); cachePool.Release( cacheA );
  5088. cachePool.Release( cacheB ); RETURN TRUE;
  5089. END MatMulAXAXTransposed;
5090. (****** strided matrix multiplication with restrictions on increments ******)
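(* Approach of this section: operands whose element increment is not the element size are
   first copied into dense buffers; the bulk of C is then produced by the SSE kernels in
   column groups of 24/16/8/4 (REAL) resp. 12/8/4/2 (LONGREAL), and any remaining columns
   are handled by the plain Oberon triple loop at the end of each procedure. If C itself
   had to be copied, the result is copied back into the original layout at the end. *)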
  5091. PROCEDURE MatMulARARSSEStride( matrixA, matrixB, matrixC: ADDRESS;
  5092. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  5093. add: BOOLEAN ): BOOLEAN;
  5094. VAR sum: REAL; CbFrom, i, j, k: SIZE; valA, valB: REAL;
  5095. adrA, adrB, adrC: ADDRESS;
  5096. cacheA, cacheB, cacheC: Cache;
  5097. matrixCO, StrideCO, IncCO: SIZE; t: HUGEINT;
  5098. (*VAR fromA, toA: LONGINT; *)
  5099. BEGIN
  5100. IF (IncA # SIZEOF( REAL )) THEN
  5101. cacheA :=
  5102. cachePool.Acquire( RowsA * ColsA * SIZEOF( REAL ) );
  5103. CopyDataR( matrixA, cacheA.adr, IncA, StrideA,
  5104. SIZEOF( REAL ), SIZEOF( REAL ) * ColsA, RowsA,
  5105. ColsA );
  5106. matrixA := cacheA.adr; IncA := SIZEOF( REAL );
  5107. StrideA := SIZEOF( REAL ) * ColsA;
  5108. END;
  5109. IF (IncB # SIZEOF( REAL )) THEN
  5110. cacheB :=
  5111. cachePool.Acquire( RowsB * ColsB * SIZEOF( REAL ) );
  5112. CopyDataR( matrixB, cacheB.adr, IncB, StrideB,
  5113. SIZEOF( REAL ), SIZEOF( REAL ) * ColsB, RowsB,
  5114. ColsB );
  5115. matrixB := cacheB.adr; IncB := SIZEOF( REAL );
  5116. StrideB := SIZEOF( REAL ) * ColsB;
  5117. END;
  5118. IF (IncC # SIZEOF( REAL )) THEN
  5119. cacheC :=
  5120. cachePool.Acquire( RowsA * ColsB * SIZEOF( REAL ) );
  5121. CopyDataR( matrixC, cacheC.adr, IncC, StrideC,
  5122. SIZEOF( REAL ), SIZEOF( REAL ) * ColsB, RowsA,
  5123. ColsB );
  5124. matrixCO := matrixC; StrideCO := StrideC;
  5125. IncCO := IncC; matrixC := cacheC.adr;
  5126. IncC := SIZEOF( REAL ); StrideC := SIZEOF( REAL ) * ColsB;
  5127. END;
  5128. Tic( t );
  5129. CbFrom := 0;
  5130. IF ColsB >= 24 THEN
  5131. SSEMul24BlockR( CbFrom, StrideA, StrideB, StrideC,
  5132. ColsA, RowsA, ColsB, RowsB, matrixA,
  5133. matrixB, matrixC, add );
  5134. END;
  5135. IF ColsB - CbFrom >= 16 THEN
  5136. SSEMul16BlockR( StrideA, StrideB, StrideC, ColsA, RowsA,
  5137. CbFrom, matrixA, matrixB, matrixC, add );
  5138. INC( CbFrom, 16 );
  5139. END;
  5140. IF ColsB - CbFrom >= 8 THEN
  5141. SSEMul8BlockR( StrideA, StrideB, StrideC, ColsA, RowsA,
  5142. CbFrom, matrixA, matrixB, matrixC, add );
  5143. INC( CbFrom, 8 );
  5144. END;
  5145. IF ColsB - CbFrom >= 4 THEN
  5146. SSEMul4BlockR( StrideA, StrideB, StrideC, ColsA, RowsA,
  5147. CbFrom, matrixA, matrixB, matrixC, add );
  5148. INC( CbFrom, 4 );
  5149. END;
  5150. IF ColsB - CbFrom > 0 THEN
  5151. (* do it in Oberon *)
  5152. FOR i := 0 TO RowsA - 1 DO
  5153. adrC := matrixC + i * StrideC + CbFrom * IncC;
  5154. FOR j := CbFrom TO ColsB - 1 DO
  5155. adrA := matrixA + i * StrideA;
  5156. adrB := matrixB + j * IncB;
  5157. IF add THEN SYSTEM.GET( adrC, sum )
  5158. ELSE sum := 0
  5159. END;
  5160. FOR k := 0 TO RowsB - 1 DO
  5161. SYSTEM.GET( adrA, valA );
  5162. SYSTEM.GET( adrB, valB );
  5163. sum := sum + (* A[i, k] * B[k, j] *) valA * valB;
  5164. INC( adrA, IncA ); INC( adrB, StrideB );
  5165. END;
  5166. SYSTEM.PUT( adrC, sum ); INC( adrC, IncC );
  5167. (* C[i, j] := sum; *)
  5168. END;
  5169. END;
  5170. END;
  5171. Toc( t, compT );
  5172. IF cacheC # NIL THEN
  5173. CopyDataR( matrixC, matrixCO, IncC, StrideC, IncCO,
  5174. StrideCO, RowsA, ColsB );
  5175. END;
  5176. cachePool.Release( cacheA );
  5177. cachePool.Release( cacheB );
  5178. cachePool.Release( cacheC );
  5179. RETURN TRUE;
  5180. END MatMulARARSSEStride;
  5181. PROCEDURE MatMulAXAXSSEStride( matrixA, matrixB, matrixC: ADDRESS;
  5182. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  5183. add: BOOLEAN ): BOOLEAN;
  5184. VAR sum: LONGREAL; CbFrom, i, j, k: SIZE;
  5185. valA, valB: LONGREAL; adrA, adrB, adrC: ADDRESS;
  5186. cacheA, cacheB, cacheC: Cache;
  5187. matrixCO, StrideCO, IncCO:SIZE; t: HUGEINT;
  5188. BEGIN
  5189. IF (IncA # SIZEOF( LONGREAL )) THEN
  5190. cacheA :=
  5191. cachePool.Acquire( RowsA * ColsA * SIZEOF( LONGREAL ) );
  5192. CopyDataX( matrixA, cacheA.adr, IncA, StrideA,
  5193. SIZEOF( LONGREAL ),
  5194. SIZEOF( LONGREAL ) * ColsA, RowsA, ColsA );
  5195. matrixA := cacheA.adr;
  5196. StrideA := SIZEOF( LONGREAL ) * ColsA;
  5197. IncA := SIZEOF( LONGREAL );
  5198. END;
  5199. IF (IncB # SIZEOF( LONGREAL )) THEN
  5200. cacheB :=
  5201. cachePool.Acquire( RowsB * ColsB * SIZEOF( LONGREAL ) );
  5202. CopyDataX( matrixB, cacheB.adr, IncB, StrideB,
  5203. SIZEOF( LONGREAL ),
  5204. SIZEOF( LONGREAL ) * ColsB, RowsB, ColsB );
  5205. matrixB := cacheB.adr;
  5206. StrideB := SIZEOF( LONGREAL ) * ColsB;
  5207. IncB := SIZEOF( LONGREAL );
  5208. END;
  5209. IF (IncC # SIZEOF( LONGREAL )) THEN
  5210. cacheC :=
  5211. cachePool.Acquire( RowsA * ColsB * SIZEOF( LONGREAL ) );
  5212. CopyDataX( matrixC, cacheC.adr, IncC, StrideC,
  5213. SIZEOF( LONGREAL ),
  5214. SIZEOF( LONGREAL ) * ColsB, RowsA, ColsB );
  5215. matrixCO := matrixC; StrideCO := StrideC;
  5216. IncCO := IncC; StrideC := SIZEOF( LONGREAL ) * ColsB;
  5217. IncC := SIZEOF( LONGREAL ); matrixC := cacheC.adr;
  5218. END;
  5219. Tic( t );
  5220. CbFrom := 0;
  5221. IF ColsB >= 12 THEN
  5222. SSEMul12BlockX( CbFrom, StrideA, StrideB, StrideC,
  5223. ColsA, RowsA, ColsB, RowsB, matrixA,
  5224. matrixB, matrixC, add );
  5225. END;
  5226. IF ColsB - CbFrom >= 8 THEN
  5227. SSEMul8BlockX( StrideA, StrideB, StrideC, ColsA, RowsA,
  5228. CbFrom, matrixA, matrixB, matrixC, add );
  5229. INC( CbFrom, 8 );
  5230. END;
  5231. IF ColsB - CbFrom >= 4 THEN
  5232. SSEMul4BlockX( StrideA, StrideB, StrideC, ColsA, RowsA,
  5233. CbFrom, matrixA, matrixB, matrixC, add );
  5234. INC( CbFrom, 4 );
  5235. END;
  5236. IF ColsB - CbFrom >= 2 THEN
  5237. SSEMul2BlockX( StrideA, StrideB, StrideC, ColsA, RowsA,
  5238. CbFrom, matrixA, matrixB, matrixC, add );
  5239. INC( CbFrom, 2 );
  5240. END;
  5241. IF ColsB - CbFrom > 0 THEN
  5242. (* do it in Oberon *)
  5243. FOR i := 0 TO RowsA - 1 DO
  5244. adrC := matrixC + i * StrideC + CbFrom * IncC;
  5245. FOR j := CbFrom TO ColsB - 1 DO
  5246. adrA := matrixA + i * StrideA;
  5247. adrB := matrixB + j * IncB;
  5248. IF add THEN SYSTEM.GET( adrC, sum )
  5249. ELSE sum := 0
  5250. END;
  5251. FOR k := 0 TO RowsB - 1 DO
  5252. SYSTEM.GET( adrA, valA );
  5253. SYSTEM.GET( adrB, valB );
  5254. sum := sum + (* A[i, k] * B[k, j] *) valA * valB;
  5255. INC( adrA, IncA ); INC( adrB, StrideB );
  5256. END;
  5257. SYSTEM.PUT( adrC, sum ); INC( adrC, IncC );
  5258. (* C[i, j] := sum; *)
  5259. END;
  5260. END;
  5261. END;
  5262. Toc( t, compT );
  5263. IF cacheC # NIL THEN
  5264. CopyDataX( matrixC, matrixCO, IncC, StrideC, IncCO,
  5265. StrideCO, RowsA, ColsB );
  5266. END;
  5267. cachePool.Release( cacheA );
  5268. cachePool.Release( cacheB );
  5269. cachePool.Release( cacheC );
  5270. RETURN TRUE;
  5271. END MatMulAXAXSSEStride;
5272. (****** naive Oberon matrix multiplication ******)
  5273. PROCEDURE MatMulARARNaiive( matrixA, matrixB, matrixC: ADDRESS;
  5274. IncA, StrideA, IncB, StrideB, IncC, StrideC, M, N, K: SIZE;
  5275. add: BOOLEAN );
  5276. (*
  5277. A is M x K matrix, M=rows (A); K=cols(A);
  5278. B is K x N matrix; K=rows(B); N = cols(B);
  5279. C is M x N matrix; M=rows(C); N=cols(C);
  5280. *)
  5281. VAR adrA, adrB, innerB, adrC: ADDRESS; i, j, k: SIZE;
  5282. val1, val2, sum: REAL; t: HUGEINT;
  5283. BEGIN
  5284. Tic( t );
  5285. FOR i := 1 TO M DO
  5286. adrC := matrixC; adrB := matrixB;
  5287. FOR j := 1 TO N DO
  5288. adrA := matrixA; innerB := adrB;
  5289. IF add THEN SYSTEM.GET( adrC, sum ) ELSE sum := 0 END;
  5290. FOR k := 1 TO K DO
  5291. SYSTEM.GET( adrA, val1 );
  5292. SYSTEM.GET( innerB, val2 );
  5293. sum := sum + val1 * val2; INC( adrA, IncA );
  5294. INC( innerB, StrideB );
  5295. END;
  5296. SYSTEM.PUT( adrC, sum ); INC( adrB, IncB );
  5297. INC( adrC, IncC );
  5298. END;
  5299. INC( matrixA, StrideA ); INC( matrixC, StrideC );
  5300. END;
  5301. Toc( t, compT );
  5302. END MatMulARARNaiive;
  5303. PROCEDURE MatMulAXAXNaiive( matrixA, matrixB, matrixC: ADDRESS;
  5304. IncA, StrideA, IncB, StrideB, IncC, StrideC, M, N, K: SIZE;
  5305. add: BOOLEAN );
  5306. (*
  5307. A is M x K matrix, M=rows (A); K=cols(A);
  5308. B is K x N matrix; K=rows(B); N = cols(B);
  5309. C is M x N matrix; M=rows(C); N=cols(C);
  5310. *)
  5311. VAR adrA, adrB, innerB, adrC: ADDRESS; i, j, k: SIZE;
  5312. val1, val2, sum: LONGREAL; t: HUGEINT;
  5313. BEGIN
  5314. Tic( t );
  5315. FOR i := 1 TO M DO
  5316. adrC := matrixC; adrB := matrixB;
  5317. FOR j := 1 TO N DO
  5318. adrA := matrixA; innerB := adrB;
  5319. IF add THEN SYSTEM.GET( adrC, sum ) ELSE sum := 0 END;
  5320. FOR k := 1 TO K DO
  5321. SYSTEM.GET( adrA, val1 );
  5322. SYSTEM.GET( innerB, val2 );
  5323. sum := sum + val1 * val2; INC( adrA, IncA );
  5324. INC( innerB, StrideB );
  5325. END;
  5326. SYSTEM.PUT( adrC, sum ); INC( adrB, IncB );
  5327. INC( adrC, IncC );
  5328. END;
  5329. INC( matrixA, StrideA ); INC( matrixC, StrideC );
  5330. END;
  5331. Toc( t, compT );
  5332. END MatMulAXAXNaiive;
  5333. (*
  5334. PROCEDURE Toggle( VAR A, B: LONGINT );
  5335. VAR temp: LONGINT;
  5336. BEGIN
  5337. temp := A; A := B; B := temp;
  5338. END Toggle;
  5339. PROCEDURE Transpose( VAR matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB, IncC, StrideC, M, N, K: LONGINT );
  5340. (*
  5341. prepare computation of C=A*B via C = (B` * A`)`
  5342. *)
  5343. BEGIN
  5344. Toggle( matrixA, matrixB ); Toggle( IncA, StrideB ); Toggle( StrideA, IncB );
  5345. Toggle( IncC, StrideC ); Toggle( M, N );
  5346. END Transpose;
  5347. *)
  5348. (*
  5349. *)
  5350. PROCEDURE BestMethod( M, N, K: SIZE ): LONGINT;
  5351. BEGIN
  5352. IF M = 1 THEN
  5353. IF N < 32 THEN RETURN cMatMulScalarProduct
  5354. ELSIF N < 256 THEN
  5355. IF K < 256 THEN RETURN cMatMulScalarProduct
  5356. ELSE RETURN cMatMulStride
  5357. END;
  5358. ELSE RETURN cMatMulStride
  5359. END;
  5360. ELSIF N = 1 THEN
  5361. IF (M > 1024) & (K > 1024) THEN
  5362. RETURN cMatMulTransposed
  5363. ELSE RETURN cMatMulScalarProduct
  5364. END;
  5365. ELSIF K = 1 THEN
  5366. IF N < 32 THEN
  5367. IF M < 256 THEN RETURN cMatMulNaive
  5368. ELSE RETURN cMatMulStride
  5369. END;
  5370. ELSIF N < 256 THEN
  5371. IF M < 32 THEN RETURN cMatMulNaive
  5372. ELSE RETURN cMatMulStride
  5373. END;
  5374. ELSE RETURN cMatMulStride
  5375. END;
  5376. ELSIF M < 32 THEN
  5377. IF N < 32 THEN RETURN cMatMulScalarProduct
  5378. ELSIF N < 256 THEN
  5379. IF K < 32 THEN RETURN cMatMulScalarProduct
  5380. ELSE RETURN cMatMulStride
  5381. END;
  5382. ELSE RETURN cMatMulStride
  5383. END;
  5384. ELSIF M < 256 THEN
  5385. IF N < 32 THEN
  5386. IF K < 32 THEN RETURN cMatMulScalarProduct
  5387. ELSE RETURN cMatMulStride
  5388. END;
  5389. ELSE
  5390. IF K < 256 THEN RETURN cMatMulStride
  5391. ELSE RETURN cMatMulBlocked
  5392. END;
  5393. END;
  5394. ELSE
  5395. IF N < 32 THEN RETURN cMatMulStride ELSE
  5396. IF K < 256 THEN RETURN cMatMulStride
  5397. ELSE RETURN cMatMulBlocked
  5398. END;
  5399. END;
  5400. END;
  5401. RETURN cMatMulStride;
  5402. END BestMethod;
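(* Examples of the dispatch above: a 512 x 512 x 512 product (M >= 256, N >= 32, K >= 256)
   selects cMatMulBlocked; a 100 x 100 x 100 product falls into the M < 256 branch and uses
   cMatMulStride; a single row times a small matrix (M = 1, N < 32) stays with the plain
   scalar-product fallback. *)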
  5403. (*
5404.        (N)           (K)           (N)
5405.     CCCCCC        AAAAA         BBBBB
5406.     CCCCCC        AAAAA         BBBBB
5407. (M) CCCCCC = (M)  AAAAA  * (K)  BBBBB
5408.     CCCCCC        AAAAA         BBBBB
5409.     CCCCCC        AAAAA         BBBBB
  5410. *)
  5411. PROCEDURE MatMulR( matrixA, matrixB, matrixC: ADDRESS;
  5412. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5413. (*! heuristics for choice of different methods needs improvement *)
  5414. (*! transpose if superior*)
  5415. (*! provide special variant for small [up to 4x4] matrices *)
  5416. VAR M, N, K: SIZE;
  5417. BEGIN
  5418. ASSERT( ColsA = RowsB );
  5419. M := RowsA; N := ColsB; K := ColsA;
  5420. CASE BestMethod( M, N, K ) OF
  5421. | cMatMulScalarProduct:
  5422. RETURN FALSE;
  5423. | cMatMulNaive:
  5424. RETURN MatMulRNaive( matrixA, matrixB, matrixC, IncA,
  5425. StrideA, IncB, StrideB, IncC,
  5426. StrideC, RowsA, ColsA, RowsB,
  5427. ColsB );
  5428. | cMatMulTransposed:
  5429. RETURN MatMulARARTransposed( matrixA, matrixB,
  5430. matrixC, IncA,
  5431. StrideA, IncB,
  5432. StrideB, IncC,
  5433. StrideC, RowsA,
  5434. ColsA, RowsB,
  5435. ColsB, FALSE );
  5436. | cMatMulStride:
  5437. RETURN MatMulARARSSEStride( matrixA, matrixB,
  5438. matrixC, IncA, StrideA,
  5439. IncB, StrideB, IncC,
  5440. StrideC, RowsA,
  5441. ColsA, RowsB, ColsB,
  5442. FALSE );
  5443. | cMatMulBlocked:
  5444. RETURN MatMulARARBlocked( matrixA, matrixB,
  5445. matrixC, IncA, StrideA,
  5446. IncB, StrideB, IncC,
  5447. StrideC, RowsA, ColsA,
  5448. RowsB, ColsB, FALSE );
  5449. ELSE
  5450. RETURN FALSE (* use scalar product for each row and column *)
  5451. END;
  5452. END MatMulR;
  5453. PROCEDURE MatMulX( matrixA, matrixB, matrixC: ADDRESS;
  5454. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5455. VAR M, N, K: SIZE;
  5456. BEGIN
  5457. ASSERT( ColsA = RowsB ); M := RowsA; N := ColsB;
  5458. K := ColsA;
  5459. (*
  5460. KernelLog.String("MatMulX, M,N,K = "); KernelLog.Int(M,10); KernelLog.Int(N,10); KernelLog.Int(K,10); KernelLog.Ln;
  5461. KernelLog.String("Method= "); KernelLog.Int( BestMethod(M,N,K),10); KernelLog.Ln;
  5462. *)
  5463. CASE BestMethod( M, N, K ) OF
  5464. | cMatMulScalarProduct:
  5465. RETURN FALSE;
  5466. | cMatMulNaive:
  5467. RETURN MatMulXNaive( matrixA, matrixB, matrixC, IncA,
  5468. StrideA, IncB, StrideB, IncC,
  5469. StrideC, RowsA, ColsA, RowsB,
  5470. ColsB );
  5471. | cMatMulTransposed:
  5472. RETURN MatMulAXAXTransposed( matrixA, matrixB,
  5473. matrixC, IncA,
  5474. StrideA, IncB, StrideB,
  5475. IncC, StrideC, RowsA,
  5476. ColsA, RowsB, ColsB,
  5477. FALSE );
  5478. | cMatMulStride:
  5479. RETURN MatMulAXAXSSEStride( matrixA, matrixB,
  5480. matrixC, IncA, StrideA,
  5481. IncB, StrideB, IncC,
  5482. StrideC, RowsA, ColsA,
  5483. RowsB, ColsB,
  5484. FALSE );
  5485. | cMatMulBlocked:
  5486. RETURN MatMulAXAXBlocked( matrixA, matrixB,
  5487. matrixC, IncA, StrideA,
  5488. IncB, StrideB, IncC,
  5489. StrideC, RowsA, ColsA,
  5490. RowsB, ColsB, FALSE );
  5491. ELSE
  5492. RETURN FALSE (* use scalar product for each row and column *)
  5493. END;
  5494. END MatMulX;
  5495. PROCEDURE MatMulIncR( matrixA, matrixB, matrixC: ADDRESS;
  5496. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5497. (*! heuristics for choice of different methods needs improvement *)
  5498. (*! transpose if superior*)
  5499. (*! provide special variant for small [up to 4x4] matrices *)
  5500. VAR M, N, K: SIZE;
  5501. BEGIN
  5502. ASSERT( ColsA = RowsB ); M := RowsA; N := ColsB;
  5503. K := ColsA;
  5504. CASE BestMethod( M, N, K ) OF
  5505. | cMatMulScalarProduct:
  5506. RETURN FALSE;
  5507. | cMatMulNaive:
  5508. RETURN MatMulIncRNaive( matrixA, matrixB, matrixC,
  5509. IncA, StrideA, IncB, StrideB,
  5510. IncC, StrideC, RowsA, ColsA,
  5511. RowsB, ColsB );
  5512. | cMatMulTransposed:
  5513. RETURN MatMulARARTransposed( matrixA, matrixB,
  5514. matrixC, IncA,
  5515. StrideA, IncB,
  5516. StrideB, IncC,
  5517. StrideC, RowsA,
  5518. ColsA, RowsB,
  5519. ColsB, TRUE );
  5520. | cMatMulStride:
  5521. RETURN MatMulARARSSEStride( matrixA, matrixB,
  5522. matrixC, IncA, StrideA,
  5523. IncB, StrideB, IncC,
  5524. StrideC, RowsA,
  5525. ColsA, RowsB, ColsB,
  5526. TRUE );
  5527. | cMatMulBlocked:
  5528. RETURN MatMulARARBlocked( matrixA, matrixB,
  5529. matrixC, IncA, StrideA,
  5530. IncB, StrideB, IncC,
  5531. StrideC, RowsA, ColsA,
  5532. RowsB, ColsB, TRUE );
  5533. ELSE
  5534. RETURN FALSE (* use scalar product for each row and column *)
  5535. END;
  5536. END MatMulIncR;
  5537. PROCEDURE MatMulIncX( matrixA, matrixB, matrixC: ADDRESS;
  5538. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5539. VAR M, N, K: SIZE;
  5540. BEGIN
  5541. ASSERT( ColsA = RowsB ); M := RowsA; N := ColsB;
  5542. K := ColsA;
  5543. CASE BestMethod( M, N, K ) OF
  5544. | cMatMulScalarProduct:
  5545. RETURN FALSE;
  5546. | cMatMulNaive:
  5547. RETURN MatMulIncXNaive( matrixA, matrixB, matrixC,
  5548. IncA, StrideA, IncB, StrideB,
  5549. IncC, StrideC, RowsA, ColsA,
  5550. RowsB, ColsB );
  5551. | cMatMulTransposed:
  5552. RETURN MatMulAXAXTransposed( matrixA, matrixB,
  5553. matrixC, IncA,
  5554. StrideA, IncB, StrideB,
  5555. IncC, StrideC, RowsA,
  5556. ColsA, RowsB, ColsB,
  5557. TRUE );
  5558. | cMatMulStride:
  5559. RETURN MatMulAXAXSSEStride( matrixA, matrixB,
  5560. matrixC, IncA, StrideA,
  5561. IncB, StrideB, IncC,
  5562. StrideC, RowsA, ColsA,
  5563. RowsB, ColsB, TRUE );
  5564. | cMatMulBlocked:
  5565. RETURN MatMulAXAXBlocked( matrixA, matrixB,
  5566. matrixC, IncA, StrideA,
  5567. IncB, StrideB, IncC,
  5568. StrideC, RowsA, ColsA,
  5569. RowsB, ColsB, TRUE );
  5570. ELSE
  5571. RETURN FALSE (* use scalar product for each row and column *)
  5572. END;
  5573. END MatMulIncX;
  5574. PROCEDURE MatMulARARBlocked( matrixA, matrixB, matrixC: ADDRESS;
  5575. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  5576. add: BOOLEAN ): BOOLEAN;
  5577. VAR M, N, K, L2M, L2N, L2K: SIZE;
  5578. BEGIN
  5579. ASSERT( ColsA = RowsB ); M := RowsA; N := ColsB;
  5580. K := ColsA; MagicBlockR( M, N, K, L2M, L2N, L2K );
  5581. (*
  5582. MatMulARARNaiive( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB,
  5583. IncC, StrideC, RowsA, ColsB, ColsA );
  5584. *)
  5585. MultiplyR( matrixA, matrixB, matrixC, RowsA, ColsB, ColsA,
  5586. L2M, L2N, L2K, IncA, StrideA, IncB, StrideB, IncC,
  5587. StrideC, add );
  5588. RETURN TRUE;
  5589. END MatMulARARBlocked;
  5590. PROCEDURE MatMulAXAXBlocked( matrixA, matrixB, matrixC: ADDRESS;
  5591. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE;
  5592. add: BOOLEAN ): BOOLEAN;
  5593. VAR M, N, K, L2M, L2N, L2K: SIZE;
  5594. BEGIN
  5595. ASSERT( ColsA = RowsB ); M := RowsA; N := ColsB;
  5596. K := ColsA; MagicBlockX( M, N, K, L2M, L2N, L2K );
  5597. (*
  5598. MatMulARARNaiive( matrixA, matrixB, matrixC, IncA, StrideA, IncB, StrideB,
  5599. IncC, StrideC, RowsA, ColsB, ColsA );
  5600. *)
  5601. MultiplyX( matrixA, matrixB, matrixC, RowsA, ColsB, ColsA,
  5602. L2M, L2N, L2K, IncA, StrideA, IncB, StrideB, IncC,
  5603. StrideC, add );
  5604. RETURN TRUE;
  5605. END MatMulAXAXBlocked;
  5606. PROCEDURE MatMulRNaive( matrixA, matrixB, matrixC: ADDRESS;
  5607. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5608. BEGIN
  5609. MatMulARARNaiive( matrixA, matrixB, matrixC, IncA, StrideA,
  5610. IncB, StrideB, IncC, StrideC, RowsA,
  5611. ColsB, ColsA, FALSE );
  5612. RETURN TRUE;
  5613. END MatMulRNaive;
  5614. PROCEDURE MatMulXNaive( matrixA, matrixB, matrixC: ADDRESS;
  5615. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5616. BEGIN
  5617. MatMulAXAXNaiive( matrixA, matrixB, matrixC, IncA, StrideA,
  5618. IncB, StrideB, IncC, StrideC, RowsA,
  5619. ColsB, ColsA, FALSE );
  5620. RETURN TRUE;
  5621. END MatMulXNaive;
  5622. PROCEDURE MatMulIncRNaive( matrixA, matrixB, matrixC: ADDRESS;
  5623. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5624. BEGIN
  5625. MatMulARARNaiive( matrixA, matrixB, matrixC, IncA, StrideA,
  5626. IncB, StrideB, IncC, StrideC, RowsA,
  5627. ColsB, ColsA, TRUE );
  5628. RETURN TRUE;
  5629. END MatMulIncRNaive;
  5630. PROCEDURE MatMulIncXNaive( matrixA, matrixB, matrixC: ADDRESS;
  5631. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5632. BEGIN
  5633. MatMulAXAXNaiive( matrixA, matrixB, matrixC, IncA, StrideA,
  5634. IncB, StrideB, IncC, StrideC, RowsA,
  5635. ColsB, ColsA, TRUE );
  5636. RETURN TRUE;
  5637. END MatMulIncXNaive;
  5638. PROCEDURE MatMulXTransposed( matrixA, matrixB, matrixC: ADDRESS;
  5639. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5640. BEGIN
  5641. RETURN MatMulAXAXTransposed( matrixA, matrixB, matrixC,
  5642. IncA, StrideA, IncB,
  5643. StrideB, IncC, StrideC,
  5644. RowsA, ColsA, RowsB,
  5645. ColsB, FALSE );
  5646. END MatMulXTransposed;
  5647. PROCEDURE MatMulIncXTransposed( matrixA, matrixB, matrixC: ADDRESS;
  5648. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5649. BEGIN
  5650. RETURN MatMulAXAXTransposed( matrixA, matrixB, matrixC,
  5651. IncA, StrideA, IncB,
  5652. StrideB, IncC, StrideC,
  5653. RowsA, ColsA, RowsB,
  5654. ColsB, TRUE )
  5655. END MatMulIncXTransposed;
  5656. PROCEDURE MatMulRTransposed( matrixA, matrixB, matrixC: ADDRESS;
  5657. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5658. BEGIN
  5659. RETURN MatMulARARTransposed( matrixA, matrixB, matrixC,
  5660. IncA, StrideA, IncB,
  5661. StrideB, IncC, StrideC,
  5662. RowsA, ColsA, RowsB,
  5663. ColsB, FALSE );
  5664. END MatMulRTransposed;
  5665. PROCEDURE MatMulIncRTransposed( matrixA, matrixB, matrixC: ADDRESS;
  5666. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5667. BEGIN
  5668. RETURN MatMulARARTransposed( matrixA, matrixB, matrixC,
  5669. IncA, StrideA, IncB,
  5670. StrideB, IncC, StrideC,
  5671. RowsA, ColsA, RowsB,
  5672. ColsB, TRUE )
  5673. END MatMulIncRTransposed;
  5674. PROCEDURE MatMulXSSEStride( matrixA, matrixB, matrixC: ADDRESS;
  5675. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5676. BEGIN
  5677. RETURN MatMulAXAXSSEStride( matrixA, matrixB, matrixC,
  5678. IncA, StrideA, IncB, StrideB,
  5679. IncC, StrideC, RowsA,
  5680. ColsA, RowsB, ColsB,
  5681. FALSE );
  5682. END MatMulXSSEStride;
  5683. PROCEDURE MatMulIncXSSEStride( matrixA, matrixB, matrixC: ADDRESS;
  5684. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5685. BEGIN
  5686. RETURN MatMulAXAXSSEStride( matrixA, matrixB, matrixC,
  5687. IncA, StrideA, IncB, StrideB,
  5688. IncC, StrideC, RowsA,
  5689. ColsA, RowsB, ColsB,
  5690. TRUE );
  5691. END MatMulIncXSSEStride;
  5692. PROCEDURE MatMulRSSEStride( matrixA, matrixB, matrixC: ADDRESS;
  5693. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5694. BEGIN
  5695. RETURN MatMulARARSSEStride( matrixA, matrixB, matrixC,
  5696. IncA, StrideA, IncB, StrideB,
  5697. IncC, StrideC, RowsA,
  5698. ColsA, RowsB, ColsB,
  5699. FALSE );
  5700. END MatMulRSSEStride;
  5701. PROCEDURE MatMulIncRSSEStride( matrixA, matrixB, matrixC: ADDRESS;
  5702. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5703. BEGIN
  5704. RETURN MatMulARARSSEStride( matrixA, matrixB, matrixC,
  5705. IncA, StrideA, IncB, StrideB,
  5706. IncC, StrideC, RowsA,
  5707. ColsA, RowsB, ColsB,
  5708. TRUE )
  5709. END MatMulIncRSSEStride;
  5710. PROCEDURE MatMulRBlocked( matrixA, matrixB, matrixC: ADDRESS;
  5711. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5712. BEGIN
  5713. RETURN MatMulARARBlocked( matrixA, matrixB, matrixC,
  5714. IncA, StrideA, IncB, StrideB,
  5715. IncC, StrideC, RowsA, ColsA,
  5716. RowsB, ColsB, FALSE )
  5717. END MatMulRBlocked;
  5718. PROCEDURE MatMulIncRBlocked( matrixA, matrixB, matrixC: ADDRESS;
  5719. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5720. BEGIN
  5721. RETURN MatMulARARBlocked( matrixA, matrixB, matrixC,
  5722. IncA, StrideA, IncB, StrideB,
  5723. IncC, StrideC, RowsA, ColsA,
  5724. RowsB, ColsB, TRUE )
  5725. END MatMulIncRBlocked;
  5726. PROCEDURE MatMulXBlocked( matrixA, matrixB, matrixC: ADDRESS;
  5727. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5728. BEGIN
  5729. RETURN MatMulAXAXBlocked( matrixA, matrixB, matrixC,
  5730. IncA, StrideA, IncB, StrideB,
  5731. IncC, StrideC, RowsA, ColsA,
  5732. RowsB, ColsB, FALSE )
  5733. END MatMulXBlocked;
  5734. PROCEDURE MatMulIncXBlocked( matrixA, matrixB, matrixC: ADDRESS;
  5735. IncA, StrideA, IncB, StrideB, IncC, StrideC, RowsA, ColsA, RowsB, ColsB: SIZE ): BOOLEAN;
  5736. BEGIN
  5737. RETURN MatMulAXAXBlocked( matrixA, matrixB, matrixC,
  5738. IncA, StrideA, IncB, StrideB,
  5739. IncC, StrideC, RowsA, ColsA,
  5740. RowsB, ColsB, TRUE )
  5741. END MatMulIncXBlocked;
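(* SetMatMulMethod installs one family of kernels in ArrayBase for all four entry points
   (matMulR, matMulX, matMulIncR, matMulIncX): cMatMulDynamic selects per call via
   BestMethod, cMatMulScalarProduct uninstalls the handlers so that the generic scalar
   product is used, and the remaining constants force the naive, transposed, stride or
   blocked implementation respectively. *)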
  5742. PROCEDURE SetMatMulMethod*( i: LONGINT );
  5743. BEGIN
  5744. KernelLog.String("ArrayBaseOptimized, method = ");
  5745. IF i = cMatMulDynamic THEN
  5746. KernelLog.String("dynamic.");
  5747. ArrayBase.matMulIncR := MatMulIncR;
  5748. ArrayBase.matMulIncX := MatMulIncX;
  5749. ArrayBase.matMulR := MatMulR;
  5750. ArrayBase.matMulX := MatMulX;
  5751. ELSIF i = cMatMulScalarProduct THEN
  5752. KernelLog.String("scalarproduct.");
  5753. ArrayBase.matMulIncR := NIL;
  5754. ArrayBase.matMulIncX := NIL;
  5755. ArrayBase.matMulR := NIL; ArrayBase.matMulX := NIL;
  5756. ELSIF i = cMatMulNaive THEN
5757. KernelLog.String("naive.");
  5758. ArrayBase.matMulR := MatMulRNaive;
  5759. ArrayBase.matMulX := MatMulXNaive;
  5760. ArrayBase.matMulIncR := MatMulIncRNaive;
  5761. ArrayBase.matMulIncX := MatMulIncXNaive;
  5762. ELSIF i = cMatMulTransposed THEN
  5763. KernelLog.String("transposed.");
  5764. ArrayBase.matMulR := MatMulRTransposed;
  5765. ArrayBase.matMulX := MatMulXTransposed; (* KernelLog.String( "transposed" ); KernelLog.Ln; *)
  5766. ArrayBase.matMulIncR := MatMulIncRTransposed;
  5767. ArrayBase.matMulIncX := MatMulIncXTransposed; (* KernelLog.String( "transposed" ); KernelLog.Ln; *)
  5768. ELSIF i = cMatMulStride THEN
  5769. KernelLog.String("stride.");
  5770. ArrayBase.matMulR := MatMulRSSEStride;
  5771. ArrayBase.matMulX := MatMulXSSEStride; (* KernelLog.String( "stride" ); KernelLog.Ln; *)
  5772. ArrayBase.matMulIncR := MatMulIncRSSEStride;
  5773. ArrayBase.matMulIncX := MatMulIncXSSEStride; (* KernelLog.String( "stride" ); KernelLog.Ln; *)
  5774. ELSIF i = cMatMulBlocked THEN
  5775. KernelLog.String("blocked.");
  5776. ArrayBase.matMulR := MatMulRBlocked;
  5777. ArrayBase.matMulX := MatMulXBlocked; (* KernelLog.String( "blocked" ); KernelLog.Ln; *)
  5778. ArrayBase.matMulIncR := MatMulIncRBlocked;
  5779. ArrayBase.matMulIncX := MatMulIncXBlocked; (* KernelLog.String( "blocked" ); KernelLog.Ln; *)
  5780. END;
  5781. KernelLog.Ln;
  5782. END SetMatMulMethod;
  5783. (* optimizations for small arrays (Alexey Morozov) *)
  5784. (* assumes that all arrays do not overlap *)
  5785. (* FIXME: use MOVAPS when Felix fixes problems with alignment!!! *)
  5786. PROCEDURE MatMulR2x2(dadr, ladr, radr: ADDRESS);
  5787. CODE{SYSTEM.i386, SYSTEM.SSE2}
  5788. MOV EBX, [EBP+radr] ; EBX := ADDR(right)
  5789. MOV EAX, [EBP+ladr] ; EAX := ADDR(left)
  5790. MOV ECX, [EBP+dadr] ; ECX := ADDR(dest)
  5791. MOVUPS XMM0, [EAX] ; [a00,a01,a10,a11]
  5792. MOVUPS XMM1, [EBX] ; [b00,b01,b10,b11]
  5793. MOVAPS XMM2, XMM1
  5794. SHUFPS XMM2, XMM1, 204 ; XMM2 := [b00,b11,b00,b11]
  5795. MULPS XMM2, XMM0
  5796. SHUFPS XMM0, XMM0, 177 ; XMM0 := [a01,a00,a11,a10]
  5797. SHUFPS XMM1, XMM1, 102 ; XMM1 := [b10,b01,b10,b01]
  5798. MULPS XMM1, XMM0
  5799. ADDPS XMM1, XMM2
  5800. MOVUPS [ECX], XMM1
  5801. END MatMulR2x2;
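(* The 2x2 kernel above keeps each operand in one XMM register and forms the product with
   two shuffles and two multiplications:
   [c00,c01,c10,c11] = [a00,a01,a10,a11] * [b00,b11,b00,b11] + [a01,a00,a11,a10] * [b10,b01,b10,b01],
   which is exactly c[i,j] = a[i,0]*b[0,j] + a[i,1]*b[1,j] for all four elements at once. *)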
  5802. (* based on weighted sum of rows (Alexey Morozov) *)
  5803. (* FIXME: use MOVAPS when Felix fixes problems with alignment!!! *)
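(* MatMulR3x3 computes each result row as a weighted sum of the rows of B:
   row i of C = a[i,0]*row0(B) + a[i,1]*row1(B) + a[i,2]*row2(B),
   with each a[i,k] broadcast across an XMM register by SHUFPS before the multiply-add. *)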
  5804. PROCEDURE MatMulR3x3(dadr, ladr, radr: ADDRESS);
  5805. CODE{SYSTEM.i386, SYSTEM.SSE2}
  5806. MOV EBX, [EBP+radr] ; EBX := ADDR(right)
  5807. MOV EAX, [EBP+ladr] ; EAX := ADDR(left)
  5808. MOV ECX, [EBP+dadr] ; ECX := ADDR(dest)
  5809. MOVUPS XMM0, [EBX] ; XMM0 := [b00,b01,b02,-]
  5810. MOVUPS XMM1, [EBX+12] ; XMM1 := [b10,b11,b12,-]
5811. ; note: this load reads one REAL past the end of the 3x3 matrix; the superfluous lane is overwritten by the later stores, but the load itself touches memory beyond B
MOVUPS XMM2, [EBX+24] ; XMM2 := [b20,b21,b22,-]
;MOVLPS XMM2, [EBX+24]
;MOVSS XMM3, [EBX+32]
;MOVLHPS XMM2, XMM3
MOVSS XMM3, [EAX]
SHUFPS XMM3, XMM3, 0; XMM3 := [a00,a00,a00,-]
MOVAPS XMM4, XMM0
MULPS XMM4, XMM3
MOVSS XMM3, [EAX+4]
SHUFPS XMM3, XMM3, 0; XMM3 := [a01,a01,a01,-]
MULPS XMM3, XMM1
ADDPS XMM4, XMM3
MOVSS XMM3, [EAX+8]
SHUFPS XMM3, XMM3, 0; XMM3 := [a02,a02,a02,-]
MULPS XMM3, XMM2
ADDPS XMM4, XMM3
MOVUPS [ECX], XMM4
;***************************************************;
MOVSS XMM3, [EAX+12]
SHUFPS XMM3, XMM3, 0; XMM3 := [a10,a10,a10,-]
MOVAPS XMM4, XMM0
MULPS XMM4, XMM3
MOVSS XMM3, [EAX+16]
SHUFPS XMM3, XMM3, 0; XMM3 := [a11,a11,a11,-]
MULPS XMM3, XMM1
ADDPS XMM4, XMM3
MOVSS XMM3, [EAX+20]
SHUFPS XMM3, XMM3, 0; XMM3 := [a12,a12,a12,-]
MULPS XMM3, XMM2
ADDPS XMM4, XMM3
MOVUPS [ECX+12], XMM4
;***************************************************;
MOVSS XMM3, [EAX+24]
SHUFPS XMM3, XMM3, 0; XMM3 := [a20,a20,a20,-]
MOVAPS XMM4, XMM0
MULPS XMM4, XMM3
MOVSS XMM3, [EAX+28]
SHUFPS XMM3, XMM3, 0; XMM3 := [a21,a21,a21,-]
MULPS XMM3, XMM1
ADDPS XMM4, XMM3
MOVSS XMM3, [EAX+32]
SHUFPS XMM3, XMM3, 0; XMM3 := [a22,a22,a22,-]
MULPS XMM3, XMM2
ADDPS XMM4, XMM3
;MOVUPS [ECX+24], XMM4
MOVLPS [ECX+24], XMM4
MOVHLPS XMM4, XMM4
MOVSS [ECX+32], XMM4
END MatMulR3x3;
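(* For reference, a plain-Oberon sketch of the weighted-sum-of-rows scheme used by MatMulR3x3 above. Illustration only: MatMulR3x3Ref is a hypothetical helper that is not installed or used anywhere; it assumes contiguous, row-major 3x3 REAL matrices. Row i of dest is accumulated as left[i,0]*row 0 of right + left[i,1]*row 1 + left[i,2]*row 2. *)
PROCEDURE MatMulR3x3Ref(dadr, ladr, radr: ADDRESS);
VAR i, k: LONGINT; w, b0, b1, b2, d0, d1, d2: REAL;
BEGIN
FOR i := 0 TO 2 DO
d0 := 0; d1 := 0; d2 := 0;
FOR k := 0 TO 2 DO
SYSTEM.GET(ladr + 4*(3*i + k), w); (* weight left[i,k] *)
SYSTEM.GET(radr + 12*k, b0); SYSTEM.GET(radr + 12*k + 4, b1); SYSTEM.GET(radr + 12*k + 8, b2); (* row k of right *)
d0 := d0 + w*b0; d1 := d1 + w*b1; d2 := d2 + w*b2; (* dest row i := dest row i + left[i,k] * right row k *)
END;
SYSTEM.PUT(dadr + 12*i, d0); SYSTEM.PUT(dadr + 12*i + 4, d1); SYSTEM.PUT(dadr + 12*i + 8, d2);
END;
END MatMulR3x3Ref;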
(* based on a 2x2 block decomposition of the 4x4 product (Alexey Morozov) *)
(* FIXME: use MOVAPS when Felix fixes issues with alignment!!! *)
PROCEDURE MatMulR4x4(dadr, ladr, radr: ADDRESS);
CODE{SYSTEM.i386, SYSTEM.SSE2}
MOV EBX, [EBP+radr] ; EBX := ADDR(right)
MOV EAX, [EBP+ladr] ; EAX := ADDR(left)
MOV ECX, [EBP+dadr] ; ECX := ADDR(dest)
; load A00
MOVLPS XMM0, [EAX] ; XMM0 := [a00,a01,-,-]
MOVHPS XMM0, [EAX+16] ; XMM0 := [a00,a01,a10,a11]
; load A01
MOVLPS XMM1, [EAX+8] ; XMM1 := [a02,a03,-,-]
MOVHPS XMM1, [EAX+24] ; XMM1 := [a02,a03,a12,a13]
; load B00
MOVLPS XMM2, [EBX] ; XMM2 := [b00,b01,-,-]
MOVHPS XMM2, [EBX+16] ; XMM2 := [b00,b01,b10,b11]
; load B01
MOVLPS XMM3, [EBX+8] ; XMM3 := [b02,b03,-,-]
MOVHPS XMM3, [EBX+24] ; XMM3 := [b02,b03,b12,b13]
; load B10
MOVLPS XMM4, [EBX+32] ; XMM4 := [b20,b21,-,-]
MOVHPS XMM4, [EBX+48] ; XMM4 := [b20,b21,b30,b31]
; load B11
MOVLPS XMM5, [EBX+40] ; XMM5 := [b22,b23,-,-]
MOVHPS XMM5, [EBX+56] ; XMM5 := [b22,b23,b32,b33]
;****************************************************;
; multiply A00(D)*B00(E) (use MatMulR2x2 code)
MOVAPS XMM6, XMM2
SHUFPS XMM6, XMM6, 204 ; XMM6 := [e00,e11,e00,e11]
MULPS XMM6, XMM0
SHUFPS XMM0, XMM0, 177 ; XMM0 := [d01,d00,d11,d10]
MOVAPS XMM7, XMM2
SHUFPS XMM7, XMM7, 102 ; XMM7 := [e10,e01,e10,e01]
MULPS XMM7, XMM0
ADDPS XMM7, XMM6
; multiply A01(D)*B10(E)
MOVAPS XMM0, XMM4
SHUFPS XMM0, XMM0, 204 ; XMM0 := [e00,e11,e00,e11]
MULPS XMM0, XMM1
SHUFPS XMM1, XMM1, 177 ; XMM1 := [d01,d00,d11,d10]
MOVAPS XMM6, XMM4
SHUFPS XMM6, XMM6, 102 ; XMM6 := [e10,e01,e10,e01]
MULPS XMM6, XMM1
ADDPS XMM6, XMM0
ADDPS XMM7, XMM6
MOVLPS [ECX], XMM7
MOVHPS [ECX+16], XMM7
;****************************************************;
; load A00
MOVLPS XMM0, [EAX] ; XMM0 := [a00,a01,-,-]
MOVHPS XMM0, [EAX+16] ; XMM0 := [a00,a01,a10,a11]
; load A01
MOVLPS XMM1, [EAX+8] ; XMM1 := [a02,a03,-,-]
MOVHPS XMM1, [EAX+24] ; XMM1 := [a02,a03,a12,a13]
; multiply A00(D)*B01(E) (use MatMulR2x2 code)
MOVAPS XMM6, XMM3
SHUFPS XMM6, XMM6, 204 ; XMM6 := [e00,e11,e00,e11]
MULPS XMM6, XMM0
SHUFPS XMM0, XMM0, 177 ; XMM0 := [d01,d00,d11,d10]
MOVAPS XMM7, XMM3
SHUFPS XMM7, XMM7, 102 ; XMM7 := [e10,e01,e10,e01]
MULPS XMM7, XMM0
ADDPS XMM7, XMM6
; multiply A01(D)*B11(E)
MOVAPS XMM0, XMM5
SHUFPS XMM0, XMM0, 204 ; XMM0 := [e00,e11,e00,e11]
MULPS XMM0, XMM1
SHUFPS XMM1, XMM1, 177 ; XMM1 := [d01,d00,d11,d10]
MOVAPS XMM6, XMM5
SHUFPS XMM6, XMM6, 102 ; XMM6 := [e10,e01,e10,e01]
MULPS XMM6, XMM1
ADDPS XMM6, XMM0
ADDPS XMM7, XMM6
MOVLPS [ECX+8], XMM7
MOVHPS [ECX+24], XMM7
;****************************************************;
; load A10
MOVLPS XMM0, [EAX+32] ; XMM0 := [a20,a21,-,-]
MOVHPS XMM0, [EAX+48] ; XMM0 := [a20,a21,a30,a31]
; load A11
MOVLPS XMM1, [EAX+40] ; XMM1 := [a22,a23,-,-]
MOVHPS XMM1, [EAX+56] ; XMM1 := [a22,a23,a32,a33]
; multiply A10(D)*B00(E) (use MatMulR2x2 code)
MOVAPS XMM6, XMM2
SHUFPS XMM6, XMM6, 204 ; XMM6 := [e00,e11,e00,e11]
MULPS XMM6, XMM0
SHUFPS XMM0, XMM0, 177 ; XMM0 := [d01,d00,d11,d10]
MOVAPS XMM7, XMM2
SHUFPS XMM7, XMM7, 102 ; XMM7 := [e10,e01,e10,e01]
MULPS XMM7, XMM0
ADDPS XMM7, XMM6
; multiply A11(D)*B10(E)
MOVAPS XMM0, XMM4
SHUFPS XMM0, XMM0, 204 ; XMM0 := [e00,e11,e00,e11]
MULPS XMM0, XMM1
SHUFPS XMM1, XMM1, 177 ; XMM1 := [d01,d00,d11,d10]
MOVAPS XMM6, XMM4
SHUFPS XMM6, XMM6, 102 ; XMM6 := [e10,e01,e10,e01]
MULPS XMM6, XMM1
ADDPS XMM6, XMM0
ADDPS XMM7, XMM6
MOVLPS [ECX+32], XMM7
MOVHPS [ECX+48], XMM7
;****************************************************;
; load A10
MOVLPS XMM0, [EAX+32] ; XMM0 := [a20,a21,-,-]
MOVHPS XMM0, [EAX+48] ; XMM0 := [a20,a21,a30,a31]
; load A11
MOVLPS XMM1, [EAX+40] ; XMM1 := [a22,a23,-,-]
MOVHPS XMM1, [EAX+56] ; XMM1 := [a22,a23,a32,a33]
; multiply A10(D)*B01(E) (use MatMulR2x2 code)
MOVAPS XMM6, XMM3
SHUFPS XMM6, XMM6, 204 ; XMM6 := [e00,e11,e00,e11]
MULPS XMM6, XMM0
SHUFPS XMM0, XMM0, 177 ; XMM0 := [d01,d00,d11,d10]
MOVAPS XMM7, XMM3
SHUFPS XMM7, XMM7, 102 ; XMM7 := [e10,e01,e10,e01]
MULPS XMM7, XMM0
ADDPS XMM7, XMM6
; multiply A11(D)*B11(E)
MOVAPS XMM0, XMM5
SHUFPS XMM0, XMM0, 204 ; XMM0 := [e00,e11,e00,e11]
MULPS XMM0, XMM1
SHUFPS XMM1, XMM1, 177 ; XMM1 := [d01,d00,d11,d10]
MOVAPS XMM6, XMM5
SHUFPS XMM6, XMM6, 102 ; XMM6 := [e10,e01,e10,e01]
MULPS XMM6, XMM1
ADDPS XMM6, XMM0
ADDPS XMM7, XMM6
MOVLPS [ECX+40], XMM7
MOVHPS [ECX+56], XMM7
END MatMulR4x4;
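(* For reference (illustration only): MatMulR4x4 above computes the 4x4 product from 2x2 sub-blocks, each block product using the same shuffle scheme as MatMulR2x2:
C00 := A00*B00 + A01*B10;  C01 := A00*B01 + A01*B11;
C10 := A10*B00 + A11*B10;  C11 := A10*B01 + A11*B11;
where Xij denotes the 2x2 block in block-row i and block-column j of X. *)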
(* FIXME: use MOVAPS when Felix fixes issues with alignment!!! *)
(* FIXME: speed it up when horizontal add is available!!! *)
PROCEDURE MatVecMulR2x2(dadr, ladr, radr: ADDRESS);
CODE{SYSTEM.i386, SYSTEM.SSE2}
MOV EBX, [EBP+radr] ; EBX := ADDR(right)
MOV EAX, [EBP+ladr] ; EAX := ADDR(left)
MOV ECX, [EBP+dadr] ; ECX := ADDR(dest)
; load the whole matrix
MOVUPS XMM0, [EAX] ; XMM0 := [a00,a01,a10,a11]
MOVLPS XMM1, [EBX] ; XMM1 := [b00,b10,-,-]
MOVLHPS XMM1, XMM1 ; XMM1 := [b00,b10,b00,b10]
MULPS XMM0, XMM1 ; XMM0 := [a00*b00,a01*b10,a10*b00,a11*b10]
MOVAPS XMM1, XMM0
SHUFPS XMM0, XMM0, 8; XMM0 := [a00*b00,a10*b00,-,-]
SHUFPS XMM1, XMM1, 13; XMM1 := [a01*b10,a11*b10,-,-]
ADDPS XMM0, XMM1
MOVLPS [ECX], XMM0
END MatVecMulR2x2;
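(* For reference, a plain-Oberon sketch of the 2x2 matrix-vector product computed by MatVecMulR2x2 above. Illustration only: MatVecMulR2x2Ref is a hypothetical helper that is not installed or used anywhere; it assumes a contiguous, row-major 2x2 REAL matrix and a 2-element REAL vector. *)
PROCEDURE MatVecMulR2x2Ref(dadr, ladr, radr: ADDRESS);
VAR a00, a01, a10, a11, b0, b1, d0, d1: REAL;
BEGIN
SYSTEM.GET(ladr, a00); SYSTEM.GET(ladr + 4, a01); SYSTEM.GET(ladr + 8, a10); SYSTEM.GET(ladr + 12, a11);
SYSTEM.GET(radr, b0); SYSTEM.GET(radr + 4, b1);
d0 := a00 * b0 + a01 * b1; d1 := a10 * b0 + a11 * b1; (* dest := matrix * vector *)
SYSTEM.PUT(dadr, d0); SYSTEM.PUT(dadr + 4, d1);
END MatVecMulR2x2Ref;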
(* PH *)
(* to do: use MOVAPS when Felix fixes issues with alignment *)
PROCEDURE MatVecMulR4x4(dadr, ladr, radr: ADDRESS);
CODE{SYSTEM.i386, SYSTEM.SSE3}
MOV EBX, [EBP+radr] ; EBX := ADDR(right)
MOV EAX, [EBP+ladr] ; EAX := ADDR(left)
MOV ECX, [EBP+dadr] ; ECX := ADDR(dest)
MOVUPS XMM0, [EBX] ; XMM0 := [b0,b1,b2,b3]
MOVUPS XMM1, [EAX] ; XMM1 := [a00,a01,a02,a03]
MOVUPS XMM2, [EAX+16] ; XMM2 := [a10,a11,a12,a13]
MOVUPS XMM3, [EAX+32] ; XMM3 := [a20,a21,a22,a23]
MOVUPS XMM4, [EAX+48] ; XMM4 := [a30,a31,a32,a33]
MULPS XMM1, XMM0
MULPS XMM2, XMM0
HADDPS XMM1, XMM2 ; adjacent pairs are horizontally added
MULPS XMM3, XMM0
MULPS XMM4, XMM0
HADDPS XMM3, XMM4 ; adjacent pairs are horizontally added
HADDPS XMM1, XMM3 ; adjacent pairs are horizontally added
MOVUPS [ECX], XMM1
END MatVecMulR4x4;
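(* For reference, a plain-Oberon sketch of the 4x4 matrix-vector product computed by MatVecMulR4x4 above. Illustration only: MatVecMulR4x4Ref is a hypothetical helper that is not installed or used anywhere; it assumes a contiguous, row-major 4x4 REAL matrix and a 4-element REAL vector. Each dest element is the dot product of one matrix row with the vector; the SSE3 code obtains these sums with HADDPS. *)
PROCEDURE MatVecMulR4x4Ref(dadr, ladr, radr: ADDRESS);
VAR i, j: LONGINT; aij, bj, sum: REAL;
BEGIN
FOR i := 0 TO 3 DO
sum := 0;
FOR j := 0 TO 3 DO
SYSTEM.GET(ladr + 4*(4*i + j), aij); SYSTEM.GET(radr + 4*j, bj);
sum := sum + aij * bj;
END;
SYSTEM.PUT(dadr + 4*i, sum);
END;
END MatVecMulR4x4Ref;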
PROCEDURE InstallMatMul*(context: Commands.Context);
VAR type: LONGINT; string: ARRAY 32 OF CHAR;
BEGIN
context.arg.String(string);
IF string = "dynamic" THEN
type := cMatMulDynamic;
ELSIF string = "scalarproduct" THEN
type := cMatMulScalarProduct
ELSIF string = "naive" THEN
type := cMatMulNaive
ELSIF string = "transposed" THEN
type := cMatMulTransposed
ELSIF string = "stride" THEN
type := cMatMulStride
ELSIF string = "blocked" THEN
type := cMatMulBlocked
ELSE
KernelLog.String("unknown method: "); KernelLog.String(string); KernelLog.Ln;
type := cMatMulDynamic;
END;
SetMatMulMethod( type );
END InstallMatMul;
PROCEDURE InstallAsm*;
BEGIN
KernelLog.String( "ASM " );
ArrayBase.loopSPAXAX := SPAXAXLoopA;
ArrayBase.loopSPARAR := SPARARLoopA;
ArrayBase.loopAddAXAX := AddAXAXLoopA;
ArrayBase.loopAddARAR := AddARARLoopA;
ArrayBase.loopMatMulAXAX := MatMulAXAXLoopA;
ArrayBase.loopMatMulARAR := MatMulARARLoopA;
ArrayBase.loopMulAXSX := MulAXSXLoopA;
ArrayBase.loopIncMulAXSX := IncMulAXSXLoopA;
ArrayBase.loopMulARSR := MulARSRLoopA;
ArrayBase.loopIncMulARSR := IncMulARSRLoopA;
ArrayBase.loopMatMulIncAXAX := MatMulIncAXAXLoopA;
ArrayBase.loopMatMulIncARAR := MatMulIncARARLoopA;
ArrayBase.transpose4 := Transpose4;
ArrayBase.transpose8 := Transpose8;
END InstallAsm;
PROCEDURE InstallSSE*;
BEGIN
IF Machine.SSESupport THEN
KernelLog.String( "SSE " );
ArrayBase.loopSPARAR := SPARARLoopSSE;
ArrayBase.loopAddARAR := AddARARLoopSSE;
ArrayBase.loopMulARSR := MulARSRLoopSSE;
ArrayBase.loopIncMulARSR := IncMulARSRLoopSSE;
ArrayBase.loopMatMulARAR := MatMulARARLoopSSE;
ArrayBase.matMulR := MatMulR;
ArrayBase.loopMatMulIncARAR := MatMulIncARARLoopSSE;
ArrayBase.matMulIncR := MatMulIncR;
(* optimizations for small matrices (Alexey Morozov) *)
ArrayBase.matMulR2x2 := MatMulR2x2;
ArrayBase.matMulR3x3 := MatMulR3x3;
ArrayBase.matMulR4x4 := MatMulR4x4;
ArrayBase.matVecMulR2x2 := MatVecMulR2x2;
END;
END InstallSSE;
PROCEDURE InstallSSE2*; (* extra for testing, will be merged with Install in later versions *)
BEGIN
IF Machine.SSE2Support THEN
KernelLog.String( "SSE2 " );
ArrayBase.loopSPAXAX := SPAXAXLoopSSE;
ArrayBase.loopAddAXAX := AddAXAXLoopSSE;
ArrayBase.loopMulAXSX := MulAXSXLoopSSE;
ArrayBase.loopIncMulAXSX := IncMulAXSXLoopSSE;
ArrayBase.loopMatMulAXAX := MatMulAXAXLoopSSE;
ArrayBase.matMulX := MatMulX;
ArrayBase.loopMatMulIncAXAX := MatMulIncAXAXLoopSSE;
ArrayBase.matMulIncX := MatMulIncX;
END;
END InstallSSE2;
(*! to do: currently this works only for Win, not for native, because SSE3Support is not yet implemented in I386.Machine.Mod *)
PROCEDURE InstallSSE3*; (* extra for testing, will be merged with Install in later versions *)
BEGIN
IF Machine.SSE3Support THEN
KernelLog.String( "SSE3 " );
(* optimizations for small matrices *)
ArrayBase.matVecMulR4x4 := MatVecMulR4x4;
END;
END InstallSSE3;
PROCEDURE Install*;
BEGIN
KernelLog.String( "ArrayBaseOptimized: installing runtime library optimizations:" );
InstallAsm; InstallSSE; InstallSSE2; InstallSSE3;
KernelLog.String( " done." ); KernelLog.Ln;
END Install;
PROCEDURE SetParameters*( context: Commands.Context );
BEGIN
context.arg.SkipWhitespace; context.arg.Int(cBlockSize,TRUE);
context.arg.SkipWhitespace; context.arg.Int(nrProcesses,TRUE);
IF nrProcesses > maxProcesses THEN
nrProcesses := maxProcesses
ELSIF nrProcesses = 0 THEN nrProcesses := Machine.NumberOfProcessors();
END;
KernelLog.String( "BlockSize=" ); KernelLog.Int( cBlockSize, 0 );
KernelLog.String( ", NrProcesses = " ); KernelLog.Int( nrProcesses, 0 ); KernelLog.Ln;
END SetParameters;
BEGIN
alignedC := 0; unalignedC := 0; singleC := 0;
matAllocTime := 0; matCompTime := 0;
cBlockSize := 0; (* automatic *)
nrProcesses := Machine.NumberOfProcessors(); (* automatic *)
allocT := 0; copyT := 0; compT := 0;
NEW( cachePool );
END FoxArrayBaseOptimized.
SystemTools.Free ArrayBaseOptimized ~
ArrayBaseOptimized.Install ~
ArrayBaseOptimized.InstallSSE2 ~
ArrayBaseOptimized.InstallSSE ~
ArrayBaseOptimized.InstallAsm ~
ArrayBaseOptimized.InstallMatMul dynamic ~
ArrayBaseOptimized.InstallMatMul scalarproduct ~
ArrayBaseOptimized.InstallMatMul transposed ~
ArrayBaseOptimized.InstallMatMul naive ~
ArrayBaseOptimized.InstallMatMul stride ~
ArrayBaseOptimized.InstallMatMul blocked ~
ArrayBaseOptimized.SetParameters 0 1 ~ (* BlockSize, NrProcesses *)