// =================================================================== // $Id: $ // // ktbftrav.cpp // // class: CKTBTraversal // // REPLACEMENT_STRING // // Copyright by Vlastimil Havran, 2007 - email to "vhavran AT seznam.cz" // Initial coding by Vlasta Havran, February 2007 (copy from kdrtrav.cpp) // GOLEM headers #include "ktbconf.h" #include "ktbtrav.h" #include "Intersectable.h" namespace GtpVisibilityPreprocessor { #ifdef TRV00F #ifdef _USE_HAVRAN_SSE #ifdef __SSE__ // Even faster - about 125,500 rays per second for single dir and 164,000 rps // for double dir ! int CKTBTraversal::FindNearestI_16oneDir(SimpleRayContainer &rays, int offset, int copyOffset) { static RayPacket2x2 raypack; struct SResultI { Intersectable *intersectable; float tdist; }; static SResultI results[16]; for (int i = 0; i < 4; i++) { int k = i * 4 + offset; for (int j = 0; j < 4; j++, k++) { raypack.SetLoc(j, rays[k].mOrigin); raypack.SetDir(j, rays[k].mDirection); } // Here either use ray packet traversal or // casting individual rays FindNearestI(raypack); k = i * 4; for (int j = 0; j < 4; j++, k++) { results[k].intersectable = raypack.GetObject(j); results[k].tdist = raypack.GetT(j); } // for j } // for i // Copy the results to the output array for (int i = 0; i < 16; i++) { SimpleRay::IntersectionRes[i + copyOffset].intersectable = results[i].intersectable; SimpleRay::IntersectionRes[i + copyOffset].tdist = results[i].tdist; } // for i return 0; } #if 0 // This code works well 1/1/2008 - 11:00 // The same operations for packets of rays for the same signs, // otherwise it is emulated by decomposition // of a packet to individual rays and traced individually. void CKTBTraversal::FindNearestI(RayPacket2x2 &rp) { int m1 = _mm_movemask_ps(rp.dx4); if ((m1 == 0)||(m1 == 15)) { m1 = _mm_movemask_ps(rp.dy4); if ((m1 == 0)||(m1 == 15)) { m1 = _mm_movemask_ps(rp.dz4); if ((m1 == 0)||(m1 == 15)) { rp.Init(); // all the signs for 4 rays are the same, use // ray packet traversal // Compute min and max distances GALIGN16 union { float tmin4[4]; __m128 tmin_4; }; GALIGN16 union { float tmax4[4]; __m128 tmax_4; }; GALIGN16 int inters[4]; inters[0] = inters[1] = inters[2] = inters[3] = 1; SimpleRay sray[4]; int maxIntersections = 4; unsigned int inters32 = 0xf; for (int i = 0; i < 4; i++) { SimpleRay::IntersectionRes[i].intersectable = 0; rp.SetObject(i, 0); bbox.ComputeMinMaxT(rp.GetLoc(i), rp.GetDir(i), &(tmin4[i]), &(tmax4[i])); if ( (tmin4[i] >= tmax4[i]) || (tmax4[i] < 0.f) ) { inters[i] = 0; // finished inters32 &= ~(1 << i); // bit zero when ray is invalid maxIntersections--; } if (tmin4[i] < 0.f) tmin4[i] = 0.f; sray[i].mOrigin = rp.GetLoc(i); sray[i].mDirection = rp.GetDir(i); } // for i if (maxIntersections == 0) return; SKTBNodeT * childNodes[2]; int RayDirs[3]; RayDirs[0] = (rp.dx[0] > 0.f) ? 1 : 0; RayDirs[1] = (rp.dy[0] > 0.f) ? 1 : 0; RayDirs[2] = (rp.dz[0] > 0.f) ? 1 : 0; //int activeMask=_mm_movemask_ps(_mm_cmplt_ps( tmin_4, tmax_4 ))&inters32; int activeMask = inters32; int indexStack = 0; SKTBNodeT *currNode = root; unsigned int k = GetNodeType(currNode); for (;;) { while (k < CKTBAxes::EE_Leaf) { // the 3 operations below can be brought down to 3 simple float // calculations by precomputing min/max of the inverse dir const __m128 node_split = _mm_set_ps1(GetSplitValue(currNode)); const __m128 t4 = _mm_mul_ps(_mm_sub_ps(node_split, rp.orig[k]), rp.idir[k]); childNodes[0] = GetLeft(currNode); childNodes[1] = GetRight(currNode); int rayDir = RayDirs[k]; SKTBNodeT *far = childNodes[rayDir]; if (!(_mm_movemask_ps(_mm_cmpgt_ps(t4, tmin_4)) & activeMask)) { currNode = far; k = GetNodeType(currNode); continue; } currNode = childNodes[rayDir ^ 0x1]; // this is near node k = GetNodeType(currNode); if (!(_mm_movemask_ps(_mm_cmplt_ps( t4, tmax_4)) & activeMask)) continue; // pop far node to the stack stack4[indexStack].nodep = far; stack4[indexStack].tmax_4 = tmax_4; stack4[indexStack].tmin_4 = _mm_max_ps(t4, tmin_4); // stack4[indexStack].mask = activeMask; indexStack++; tmax_4 = _mm_min_ps(t4, tmax_4); activeMask &= _mm_movemask_ps(_mm_cmplt_ps( tmin_4, tmax_4 )); } // while this is an interior node // either a leaf or a link if (k == CKTBAxes::EE_Leaf) { // test objects for intersection if (!IsEmptyLeaf_(currNode)) { // cout << "Full leaf" << endl; // test the objects in the full leaf against the ray for (int i = 0; i < 4; i++) { if (inters[i] ) { // no intersection so far ! SimpleRay::IntersectionRes[i].maxt = tmax4[i] + Limits::Small; // Test only rays that were not finished if (TestFullLeaf(sray[i], currNode, i)) { // intersection for this ray found inters[i] = 0; inters32 &= ~(1 << i); rp.SetT(i, SimpleRay::IntersectionRes[i].maxt); rp.SetObject(i, SimpleRay::IntersectionRes[i].intersectable); // signed distance should be already set in TestFullLeaf // the first object intersected was found if (--maxIntersections == 0) return; } } // if this ray did not hit the triangle so far } // for all 4 rays } // full leaf // pop farChild from the stack // restore the current values // update the minimum distance since we traverse to the next one if (indexStack == 0) return; indexStack--; currNode = stack4[indexStack].nodep; k = GetNodeType(currNode); tmin_4 = stack4[indexStack].tmin_4; tmax_4 = stack4[indexStack].tmax_4; activeMask = _mm_movemask_ps(_mm_cmple_ps( tmin_4, tmax_4 )) & inters32; continue; } // if leaf // cout << "Link node was accessed" << endl; assert(k == CKTBAxes::EE_Link); currNode = GetLinkNode(currNode); k = GetNodeType(currNode); } // for return; }}} // Trace ray by ray SimpleRay ray; for (int i = 0; i < 4; i++) { ray.mOrigin = rp.GetLoc(i); ray.mDirection = rp.GetDir(i); FindNearestI(ray); rp.SetObject(i, SimpleRay::IntersectionRes[0].intersectable); rp.SetT(i, SimpleRay::IntersectionRes[0].tdist); // SimpleRay::IntersectionRes[0].intersectable->GetNormal(0); } // for return; } #endif #if 1 // This code also works well 1/1/2008 - 14:00 // Using mask of 128-bits width - the code works as well, only a bit // faster than the code above void CKTBTraversal::FindNearestI(RayPacket2x2 &rp) { int m1 = _mm_movemask_ps(rp.dx4); if ((m1 == 0)||(m1 == 15)) { m1 = _mm_movemask_ps(rp.dy4); if ((m1 == 0)||(m1 == 15)) { m1 = _mm_movemask_ps(rp.dz4); if ((m1 == 0)||(m1 == 15)) { rp.Init(); // all the signs for 4 rays are the same, use // ray packet traversal // Compute min and max distances GALIGN16 union { float tmin4[4]; __m128 tmin_4; }; GALIGN16 union { float tmax4[4]; __m128 tmax_4; }; GALIGN16 union { unsigned int activeMask[4]; __m128 activeMask_4; }; GALIGN16 union { unsigned int liveMask[4]; __m128 liveMask_4; }; liveMask[0] = liveMask[1] = liveMask[2] = liveMask[3] = 0xffffffff; GALIGN16 SimpleRay sray[4]; int maxIntersections = 4; // unsigned int inters32 = 0xf; for (int i = 0; i < 4; i++) { SimpleRay::IntersectionRes[i].intersectable = 0; rp.SetObject(i, 0); bbox.ComputeMinMaxT(rp.GetLoc(i), rp.GetDir(i), &(tmin4[i]), &(tmax4[i])); if ( (tmin4[i] >= tmax4[i]) || (tmax4[i] < 0.f) ) { liveMask[i] = 0; // finished // inters32 &= ~(1 << i); // bit zero when ray is invalid maxIntersections--; } if (tmin4[i] < 0.f) tmin4[i] = 0.f; sray[i].mOrigin = rp.GetLoc(i); sray[i].mDirection = rp.GetDir(i); } // for i if (maxIntersections == 0) return; // This is the mask 128 bits witdth //activeMask_4 = // _mm_and_ps(_mm_cmple_ps(tmin_4, tmax_4), // _mm_cmplt_ps(tmax_4, _mm_setzero_ps())); activeMask_4 = liveMask_4; SKTBNodeT * childNodes[2]; int RayDirs[4]; RayDirs[0] = (rp.dx[0] > 0.f) ? 1 : 0; RayDirs[1] = (rp.dy[0] > 0.f) ? 1 : 0; RayDirs[2] = (rp.dz[0] > 0.f) ? 1 : 0; int indexStack = 0; SKTBNodeT *currNode = root; unsigned int k = GetNodeType(currNode); for (;;) { // traverse until we find a leaf while (k < CKTBAxes::EE_Leaf) { // the 3 operations below can be brought down to 3 simple float // calculations by precomputing min/max of the inverse dir // const __m128 node_split = ; const __m128 t4 = _mm_mul_ps(_mm_sub_ps(_mm_set_ps1(GetSplitValue(currNode)), rp.orig[k]), rp.idir[k]); childNodes[0] = GetLeft(currNode); childNodes[1] = GetRight(currNode); int rayDir = RayDirs[k]; SKTBNodeT *far = childNodes[rayDir]; if (!_mm_movemask_ps(_mm_and_ps(_mm_cmpge_ps(t4, tmin_4), activeMask_4))) { currNode = far; k = GetNodeType(currNode); continue; } currNode = childNodes[rayDir ^ 0x1]; // this is near node k = GetNodeType(currNode); if (!_mm_movemask_ps(_mm_and_ps(_mm_cmple_ps(t4, tmax_4), activeMask_4))) continue; // pop far node to the stack stack4[indexStack].nodep = far; stack4[indexStack].tmax_4 = tmax_4; // Uncomenting this macro is unsafe! // Not convinced if for packet of 4 rays we can say that since when // one ray is different than the others, it could bring to wrong state // It is surely true for one ray when tmin < t < tmax, but for a packet // of rays this condition can be true only for a single ray // tmin4 = max(t4, tmin4) = min(t4, tmax4) //#define _NOT_STORE_MINT #ifdef _NOT_STORE_MINT #else // store mint onto the stack stack4[indexStack].tmin_4 = _mm_max_ps(t4, tmin_4); #endif // stack4[indexStack].mask = activeMask; indexStack++; tmax_4 = _mm_min_ps(t4, tmax_4); activeMask_4 = _mm_cmplt_ps( tmin_4, tmax_4 ); } // while this is an interior node // either a leaf or a link if (k == CKTBAxes::EE_Leaf) { // test objects for intersection if (!IsEmptyLeaf_(currNode)) { // cout << "Full leaf" << endl; // test the objects in the full leaf against the ray for (int i = 0; i < 4; i++) { if (liveMask[i] ) { // no intersection so far ! SimpleRay::IntersectionRes[i].maxt = tmax4[i] + Limits::Small; #if 0 // Using subroutine // Test only rays that were not finished if (TestFullLeaf(sray[i], currNode, i)) #else // avoiding one call const ObjectContainer * const list = GetObjList(currNode); int intersected = 0; // iterate the whole list and find out the nearest intersection ObjectContainer::const_iterator sc_end = list->end(); for (ObjectContainer::const_iterator sc = list->begin(); sc != sc_end; sc++) { // if the intersection realy lies in the node intersected |= ((*sc)->CastSimpleRay(sray[i], i)); } // for all objects if (intersected) #endif { // object was intersected rp.SetT(i, SimpleRay::IntersectionRes[i].maxt); rp.SetObject(i, SimpleRay::IntersectionRes[i].intersectable); // signed distance should be already set in TestFullLeaf // the first object intersected was found if (--maxIntersections == 0) return; // inters32 &= ~(1 << i); liveMask[i] = 0; } } // if this ray did not hit the triangle so far } // for all 4 rays } // full leaf // pop farChild from the stack // restore the current values // update the minimum distance since we traverse to the next one do { if (indexStack == 0) return; indexStack--; currNode = stack4[indexStack].nodep; k = GetNodeType(currNode); #ifdef _NOT_STORE_MINT // this is an attempt ! tmin_4 = tmax_4; #else // This surrely works tmin_4 = stack4[indexStack].tmin_4; #endif tmax_4 = stack4[indexStack].tmax_4; activeMask_4 = _mm_and_ps(_mm_cmple_ps( tmin_4, tmax_4 ), liveMask_4); } while (_mm_movemask_ps(activeMask_4) == 0); } else { // cout << "Link node was accessed" << endl; assert(k == CKTBAxes::EE_Link); currNode = GetLinkNode(currNode); k = GetNodeType(currNode); } } // for(;;) return; }}} // Trace ray by ray SimpleRay ray; for (int i = 0; i < 4; i++) { ray.mOrigin = rp.GetLoc(i); ray.mDirection = rp.GetDir(i); FindNearestI(ray); rp.SetObject(i, SimpleRay::IntersectionRes[0].intersectable); rp.SetT(i, SimpleRay::IntersectionRes[0].tdist); // SimpleRay::IntersectionRes[0].intersectable->GetNormal(0); } // for return; } #endif // This code allows to specify the box where the ray should be traversed! void CKTBTraversal::FindNearestI(RayPacket2x2 &rp, Vector3 &boxmin, Vector3 &boxmax) { static AxisAlignedBox3 localbox; localbox.SetMin(boxmin); localbox.SetMax(boxmax); int m1 = _mm_movemask_ps(rp.dx4); if ((m1 == 0)||(m1 == 15)) { m1 = _mm_movemask_ps(rp.dy4); if ((m1 == 0)||(m1 == 15)) { m1 = _mm_movemask_ps(rp.dz4); if ((m1 == 0)||(m1 == 15)) { rp.Init(); // all the signs for 4 rays are the same, use // ray packet traversal // Compute min and max distances GALIGN16 union { float tmin4[4]; __m128 tmin_4; }; GALIGN16 union { float tmax4[4]; __m128 tmax_4; }; GALIGN16 union { unsigned int activeMask[4]; __m128 activeMask_4; }; GALIGN16 union { unsigned int liveMask[4]; __m128 liveMask_4; }; liveMask[0] = liveMask[1] = liveMask[2] = liveMask[3] = 0xffffffff; GALIGN16 SimpleRay sray[4]; int maxIntersections = 4; // unsigned int inters32 = 0xf; for (int i = 0; i < 4; i++) { SimpleRay::IntersectionRes[i].intersectable = 0; rp.SetObject(i, 0); localbox.ComputeMinMaxT(rp.GetLoc(i), rp.GetDir(i), &(tmin4[i]), &(tmax4[i])); if ( (tmin4[i] >= tmax4[i]) || (tmax4[i] < 0.f) ) { liveMask[i] = 0; // finished // inters32 &= ~(1 << i); // bit zero when ray is invalid maxIntersections--; } if (tmin4[i] < 0.f) tmin4[i] = 0.f; sray[i].mOrigin = rp.GetLoc(i); sray[i].mDirection = rp.GetDir(i); } // for i if (maxIntersections == 0) return; // This is the mask 128 bits witdth //activeMask_4 = // _mm_and_ps(_mm_cmple_ps(tmin_4, tmax_4), // _mm_cmplt_ps(tmax_4, _mm_setzero_ps())); activeMask_4 = liveMask_4; SKTBNodeT * childNodes[2]; int RayDirs[4]; RayDirs[0] = (rp.dx[0] > 0.f) ? 1 : 0; RayDirs[1] = (rp.dy[0] > 0.f) ? 1 : 0; RayDirs[2] = (rp.dz[0] > 0.f) ? 1 : 0; int indexStack = 0; SKTBNodeT *currNode = root; unsigned int k = GetNodeType(currNode); for (;;) { // traverse until we find a leaf while (k < CKTBAxes::EE_Leaf) { // the 3 operations below can be brought down to 3 simple float // calculations by precomputing min/max of the inverse dir // const __m128 node_split = ; const __m128 t4 = _mm_mul_ps(_mm_sub_ps(_mm_set_ps1(GetSplitValue(currNode)), rp.orig[k]), rp.idir[k]); childNodes[0] = GetLeft(currNode); childNodes[1] = GetRight(currNode); int rayDir = RayDirs[k]; SKTBNodeT *far = childNodes[rayDir]; if (!_mm_movemask_ps(_mm_and_ps(_mm_cmpge_ps(t4, tmin_4), activeMask_4))) { currNode = far; k = GetNodeType(currNode); continue; } currNode = childNodes[rayDir ^ 0x1]; // this is near node k = GetNodeType(currNode); if (!_mm_movemask_ps(_mm_and_ps(_mm_cmple_ps(t4, tmax_4), activeMask_4))) continue; // pop far node to the stack stack4[indexStack].nodep = far; stack4[indexStack].tmax_4 = tmax_4; // Uncomenting this macro is unsafe! // Not convinced if for packet of 4 rays we can say that since when // one ray is different than the others, it could bring to wrong state // It is surely true for one ray when tmin < t < tmax, but for a packet // of rays this condition can be true only for a single ray // tmin4 = max(t4, tmin4) = min(t4, tmax4) #undef _NOT_STORE_MINT //#define _NOT_STORE_MINT #ifdef _NOT_STORE_MINT #else // store mint onto the stack stack4[indexStack].tmin_4 = _mm_max_ps(t4, tmin_4); #endif // stack4[indexStack].mask = activeMask; indexStack++; tmax_4 = _mm_min_ps(t4, tmax_4); activeMask_4 = _mm_cmplt_ps( tmin_4, tmax_4 ); } // while this is an interior node // either a leaf or a link if (k == CKTBAxes::EE_Leaf) { // test objects for intersection if (!IsEmptyLeaf_(currNode)) { // cout << "Full leaf" << endl; // test the objects in the full leaf against the ray for (int i = 0; i < 4; i++) { if (liveMask[i] ) { // no intersection so far ! SimpleRay::IntersectionRes[i].maxt = tmax4[i] + Limits::Small; #if 0 // Using subroutine // Test only rays that were not finished if (TestFullLeaf(sray[i], currNode, i)) #else // avoiding one call const ObjectContainer * const list = GetObjList(currNode); int intersected = 0; // iterate the whole list and find out the nearest intersection ObjectContainer::const_iterator sc_end = list->end(); for (ObjectContainer::const_iterator sc = list->begin(); sc != sc_end; sc++) { // if the intersection realy lies in the node intersected |= ((*sc)->CastSimpleRay(sray[i], i)); } // for all objects if (intersected) #endif { rp.SetT(i, SimpleRay::IntersectionRes[i].maxt); rp.SetObject(i, SimpleRay::IntersectionRes[i].intersectable); // signed distance should be already set in TestFullLeaf // the first object intersected was found if (--maxIntersections == 0) return; // inters32 &= ~(1 << i); liveMask[i] = 0; } } // if this ray did not hit the triangle so far } // for all 4 rays } // full leaf // pop farChild from the stack // restore the current values // update the minimum distance since we traverse to the next one do { if (indexStack == 0) return; indexStack--; currNode = stack4[indexStack].nodep; k = GetNodeType(currNode); #ifdef _NOT_STORE_MINT // this is an attempt ! tmin_4 = tmax_4; #else // This surrely works tmin_4 = stack4[indexStack].tmin_4; #endif tmax_4 = stack4[indexStack].tmax_4; activeMask_4 = _mm_and_ps(_mm_cmple_ps( tmin_4, tmax_4 ), liveMask_4); } while (_mm_movemask_ps(activeMask_4) == 0); } else { // cout << "Link node was accessed" << endl; assert(k == CKTBAxes::EE_Link); currNode = GetLinkNode(currNode); k = GetNodeType(currNode); } } // for(;;) return; }}} // Trace ray by ray SimpleRay ray; for (int i = 0; i < 4; i++) { ray.mOrigin = rp.GetLoc(i); ray.mDirection = rp.GetDir(i); FindNearestI(ray, localbox); rp.SetObject(i, SimpleRay::IntersectionRes[0].intersectable); rp.SetT(i, SimpleRay::IntersectionRes[0].tdist); // SimpleRay::IntersectionRes[0].intersectable->GetNormal(0); } // for } #endif // __SSE__ #endif // _USE_HAVRAN_SSE #endif // TRV00F } // namespace