<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="applications.xsl"?>
  <Applications>	

     <Application>
        <GUID>5a730964-d49a-4305-b5a8-3c5d75ecf73b</GUID>
        <Name>Eudyptula</Name>
        <ShortDescription>Eudyptula is a portable graphics engine that provides advanced support for NVIDIA's CUDA tools; its core purpose is to be used in the development of scientific applications.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/622_eudyptula_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/622_eudyptula_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType></OrganizationType>
        <OrganizationName>OpenSource</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2008</ReleaseYear>
        <ReleaseMonth>06</ReleaseMonth>
        <ReleaseDay>25</ReleaseDay>
        <ReleaseDateDisplay>06/25/2008</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="">Georgios Paraskevas</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://sourceforge.net/projects/eudyptula/">Application</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Georgios Paraskevas</Keyword>
        </Keywords>
     </Application>

     <Application>
        <GUID>9ca281be-34d8-4b10-9f7c-cd1853ad715c</GUID>
        <Name>High performance sequence alignment</Name>
        <ShortDescription>A fast Smith-Waterman algorithm, implemented on CUDA </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/620_protein_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/620_protein_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName>OpenSource</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2008</ReleaseYear>
        <ReleaseMonth>09</ReleaseMonth>
        <ReleaseDay>19</ReleaseDay>
        <ReleaseDateDisplay>09/19/2008</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="">Vahid Noormofidi</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://sourceforge.net/projects/cudaalignment/">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Life Sciences</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Vahid Noormofidi</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>082d85de-353e-4a4d-9613-2513309d4b09</GUID>
        <Name>aeth.drive</Name>
        <ShortDescription>A fast, parallel, versatile QED modelling framework. Uses Geometric Calculus and CUDA. Algorithm supports complex phenomena including turbulence, quantum effects, and relativistic gravitational precession. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/619_aeth_small.jpeg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/619_aeth_large.jpeg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName>OpenSource</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2008</ReleaseYear>
        <ReleaseMonth>11</ReleaseMonth>
        <ReleaseDay>15</ReleaseDay>
        <ReleaseDateDisplay>11/15/2008</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="">Kevin Daley</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://sourceforge.net/projects/aethdrive/">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Kevin Daley</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>2d481ec6-3138-4970-9d92-0abf7e82d639</GUID>
        <Name>BlazeSim</Name>
        <ShortDescription>Project SHERIF is the hardware acceleration of the Fire Dynamics Simulator (FDS) using CUDA on NVIDIA graphic cards.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/618_blazesim_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/618_blazesim_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName>OpenSource</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2008</ReleaseYear>
        <ReleaseMonth>05</ReleaseMonth>
        <ReleaseDay>21</ReleaseDay>
        <ReleaseDateDisplay>05/21/2008</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="">mastermemorex</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://sourceforge.net/projects/blazesim/">Application</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Graphics</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>mastermemorex</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>91df274b-6c8d-470a-956d-8e6ff1d8c053</GUID>
        <Name>jacuzzi</Name>
        <ShortDescription>This project aims to provide Java bindings for the CUDA numeric environment. CUDA is an extension to the C/C++ programming language by NVIDIA. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/617_jacuzzi_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/617_jacuzzi_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName>OpenSource</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>03</ReleaseMonth>
        <ReleaseDay>05</ReleaseDay>
        <ReleaseDateDisplay>03/05/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="">Alexander Heusel</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://sourceforge.net/projects/jacuzzi/">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Alexander Heusel</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>551bb282-5e25-4ff5-92fc-a0fc675d32bc</GUID>
        <Name>cuda cagen</Name>
        <ShortDescription>CUDA-based rule 30 cellular automaton generator for nVidia GPUs</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/616_CellularAutomata_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/616_CellularAutomata_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName>OpenSource</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2008</ReleaseYear>
        <ReleaseMonth>09</ReleaseMonth>
        <ReleaseDay>17</ReleaseDay>
        <ReleaseDateDisplay>09/17/2008</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="">Yuri Parfenov</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://sourceforge.net/projects/cudacagen/">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Yuri Parfenov</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>60d005b8-e3c7-47a5-8fec-ab8aef9f2031</GUID>
        <Name>Fast parallel Particle-To-Grid interpolation for plasma PIC simulations on the GPU</Name>
        <ShortDescription>Particle-in-Cell (PIC) methods have been widely used for plasma physics simulations in the past three decades. To ensure an acceptable level of statistical accuracy relatively large numbers of particles are needed. State-of-the-art Graphics Processing Units (GPUs), with their high memory bandwidth, hundreds of SPMD processors, and half-a-teraflop performance potential, offer a viable alternative to distributed memory parallel computers for running medium-scale PIC plasma simulations on inexpensive commodity hardware. In this paper, we present an overview of a typical plasma PIC code and discuss its GPU implementation. In particular we focus on fast algorithms for the performance bottleneck operation of particle-to-grid interpolation.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/615_ptg_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/615_ptg_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>University of Maryland</OrganizationName>
        <OrganizationURL>http://www.umd.edu/</OrganizationURL>
        <ReleaseYear>2008</ReleaseYear>
        <ReleaseMonth>10</ReleaseMonth>
        <ReleaseDay>01</ReleaseDay>
        <ReleaseDateDisplay>10/01/2008</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>20</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="gogo@umd.edu">George Stantchev</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.cscamm.umd.edu/publications/yjpdc2543-4_CS-08-35.pdf">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>George Stantchev,gogo@umd.edu </Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>276b1bef-214e-4528-85e7-c08792f09988</GUID>
        <Name>cudacluster</Name>
        <ShortDescription>The CUDA Cluster allows you to organize a cluster of CUDA-enabled Peer-To-Peer nodes, allowing for execution of tasks with extreme performance, by harnessing the combined power of multiple such GPU hosts. Sample jobs are provided. C#.Net/Mono with C. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/614_cudacluster_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/614_cudacluster_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName>OpenSource</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2008</ReleaseYear>
        <ReleaseMonth>08</ReleaseMonth>
        <ReleaseDay>06</ReleaseDay>
        <ReleaseDateDisplay>08/06/2008</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="">Nikolaos Tountas</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://sourceforge.net/projects/cudacluster/">Application</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Nikolaos Tountas</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>2be843df-918d-4f4f-94ec-6c1b99e58760</GUID>
        <Name>MP3 Encoder</Name>
        <ShortDescription>MP3 encoder that runs on CUDA compatible hardware. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/613_cudamp3_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/613_cudamp3_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType></OrganizationType>
        <OrganizationName>OpenSource</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2008</ReleaseYear>
        <ReleaseMonth>03</ReleaseMonth>
        <ReleaseDay>19</ReleaseDay>
        <ReleaseDateDisplay>03/19/2008</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType>Research</SoftwareLicenseType>
        <Authors>
           <Author email="">biggestpos </Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://sourceforge.net/projects/cudamp3encoder/">Application</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Video &amp; Audio</ApplicationType>
           <ApplicationType>Numerics</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>biggestpos </Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>9a4aea49-e96f-487a-b6e3-ab50c134a049</GUID>
        <Name>cesql</Name>
        <ShortDescription>Database Server based on NVIDIA CUDA Technology. CUDA makes it possible to use the GPU and its performance for parallel data computing. A classic SQL server uses only about 15 GFlops instead of more than 500 GFlops which could be used by cesql. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/612_cesql_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/612_cesql_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName>OpenSource</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2008</ReleaseYear>
        <ReleaseMonth>06</ReleaseMonth>
        <ReleaseDay>08</ReleaseDay>
        <ReleaseDateDisplay>06/08/2008</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="Arash_Mahini@users.sourceforge.net">Arash Mahini</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://sourceforge.net/projects/cesql/">Application</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Arash Mahini,Arash_Mahini@users.sourceforge.net</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>436a1f19-e066-438d-9769-afd6b612b52e</GUID>
        <Name>cehttp</Name>
        <ShortDescription>Web Server based on NVIDIA CUDA Technology. CUDA makes it possible to use the GPU and its performance for parallel data computing. A classic web server uses only about 15 GFlops instead of more than 500 GFlops which could be used by cehttp. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/611_cehttp_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/611_cehttp_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName>OpenSource</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2008</ReleaseYear>
        <ReleaseMonth>06</ReleaseMonth>
        <ReleaseDay>08</ReleaseDay>
        <ReleaseDateDisplay>06/08/2008</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="Arash_Mahini@users.sourceforge.net">Arash Mahini</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://sourceforge.net/projects/cehttp/">Application</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType></ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Arash Mahini,Arash_Mahini@users.sourceforge.net</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>faba717d-f830-457b-94a4-a8ca1d709890</GUID>
        <Name>The CUDA Files</Name>
        <ShortDescription>Implementations of various algorithms using CUDA. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/610_thecudafiles_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/610_thecudafiles_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName>OpenSource</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2008</ReleaseYear>
        <ReleaseMonth>01</ReleaseMonth>
        <ReleaseDay>08</ReleaseDay>
        <ReleaseDateDisplay>01/08/2008</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="sashang@users.sourceforge.net">sashang</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://sourceforge.net/projects/thecudafiles/">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>sashang,sashang@users.sourceforge.net</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>8ffabfbd-cad9-4fa1-81ee-f61d4bc4cc76</GUID>
        <Name>FreeSWITCH-CUDA</Name>
        <ShortDescription>The goal of this project is to produce and maintain a branch of the FreeSWITCH telephony platform that utilizes CUDA (NVIDIA's GPGPU toolkit) to offload CPU-intensive transcoding tasks to the (NVIDIA) GPU. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/609_freeswitch_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/609_freeswitch_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>OpenSource</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>04</ReleaseMonth>
        <ReleaseDay>01</ReleaseDay>
        <ReleaseDateDisplay>04/01/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="Zac_Wolfe@users.sourceforge.net">Zac Wolfe</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://sourceforge.net/projects/freeswitch-cuda/">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Zac Wolfe,Zac_Wolfe@users.sourceforge.net</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>9b5c77ca-f014-4173-83cd-3bc3da09039b</GUID>
        <Name>tokaspt</Name>
        <ShortDescription>The Once Known as SmallPT is a cheap, editable, realtime derivation of http://kevinbeason.com/smallpt/. By way of the marketing department, some outrageously insignificant numbers: on a Quadro FX 5800, on the default scene at default resolution and configuration, 768x512x(2x2)x118fps = 185.6M 4-bounce rays are traced per second (alternatively, a maximum of 742.4M bounces are generated). Requires CUDA 2.1 to compile and run. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/608_img_ui_bloated_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/608_img_ui_bloated_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName>http://ompf.org</OrganizationName>
        <OrganizationURL>http://ompf.org</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>01</ReleaseMonth>
        <ReleaseDay>25</ReleaseDay>
        <ReleaseDateDisplay>01/25/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="tbptbp@gmail.com">Thierry Berger-Perrin</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://code.google.com/p/tokaspt/">Application</ContentType>
           <ContentType url="http://code.google.com/p/tokaspt/">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Graphics</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Thierry Berger-Perrin,tbptbp@gmail.com</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>e143112b-a0c0-4f45-8360-6afe7687f68e</GUID>
        <Name>A framework for efficient and scalable execution of domain-specific templates on GPUs</Name>
        <ShortDescription>Graphics Processing Units (GPUs) have emerged as important players in the transition of the computing industry from sequential to multi- and many-core computing. We propose a software framework for execution of domain specific parallel templates on GPUs, which simultaneously raises the abstraction level of GPU programming and ensures efficient execution with forward scalability to large data sizes and new GPU platforms. To achieve scalable and efficient GPU execution, our framework focuses on two critical problems that have been largely ignored in previous efforts - processing large data sets that do not fit within the GPU memory, and minimizing data transfers between the host and GPU. Our framework takes domain-specific parallel programming templates that are expressed as parallel operator graphs, and performs operator splitting, offload unit identification, and scheduling of off-loaded computations and data transfers between the host and the GPU, to generate a highly optimized execution plan. Finally, a code generator produces a hybrid CPU/GPU program in accordance with the derived execution plan, that uses lower level frameworks such as CUDA. We have applied the proposed framework to templates from the recognition domain, specifically edge detection kernels and convolutional neural networks that are commonly used in image and video analysis. We present results on two different GPU platforms from NVIDIA (a Tesla C870 GPU computing card and a GeForce 8800 graphics card) that demonstrate 1.7 - 7.8X performance improvements over already accelerated baseline GPU implementations. We also demonstrate scalability to input data sets and application memory footprints of 6GB and 17GB, respectively, on GPU platforms with only 768MB and 1.5GB of memory.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/607_ipdp_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/607_ipdp_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Commercial</OrganizationType>
        <OrganizationName>NEC Labs, Berkeley, Purdue</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>05</ReleaseMonth>
        <ReleaseDay>01</ReleaseDay>
        <ReleaseDateDisplay>05/01/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>8</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="narayans@eecs.berkeley.edu">Narayanan Sundaram</Author>
           <Author email="">Anand Raghunathan</Author>
           <Author email="">Srimat T. Chakradhar</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.eecs.berkeley.edu/~narayans/Publications_files/ipdps2009.pdf">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Imaging</ApplicationType>
           <ApplicationType>Medical Imaging</ApplicationType>
           <ApplicationType>machine learning</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>edge detection, convolution neural network, out-of-core,Narayanan Sundaram,Anand Raghunathan,Srimat T. Chakradhar,narayans@eecs.berkeley.edu</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>90d02408-0266-458a-874c-6ffe41985526</GUID>
        <Name>PAPER - Accelerating Parallel Evaluations of ROCS</Name>
        <ShortDescription>PAPER is a GPU-accelerated implementation of Gaussian molecular shape overlay (the algorithm in OpenEye ROCS) running on NVIDIA graphics cards. We have demonstrated multiple-order-of-magnitude speedups relative to a CPU-based implementation of the same algorithm, and 5x speedup relative to OpenEye ROCS even on low-end graphics hardware (an NVIDIA 8600GT).</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/606_gpuROCS_thumb_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/606_gpuROCS_thumb_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Department of Computer Science, Stanford University</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>05</ReleaseMonth>
        <ReleaseDay>06</ReleaseDay>
        <ReleaseDateDisplay>05/06/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>35</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="ihaque@cs.stanford.edu">Imran Haque</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.cs.stanford.edu/people/ihaque/papers/gpurocs.pdf">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Life Sciences</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>paper openeye rocs ,Imran Haque,ihaque@cs.stanford.edu</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>d45f95f7-772b-41f6-a00d-4cb40e53e785</GUID>
        <Name>HyperNEAT4CUDA</Name>
        <ShortDescription>This is a simple C# implementation of HyperNEAT implemented on NVidia's Compute Unified Device Architecture (CUDA). </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/605_hyperneat_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/605_hyperneat_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName>OpenSource</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>05</ReleaseMonth>
        <ReleaseDay>19</ReleaseDay>
        <ReleaseDateDisplay>05/19/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="">K A Lloyd</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://sourceforge.net/projects/hyperneat4cuda/">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>K A Lloyd</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>727f6e8e-1cc9-4afc-9d6f-3329a569a712</GUID>
        <Name>Smoke rendering demo</Name>
        <ShortDescription>This application renders a density field of float values. In the particular demo it is a smoke density field, but it might as well be other sorts of data like fog, fluids or calculations. The density field is visualized using a ray marching technique and the background is rendered by ray tracing a kd tree.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/604_smoke_sreenshot1_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/604_smoke_sreenshot1_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName>Alexandra Instituttet</OrganizationName>
        <OrganizationURL>http://www.alexandra.dk/index.htm</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>05</ReleaseMonth>
        <ReleaseDay>14</ReleaseDay>
        <ReleaseDateDisplay>05/14/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="peter.trier@alexandra.dk">Peter Trier</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://cg.alexandra.dk/category/software/">Application</ContentType>
           <ContentType url="http://www.youtube.com/watch?v=teEDA9esk-A">Multimedia</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Graphics</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Smoke rendering, ray tracing,Peter Trier,peter.trier@alexandra.dk</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>72067ded-99f3-4176-96ad-9f1551b12c41</GUID>
        <Name>CUJ2K - JPEG2000 Encoder </Name>
        <ShortDescription>CUJ2K is a fast encoder for the new image compression standard JPEG2000 which is an improvement of JPEG providing better compression ratios and also supporting lossless compression along with many other features. JPEG2000 is very computation-intensive and therefore benefits much from CUDA acceleration. CUJ2K uses streaming to accelerate batch image compression. This program provides commandline-, .Net GUI- and library-interfaces to convert BMP -> JPEG2000. It also supports creation of MJ2 videos.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/603_banner_small.gif</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/603_banner_large.gif</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Hochschule</OrganizationType>
        <OrganizationName>University of Stuttgart, IPVS</OrganizationName>
        <OrganizationURL>http://www.ipvs.uni-stuttgart.de/</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>09</ReleaseMonth>
        <ReleaseDay>20</ReleaseDay>
        <ReleaseDateDisplay>09/20/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>4</SpeedUp>
        <SoftwareLicenseType>Open Source</SoftwareLicenseType>
        <Authors>
           <Author email="cuj2k.project@googlemail.com">Norbert Fuerst</Author>
           <Author email="">Armin Weiss</Author>
           <Author email="">Simon Papandreou</Author>
           <Author email="">Martin Heide</Author>
           <Author email="">Ana Balevic</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://cuj2k.sourceforge.net/">Application</ContentType>
           <ContentType url="http://cuj2k.sourceforge.net/">Paper</ContentType>
           <ContentType url="http://cuj2k.sourceforge.net/">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Graphics</ApplicationType>
           <ApplicationType>Imaging</ApplicationType>
           <ApplicationType>MedicalImaging</ApplicationType>
           <ApplicationType>Libraries</ApplicationType>
           <ApplicationType>Video &amp; Audio</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>JPEG2000, image compression, encoder, codec, JPEG, CUJ2K, image processing, lossless, lossy,Norbert Fuerst,Armin Weiss,Simon Papandreou, Martin Heide, Ana Balevic,cuj2k.project@googlemail.com</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>64528049-540a-4d7f-9cc0-2d4a2ccad4f0</GUID>
        <Name>Parallel Multiclass classification using SVM on GPUs</Name>
        <ShortDescription>The scaling of serial algorithms cannot rely on the improvement of CPUs anymore. The performance of classical Support Vector Machine (SVM) implementations has reached its limit and the arrival of the multi core era requires these algorithms to adapt to a new parallel scenario. Graphics Processing Units (GPU) have arisen as high performance platforms to implement data parallel algorithms. In this paper, it is described how a native implementation of a multiclass classifier based on SVMs can map its inherent degrees of parallelism to the GPU programming model and efficiently use its computational throughput. Empirical results show that the training and classification time of the algorithm can be reduced an order of magnitude compared to a classical solver, LIBSVM, while guaranteeing the same accuracy.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/602_multisvm_small.gif</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/602_multisvm_large.gif</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>MIT</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2008</ReleaseYear>
        <ReleaseMonth>12</ReleaseMonth>
        <ReleaseDay>31</ReleaseDay>
        <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>112</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="sherrero@mit.edu">Sergio Herrero-Lopez</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://code.google.com/p/multisvm/">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Sergio Herrero-Lopez,sherrero@mit.edu</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>f7874e4b-ba49-44f9-b736-6a3341519f41</GUID>
        <Name>Fast pattern classification of ventricular arrhythmias using graphics processing units</Name>
        <ShortDescription>Graphics Processing Units (GPUs) can provide remarkable performance gains when compared to CPUs for computationally-intensive applications. In the biomedical area, most of the previous studies are focused on using Neural Networks (NNs) for pattern recognition of biomedical signals. However, the long training times prevent them to be used in real-time. This is critical for the fast detection of Ventricular Arrhythmias (VAs) which may cause cardiac arrest and sudden death. In this paper, we present a parallel implementation of the Back-Propagation (BP) and the Multiple Back-Propagation (MBP) algorithm which allowed significant training speedups. In our proposal, we explicitly specify data parallel computations by defining special functions (kernels); therefore, we can use a fast evaluation strategy for reducing the computational cost without wasting memory resources. The performance of the pattern classification implementation is compared against other reported algorithms.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/600_mbpTop_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/600_mbpTop_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>IPG</OrganizationName>
        <OrganizationURL>http://www.ipg.pt</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>11</ReleaseMonth>
        <ReleaseDay>09</ReleaseDay>
        <ReleaseDateDisplay>11/09/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>53</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="noel@ipg.pt">Noel Lopes</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://dit.ipg.pt/MBP/papers.aspx">Application</ContentType>
           <ContentType url="http://dit.ipg.pt/MBP/papers.aspx">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>medicine</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Neural Networks,Noel Lopes,noel@ipg.pt</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>c8a33001-387c-474f-a477-63571429ab6f</GUID>
        <Name>Heart Wall Tracking</Name>
        <ShortDescription>Tracking of mouse heart walls through a series of ultrasound images.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/599_heartwall_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/599_heartwall_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>University of Virginia</OrganizationName>
        <OrganizationURL>http://www.virginia.edu</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>11</ReleaseMonth>
        <ReleaseDay>05</ReleaseDay>
        <ReleaseDateDisplay>11/05/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>15</SpeedUp>
        <SoftwareLicenseType>Open source</SoftwareLicenseType>
        <Authors>
           <Author email="lgs9a@virginia.edu">Lukasz G. Szafaryn</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="https://www.cs.virginia.edu/~skadron/wiki/rodinia/index.php/Heart_Wall_Tracking">Application</ContentType>
           <ContentType url="https://www.cs.virginia.edu/~skadron/wiki/rodinia/index.php/Heart_Wall_Tracking">Multimedia</ContentType>
           <ContentType url="https://www.cs.virginia.edu/~skadron/wiki/rodinia/index.php/Heart_Wall_Tracking">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>MedicalImaging</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Image Processing, Feature Detection, Ultrasound,Lukasz G. Szafaryn,lgs9a@virginia.edu</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>d29736de-ffee-4b0a-b7ec-8d041259c195</GUID>
        <Name>Towards a multi-GPU solver for the three-dimensional two-phase incompressible Navier-Stokes equations</Name>
        <ShortDescription>We have ported parts of our parallel level-set based two-phase solver for the three-dimensional Navier-Stokes equations on the GPU. To our knowledge, this is the first time that a two-phase fluid solver profits from the performance boost of several GPUs. A multi-GPU double-precision solver for the pressure Poisson equation based on the Jacobi preconditioned conjugate gradient method was implemented using CUDA and MPI. Thereby, we obtain a major speedup factor of 31.1 for the Poisson solver on four GPUs of our NVIDIA Tesla S1070, in contrast to a single CPU. Consequently, our overall fluid solver shows an impressive speedup factor of 16.6.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/598_logo_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/598_logo_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Institute for Numerical Simulation - University of Bonn, Germany</OrganizationName>
        <OrganizationURL>http://www.ins.uni-bonn.de</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>09</ReleaseMonth>
        <ReleaseDay>30</ReleaseDay>
        <ReleaseDateDisplay>09/30/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>16</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="zaspel@ins.uni-bonn.de">Peter Zaspel</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://wissrech.ins.uni-bonn.de/people/zaspel/poster_GPU2009.pdf">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Computational Fluid Dynamics</ApplicationType>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>CFD, multi-GPU, Navier-Stokes, multi-phase,Peter Zaspel,zaspel@ins.uni-bonn.de</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>5bd7b280-5a27-49e5-be83-c95099ac3a3c</GUID>
        <Name>String Matching on a Multicore GPU Using CUDA</Name>
        <ShortDescription>Graphics Processing Units (GPUs) have evolved over the past few years from dedicated graphics rendering devices to powerful parallel processors, outperforming traditional Central Processing Units (CPUs) in many areas of scientific computing. The use of GPUs as processing elements was very limited until recently, when the concept of General-Purpose computing on Graphics Processing Units (GPGPU) was introduced. GPGPU made possible to exploit the processing power and the memory bandwidth of the GPUs with the use of APIs that hide the GPU hardware from programmers. This paper presents experimental results on the parallel processing for some well known on-line string matching algorithms using one such GPU abstraction API, the Compute Unified Device Architecture (CUDA).</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/597_cuda1o_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/597_cuda1o_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>University of Macedonia</OrganizationName>
        <OrganizationURL>http://www.uom.gr</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>09</ReleaseMonth>
        <ReleaseDay>10</ReleaseDay>
        <ReleaseDateDisplay>09/10/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>24</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="ckouz@uom.gr">C. S. Kouzinopoulos</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.computer.org/portal/web/csdl/doi/10.1109/PCI.2009.47">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>String matching</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>string matching, algorithms, CUDA, GPGPU, parallel,C. S. Kouzinopoulos,ckouz@uom.gr</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>c3242f2b-7ede-43d1-87b7-c462eae24c94</GUID>
        <Name>Fast Tridiagonal Solvers on the GPU</Name>
        <ShortDescription>We study the performance of three parallel algorithms and their hybrid variants for solving tridiagonal linear systems on a GPU: cyclic reduction (CR), parallel cyclic reduction (PCR) and recursive doubling (RD). We develop an approach to measure, analyze, and optimize the performance of GPU programs in terms of memory access, computation, and control overhead. We find that CR enjoys linear algorithm complexity but suffers from more algorithmic steps and bank conflicts, while PCR and RD have fewer algorithmic steps but do more work each step. To combine the benefits of the basic algorithms, we propose hybrid CR+PCR and CR+RD algorithms, which improve the performance of PCR, RD and CR by 21%, 31% and 61% respectively. Our GPU solvers achieve up to a 28x speedup over a sequential LAPACK solver, and a 12x speedup over a multi-threaded CPU solver.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/596_idav_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/596_idav_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>University of California, Davis</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>10</ReleaseMonth>
        <ReleaseDay>28</ReleaseDay>
        <ReleaseDateDisplay>10/28/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>12</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="yaozhang@ucdavis.edu">Yao Zhang</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://graphics.cs.ucdavis.edu/publications/print_pub?pub_id=978">Application</ContentType>
           <ContentType url="http://graphics.cs.ucdavis.edu/publications/print_pub?pub_id=978">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Yao Zhang,yaozhang@ucdavis.edu</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>0facea85-946d-47ef-93fd-12b5ae74b4b6</GUID>
        <Name>Accelerating Geo-Science and Engineering System Simulations on Graphics Hardware</Name>
        <ShortDescription>This paper discusses GPU implementations of three example applications from computational fluid dynamics, seismic wave propagation, and rock magnetism. These candidate applications involve important numerical modeling techniques, widely employed in physical system simulations, that are themselves examples of distinct computing classes identified as fundamental to scientific and engineering computing. The presented numerical methods (and respective computing classes they belong to) are: (1) a lattice-Boltzmann code for geofluid dynamics (structured grid class); (2) a spectral-finite-element code for seismic wave propagation simulations (sparse linear algebra class); and (3) a least-squares minimization code for interpreting magnetic force microscopy data (dense linear algebra class). Significant performance increases are seen in all three applications, demonstrating the power of GPU implementations for these types of simulations and their associated computing classes.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/595_stochastic_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/595_stochastic_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>University of Minnesota</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>10</ReleaseMonth>
        <ReleaseDay>25</ReleaseDay>
        <ReleaseDateDisplay>10/25/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>30</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="sdcwalsh@umn.edu">Stuart D.C. Walsh</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://dx.doi.org/10.1016/j.cageo.2009.05.001">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Computational Fluid Dynamics</ApplicationType>
           <ApplicationType>Imaging</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Stuart D.C. Walsh,sdcwalsh@umn.edu</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>786fec9c-472d-4f0e-9985-42ad2050e358</GUID>
        <Name>Sailfish: An Open Source fluid simulation package using the Lattice-Boltzmann method</Name>
        <ShortDescription>Sailfish is a general purpose fluid dynamics solver optimized for modern multicore processors, especially Graphics Processing Units (GPUs). The solver is based on the Lattice Boltzmann Method and works for both 2D and 3D fluids. Its performance peaks at 950MLUPS with the D2Q9 grid and 750MLUPS with D3Q19 (using CUDA on a single GTX280 video card). The design of Sailfish tries to reconcile ease of use and flexibility with performance. Python, with its powerful modules: sympy (for automatic code generation), numpy, pygame, tvtk etc. is used as the main language on the host (for I/O, visualization and user interaction), while the actual computations are performed on the GPU using CUDA or OpenCL.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/594_sailfish_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/594_sailfish_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Institute of Physics, University of Silesia</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>04</ReleaseMonth>
        <ReleaseDay>17</ReleaseDay>
        <ReleaseDateDisplay>04/17/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>100</SpeedUp>
        <SoftwareLicenseType>Open Source</SoftwareLicenseType>
        <Authors>
           <Author email="mjanusz@us.edu.pl">M. Januszewski</Author>
           <Author email="">M. Kostur</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.youtube.com/watch?v=kx4-VjaJ2eI">Multimedia</ContentType>
           <ContentType url="http://gitorious.org/sailfish">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Computational Fluid Dynamics</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>M. Januszewski,M. Kostur,mjanusz@us.edu.pl</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>111d3757-3e16-4600-bf47-437a832bae86</GUID>
        <Name>GPU-SPHysics</Name>
        <ShortDescription>a GPU-based Smoothed Particle Hydrodynamics model for free surface flows</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/593_boreinboxwhite_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/593_boreinboxwhite_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Istituto Nazionale di Geofisica e Vulcanologia</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2008</ReleaseYear>
        <ReleaseMonth>12</ReleaseMonth>
        <ReleaseDay>31</ReleaseDay>
        <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>23</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="">Alexis Herault</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.ce.jhu.edu/dalrymple/GPU/">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Computational Fluid Dynamics</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword></Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>dea0e214-213a-4557-9ef4-1e9d5d6f80c9</GUID>
        <Name>Evaluating Multi-Core Platforms for HPC Data-Intensive Kernels</Name>
        <ShortDescription>We present an evaluation of three platform types, namely NVIDIA GPUs, the STI Cell/B.E., and generic multi-core CPUs on convolutional resampling (aka gridding), which is an irregular, data-intensive application from radio astronomy. We evaluate these platforms in terms of performance, programming effort and cost. Although we do not select a clear winner, we do provide a list of guidelines to assist in platform choice and development of similar data-intensive applications.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/592_gridding_fig_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/592_gridding_fig_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Delft University of Technology</OrganizationName>
        <OrganizationURL>http://www.tudelft.nl/</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>05</ReleaseMonth>
        <ReleaseDay>18</ReleaseDay>
        <ReleaseDateDisplay>05/18/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="a.s.vanamesfoort@tudelft.nl">Alexander S. van Amesfoort</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.pds.ewi.tudelft.nl/~afoort/publ/cf09/">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Imaging</ApplicationType>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Science</ApplicationType>
           <ApplicationType>Signal Processing</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>data-intensive gridding astronomy,Alexander S. van Amesfoort,a.s.vanamesfoort@tudelft.nl</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>02285ada-66ce-4cd5-8809-e459372d9fb8</GUID>
        <Name>An efficient GPU implementation for large scale individual-based simulation of collective behavior</Name>
        <ShortDescription>In this work we describe a GPU implementation for an individual-based model for fish schooling. In this model each fish aligns its position and orientation with an appropriate average of its neighbors positions and orientations. This carries a very high computational cost in the so-called nearest neighbors search. By leveraging the GPU processing power and the new programming model called CUDA we implement an efficient framework which permits to simulate the collective motion of high-density individual groups. In particular we present as a case study a simulation of motion of millions of fishes. We describe our implementation and present extensive experiments which demonstrate the effectiveness of our GPU implementation.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/591_HiBi09_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/591_HiBi09_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Universita di Salerno</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>10</ReleaseMonth>
        <ReleaseDay>16</ReleaseDay>
        <ReleaseDateDisplay>10/16/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="ugo.erra@unibas.it">Ugo Erra</Author>
           <Author email="ugo.erra@unibas.it">Bernardino Frola</Author>
           <Author email="ugo.erra@unibas.it">Vittorio Scarano</Author>
           <Author email="ugo.erra@unibas.it">Iain Couzin</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://isis.dia.unisa.it/projects/behavert/">Application</ContentType>
           <ContentType url="http://www.youtube.com/watch?v=eymho1qRqK4&amp;feature=player_embedded">Multimedia</ContentType>
           <ContentType url="http://isis.dia.unisa.it/projects/behavert/">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Life Sciences</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Ugo Erra,ugo.erra@unibas.it</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>a412a716-04f1-4cf9-a389-8a51d3ea7680</GUID>
        <Name>OpenCurrent</Name>
        <ShortDescription>OpenCurrent is an open source C++ library for solving Partial Differential Equations (PDEs) over regular grids using the CUDA platform from NVIDIA. It breaks down a PDE into 3 basic objects, Grids, Solvers, and Equations. Grid data structures efficiently implement regular 1D, 2D, and 3D arrays in both double and single precision. Grids support operations like computing linear combinations, managing host-device memory transfers, interpolating values at non-grid points, and performing array-wide reductions. Solvers use these data structures to calculate terms arising from discretizations of PDEs, such as finite-difference based advection and diffusion schemes, and a multigrid solver for Poisson equations. These computational building blocks can be assembled into complete Equation objects that solve time-dependent PDEs. One such Equation solver is an incompressible Navier-Stokes solver that uses a second-order Boussinesq model. This equation solver is fully validated, and has been used to study Rayleigh-Benard convection under a variety of different regimes (citation). Benchmarks show it to perform about 8 times faster than an equivalent Fortran code running on an 8-core Xeon. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/590_opencurrent_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/590_opencurrent_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Commercial</OrganizationType>
        <OrganizationName>NVIDIA</OrganizationName>
        <OrganizationURL>http://www.nvidia.com</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>09</ReleaseMonth>
        <ReleaseDay>25</ReleaseDay>
        <ReleaseDateDisplay>09/25/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType>Open Source</SoftwareLicenseType>
        <Authors>
           <Author email="">Jonathan Cohen</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://code.google.com/p/opencurrent/">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>libraries</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Jonathan Cohen</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>21a1b481-5773-403d-8644-730c1c5f1d58</GUID>
        <Name>Correlating Radio Astronomy Signals</Name>
        <ShortDescription>A recent development in radio astronomy is to replace traditional dishes with many small antennas. The signals are combined to form one large, virtual telescope. The enormous data streams are cross-correlated to filter out noise. This is especially challenging, since the computational demands grow quadratically with the number of data streams. Moreover, the correlator is not only computationally intensive, but also very I/O intensive. The LOFAR telescope, for instance, will produce over 100 terabytes per day. The future SKA telescope will even require in the order of exaflops, and petabits/s of I/O. A recent trend is to correlate in software instead of dedicated hardware, to increase flexibility and to reduce development efforts. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/589_LBA-field_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/589_LBA-field_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName>Astron</OrganizationName>
        <OrganizationURL>http://www.astron.nl</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>10</ReleaseMonth>
        <ReleaseDay>16</ReleaseDay>
        <ReleaseDateDisplay>10/16/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>6.3</SpeedUp>
        <SoftwareLicenseType>Open source</SoftwareLicenseType>
        <Authors>
           <Author email="nieuwpoort@astron.nl">Rob van Nieuwpoort</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.astron.nl/~nieuwpoort/papers/ics09-correlator.pdf">Paper</ContentType>
           <ContentType url="http://www.astron.nl/~nieuwpoort/">Code</ContentType>
           <ContentType url="http://www.lofar.org/">Application</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Science</ApplicationType>
           <ApplicationType>Signal Processing</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Rob van Nieuwpoort,nieuwpoort@astron.nl</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>fb82b05f-0449-485d-8779-b53d28646189</GUID>
        <Name>TUNED AND ASYNCHRONOUS STENCIL KERNELS FOR CPU/GPU SYSTEMS</Name>
        <ShortDescription>We describe heterogeneous multi-CPU and multi-GPU implementations of Jacobi's iterative method for the 2-D Poisson equation on a structured grid, in both single and double-precision. Properly tuned, our best implementation achieves 98% of the empirical streaming GPU bandwidth (66% of peak) on an NVIDIA C1060. Motivated to find a still faster implementation, we further consider wildly asynchronous implementations that can reduce or even eliminate the synchronization bottleneck between iterations. In these versions, which are based on the principle of a chaotic relaxation (Chazan and Miranker, 1969), we simply remove or delay synchronization between iterations, thereby potentially trading off more flops (via more iterations to converge) for a higher degree of asynchronous parallelism. Our relaxed-synchronization implementations on a GPU can be 1.2-2.5x faster than our best synchronized GPU implementation while achieving the same accuracy. Looking forward, this result suggests research on similarly fast-and-loose algorithms in the coming era of increasingly massive concurrency and relatively high synchronization or communication costs.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/588_tuned_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/588_tuned_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Georgia Institute of Technology</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>05</ReleaseMonth>
        <ReleaseDay>01</ReleaseDay>
        <ReleaseDateDisplay>05/01/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="">Sundaresan Venkatasubramanian</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://smartech.gatech.edu/bitstream/1853/29728/1/venkatasubramanian_sundaresan_200908_mast.pdf">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType></ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Sundaresan Venkatasubramanian</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>f72dcd39-833c-4760-8d04-87e67f4afa2b</GUID>
        <Name>Hybrid GPU-Based Single- and Double-Bounce SAR Simulation</Name>
        <ShortDescription>A new hybrid graphics-processing-unit (GPU)-based real-time synthetic aperture radar (SAR) simulation system is presented. Previous real-time SAR simulators only supported single-bounce simulation in real time. The new hybrid system uses the rasterization approach for real-time single-bounce simulation and a new image-based GPU ray-tracing approach for monostatic SAR double-bounce simulation. This approach provides fast simulation results even while simulating complex and extended scenes. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/587_hybrid_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/587_hybrid_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>LIESMARS, Wuhan University</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>10</ReleaseMonth>
        <ReleaseDay>01</ReleaseDay>
        <ReleaseDateDisplay>10/01/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="timobalz@gmail.com">Timo Balz</Author>
           <Author email="">Uwe Stilla</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=5257398&amp;arnumber=5164916&amp;count=26&amp;index=19">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Science</ApplicationType>
           <ApplicationType>Remote Sensing</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Radar, SAR, Remote Sensing, Simulation, Ray-Tracing,Timo Balz,Uwe Stilla,timobalz@gmail.com</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>1026c7d5-f1c2-4709-800e-fad3add12e5a</GUID>
        <Name>A Proposal to Extend the OpenMP Tasking Model for Heterogeneous Architectures</Name>
        <ShortDescription>A proposal to extend OpenMP so it incorporates the concept of multiple architectures so it takes care of: separating the different pieces, compiling them adequately, offloading them. The user is still responsible for identifying interesting parts to offload and optimize for the target.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/586_openmp_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/586_openmp_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Universitat Politècnica de Catalunya</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>06</ReleaseMonth>
        <ReleaseDay>03</ReleaseDay>
        <ReleaseDateDisplay>06/03/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="">E. Ayguade</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="https://iwomp.zih.tu-dresden.de/downloads/targets-Duran.pdf">Presentation</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Libraries</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>E. Ayguade</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>13072d4f-4cdc-488e-ac1b-5d42f73c2528</GUID>
        <Name>AntiPlanet2</Name>
        <ShortDescription>AntiPlanet2 is a first-person 3D shooter game set in a fantastic extraterrestrial world built of spheres and shadows. AntiPlanet uses a ray-tracing renderer for visualization, which runs on CUDA. The 3D engine works at any resolution in real time and supports transparency and bi-cubic textures. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/585_fallenflowers_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/585_fallenflowers_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Commercial</OrganizationType>
        <OrganizationName>virtualray.ru</OrganizationName>
        <OrganizationURL>http://www.virtualray.ru</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>10</ReleaseMonth>
        <ReleaseDay>06</ReleaseDay>
        <ReleaseDateDisplay>10/06/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType>Commercial</SoftwareLicenseType>
        <Authors>
           <Author email="levdy@virtualray.ru">Lev Dymchenko</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.virtualray.ru/eng/download.html">Application</ContentType>
           <ContentType url="http://www.youtube.com/watch?v=i3rFwz9cmio">Multimedia</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Graphics</ApplicationType>
           <ApplicationType>computer game</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>3d shooter antiplanet first person action game real time ray tracing spherical computer art,Lev Dymchenko,levdy@virtualray.ru</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>3859efe4-0773-4cc5-be54-9fc3d338a0ce</GUID>
        <Name>cuco</Name>
        <ShortDescription>The GPU version of cosmological simulation code Gadget based on CUDA</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/584_cuco_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/584_cuco_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType></OrganizationType>
        <OrganizationName>Partner Group of MPA</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>08</ReleaseMonth>
        <ReleaseDay>25</ReleaseDay>
        <ReleaseDateDisplay>08/25/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="">Lei Liu </Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://github.com/liulei/cuco">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Lei Liu </Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>0e8e658b-58f3-4627-a6ca-1c64e79c3416</GUID>
        <Name>Data Monster</Name>
        <ShortDescription>Database processing is a cornerstone of computing, and it is a market that last year generated approximately US $27 billion, according to technology analysis firm Forrester Research, in Cambridge, Mass. The firm projects that this number—which includes new database licenses, technical support, and consulting—will grow to $32 billion by 2013. Every time you bid on an eBay auction, search for a movie on Netflix, look for a Kindle title on Amazon, or do a Google search, massive database applications spring into action, delving into huge quantities of data spread across tens of thousands of machines.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/583_datamonster_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/583_datamonster_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType></OrganizationType>
        <OrganizationName>ieee spectrum</OrganizationName>
        <OrganizationURL>http://spectrum.ieee.org</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>09</ReleaseMonth>
        <ReleaseDay>01</ReleaseDay>
        <ReleaseDateDisplay>09/01/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="">Andrea Di Blas</Author>
           <Author email="">Tim Kaldewey</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://spectrum.ieee.org/computing/software/data-monster/1">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType></ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Andrea Di Blas,Tim Kaldewey</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>4d61ce47-f1c6-472f-81d3-595fd0ab0883</GUID>
        <Name>Citrix HDX 3D for Professional Graphics</Name>
        <ShortDescription>Citrix HDX 3D for Professional Graphics can now deliver Windows physical desktops and applications to the most advanced professional graphics power users through Citrix XenDesktop technology. XenDesktop with HDX 3D provides the best performance possible over the wide area network (WAN), and over a local area network (LAN), HDX 3D consumes 10x less bandwidth than alternatives while still providing a high-definition user experience.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/582_citrix_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/582_citrix_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Commercial</OrganizationType>
        <OrganizationName>Citrix</OrganizationName>
        <OrganizationURL>http://www.citrix.com</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>10</ReleaseMonth>
        <ReleaseDay>15</ReleaseDay>
        <ReleaseDateDisplay>10/15/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType>Commercial</SoftwareLicenseType>
        <Authors>
           <Author email="">Citrix</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.citrix.com/lang/English/home.asp">Application</ContentType>
           <ContentType url="http://www.timarenz.de/citrix-hdx-3d-for-professional-graphics-a-quickndirty-review">Multimedia</ContentType>
           <ContentType url="http://support.citrix.com/servlet/KbServlet/download/21548-102-641318/Citrix%20HDX%203D%20-%20Version%201.0%20-%20Requirements%20Guide.pdf">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Video &amp; Audio</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Citrix</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>a8985a03-4860-49c7-92ce-f1237031cc81</GUID>
        <Name>GPU-Accelerated TF-IDF</Name>
        <ShortDescription>TF-IDF (term-frequency/inverse-document frequency) is one of the fundamental concepts used in information retrieval and text mining. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/581_atomic_method_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/581_atomic_method_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>North Carolina State University</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>03</ReleaseMonth>
        <ReleaseDay>10</ReleaseDay>
        <ReleaseDateDisplay>03/10/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>9</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="zhang.yongpeng@gmail.com">Yongpeng Zhang</Author>
           <Author email="">Frank Mueller</Author>
           <Author email="">Xiaohui Cui and Thomas Potok</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://moss.csc.ncsu.edu/~mueller/ftp/pub/mueller/papers/epham09.pdf">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Text Mining</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Yongpeng Zhang,Frank Mueller, Xiaohui Cui and Thomas Potok,zhang.yongpeng@gmail.com</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>27bd9c43-0986-477e-aa4d-1dcd0493090c</GUID>
        <Name>High-Quality Rendering of Varying Isosurfaces</Name>
        <ShortDescription>Smooth trivariate splines on uniform tetrahedral partitions are well suited for high-quality visualization of isosurfaces from scalar volumetric data. We propose a novel rendering approach based on spline patches with low total degree, for which ray-isosurface intersections are computed using efficient root finding algorithms. Smoothly varying surface normals are directly extracted from the underlying spline representation. Our approach is using a combined CUDA and graphics pipeline and yields two key advantages over previous work. First, we can interactively vary the isovalues since all required processing steps are performed on the GPU. Second, we employ instancing in order to reduce shader complexity and to minimize overall memory usage. In particular, this allows us to compute the spline coefficients on-the-fly in real-time on the GPU. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/580_C1isosurfaces-medical_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/580_C1isosurfaces-medical_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>TU Darmstadt</OrganizationName>
        <OrganizationURL>http://www.tu-darmstadt.de/</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>10</ReleaseMonth>
        <ReleaseDay>07</ReleaseDay>
        <ReleaseDateDisplay>10/07/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>68</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="thomasdidikoch@gmx.net">T. Kalbe</Author>
           <Author email="">T. Koch</Author>
           <Author email="">M. Goesele</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.youtube.com/watch?v=yRavRPW1Ao0">Multimedia</ContentType>
           <ContentType url="http://www.gris.informatik.tu-darmstadt.de/~mgoesele/projects/C1-isosurfaces.html">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Graphics</ApplicationType>
           <ApplicationType>MedicalImaging</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Raycasting trivariate Splines isosurface volumerendering,T. Kalbe,T. Koch,M. Goesele,thomasdidikoch@gmx.net</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>0fa28489-5f19-4370-9a78-1d90711534a6</GUID>
        <Name>Realtime Dense Stereo Matching with Dynamic Programming in CUDA</Name>
        <ShortDescription>Real-time depth extraction from stereo images is an important process in computer vision. This paper proposes a new implementation of the dynamic programming algorithm to calculate dense depth maps using the CUDA architecture achieving real-time performance with consumer graphics cards. We compare the running time of the algorithm against CPU implementation and demonstrate the scalability property of the algorithm by testing it on different graphics cards. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/579_DP_algorithm_CUDA_TV_2009_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/579_DP_algorithm_CUDA_TV_2009_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>CAD/CAM/CAE Lab. EAFIT University</OrganizationName>
        <OrganizationURL>http://www1.eafit.edu.co/cadcamcae/</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>09</ReleaseMonth>
        <ReleaseDay>09</ReleaseDay>
        <ReleaseDateDisplay>09/09/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>10</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="jcongote@eafit.edu.co">John Congote</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www1.eafit.edu.co/cadcamcae/documents/09_09_2009_Congote_etal_TV_stereo_depthmap_CUDA.pdf">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Graphics</ApplicationType>
           <ApplicationType>Imaging</ApplicationType>
           <ApplicationType>Video &amp; Audio</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>John Congote,jcongote@eafit.edu.co</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>b7498c1e-fb46-492b-8ae6-fe6a4ccea50d</GUID>
        <Name>Improving the Open64 Backend for GPUs</Name>
        <ShortDescription>NVIDIA uses Open64 as a front-end tool to compile CUDA programs into an intermediate language called PTX. PTX can be viewed as an assembly language targeting a virtual machine and is an abstract layer between the application and the final hardware-dependent machine code. Our research explores the relationship between register pressure in the PTX code and the final machine code. We also implemented two optimizations in Open64 to help reduce register pressure and increase thread concurrency.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/578_open64_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/578_open64_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Northeastern University</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>10</ReleaseMonth>
        <ReleaseDay>01</ReleaseDay>
        <ReleaseDateDisplay>10/01/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="rdomingu@ece.neu.edu">Rodrigo Dominguez</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.ece.neu.edu/~rdomingu/gsoc09/NVIDIASummitPoster.pdf">Presentation</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Programming Tools</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Rodrigo Dominguez,rdomingu@ece.neu.edu</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>92df54f0-8995-4ad9-8a57-0e9ddfd14842</GUID>
        <Name>Computer Generated Hologram on GPU - Simple color electroholography reconstruction system -</Name>
        <ShortDescription>We have constructed a simple color electroholography system that has excellent cost performance. It uses a graphics processing unit (GPU) and a liquid crystal display (LCD) projector. The structure of the GPU is suitable for calculating computer-generated holograms (CGHs). The calculation speed of the GPU is approximately 1,500 times faster than that of a central processing unit (Intel Core 2 Duo 2.66 GHz; we used one core for the calculation). The LCD projector is an inexpensive, high-performance device for displaying CGHs. It has high-definition LCD panels for red, green and blue. Thus, it can be easily used for color electroholography. For a three-dimensional object consisting of 1,000 points, our system succeeded in real-time color holographic reconstruction at a rate of 30 frames per second.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/577_hologram_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/577_hologram_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType></OrganizationType>
        <OrganizationName>Chiba University / Shohoku College / Kisarazu National College of Technology</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>10</ReleaseMonth>
        <ReleaseDay>07</ReleaseDay>
        <ReleaseDateDisplay>10/07/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>1500</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="itot@faculty.chiba-u.jp">Tomoyoshi Ito</Author>
           <Author email="">Naoki Takada</Author>
           <Author email="">Tomoyoshi Shimobaba</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.opticsinfobase.org/oe/viewmedia.cfm?uri=oe-17-18-16038&amp;seq=1">Multimedia</ContentType>
           <ContentType url="http://www.opticsinfobase.org/oe/abstract.cfm?URI=oe-17-18-16038">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Imaging</ApplicationType>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Tomoyoshi Ito,Naoki Takada,Tomoyoshi Shimobaba,itot@faculty.chiba-u.jp</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>a8608fd3-c5f3-45fe-bc14-9438fefb2c62</GUID>
        <Name>CudaPad</Name>
        <ShortDescription>CudaPad is software that helps developers develop and test small kernels for NVIDIA's CUDA language. Sometimes in your IDE you will want a quick way to build or test a piece of CUDA code, and CudaPad lets you do it.  It shows the ptx code, cubin code, register count, errors and more on the fly.  There is no need to manually compile your code.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/576_CudaPad_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/576_CudaPad_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType></OrganizationType>
        <OrganizationName>CudaPad</OrganizationName>
        <OrganizationURL>http://cudapad.com/</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>08</ReleaseMonth>
        <ReleaseDay>23</ReleaseDay>
        <ReleaseDateDisplay>08/23/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="">CudaPad</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://cudapad.com/">Application</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Programming Tools</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>CudaPad</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>d91c3c63-a2d6-4a15-a70a-87bcafdd70d8</GUID>
        <Name>Real-time Parallel Hashing on the GPU</Name>
        <ShortDescription>We introduce an efficient data-parallel algorithm for building hash tables containing millions of elements in real-time on the GPU. Our two-tiered approach combines classical randomized perfect hashing and the recently introduced cuckoo hashing. Retrieval of any item requires checking at most three locations.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/575_paper_thumb_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/575_paper_thumb_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>University of California, Davis</OrganizationName>
        <OrganizationURL>http://idav.ucdavis.edu/</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>09</ReleaseMonth>
        <ReleaseDay>12</ReleaseDay>
        <ReleaseDateDisplay>09/12/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="dfalcantara@ucdavis.edu">Dan Anthony Alcantara</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://idav.ucdavis.edu/~dfalcant/research/hashing.php">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Graphics</ApplicationType>
           <ApplicationType>Libraries</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Dan Anthony Alcantara,dfalcantara@ucdavis.edu</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>b79e2f2b-047f-4497-bc71-8fae1e3bf2df</GUID>
        <Name>Real-time Robotic Surgery Platform with the GPU</Name>
        <ShortDescription>A Real-time Simulation, Guidance and Visualisation Platform for Intra-operative Minimally Invasive Robotic Surgery</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/574_robot-hotspot180_medium_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/574_robot-hotspot180_medium_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Imperial College London</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>10</ReleaseMonth>
        <ReleaseDay>06</ReleaseDay>
        <ReleaseDateDisplay>10/06/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>88</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="gzy@doc.ic.ac.uk">Guang-Zhong Yang</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://vip.doc.ic.ac.uk/gpu">Presentation</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>MedicalImaging</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Guang-Zhong Yang,gzy@doc.ic.ac.uk</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>aee7f189-cad9-4004-ab12-7af9e2dac705</GUID>
        <Name>Accelerating Virtual Texturing using CUDA</Name>
        <ShortDescription>Virtual texturing selectively loads parts of a large texture data set visible by the current view. Our poster shows how virtual texturing can be accelerated by using CUDA and OpenGL</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/573_cuda_zone_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/573_cuda_zone_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Ghent University - IBBT, ELIS Department/Multimedia Lab</OrganizationName>
        <OrganizationURL>http://multimedialab.elis.ugent.be/</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>09</ReleaseMonth>
        <ReleaseDay>30</ReleaseDay>
        <ReleaseDateDisplay>09/30/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="charlesfrederik.hollemeersch@ugent.be">Charles-Frederik Hollemeersch</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://multimedialab.elis.ugent.be/gpu/media/pdf/2009.09%20-%20GTC%202009%20-%20Charles-Frederik%20Hollemeersch%20et%20al.%20-%20Accelerating%20Virtual%20Texturing%20Using%20CUDA.pdf">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Graphics</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>virtual textures rendering,Charles-Frederik Hollemeersch,charlesfrederik.hollemeersch@ugent.be</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>dfe42cca-0549-462c-a8b4-6f7f2fdb17a8</GUID>
        <Name>Implementation in C+CUDA of Multi-Label Text Categorizers</Name>
        <ShortDescription>In automated multi-label text categorization problems with large numbers of labels, the training databases are large, which may render the categorization time prohibitive for online systems. In this work, we evaluate the parallel implementation in C+CUDA of two multi-label text categorizers: the first is based on the k-Nearest Neighbors (k-NN) algorithm and the second is based on Probabilistic Neural Networks (PNN). We implemented these algorithms in three different ways: sequential in C, parallel in C+CUDA, and parallel using the C+CUBLAS library.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/572_800px-Pnn_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/572_800px-Pnn_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Universidade Federal do Espirito Santo</OrganizationName>
        <OrganizationURL>http://www.ufes.br</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>08</ReleaseMonth>
        <ReleaseDay>03</ReleaseDay>
        <ReleaseDateDisplay>08/03/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>64</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="alberto@lcad.inf.ufes.br">Alberto F. De Souza et al.</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.lcad.inf.ufes.br/wiki/index.php/Estudos_em_programa%C3%A7%C3%A3o_paralela_em_CUDA#Categoriza.C3.A7.C3.A3o_de_Texto_em_C.2BCUDA">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Information Retrieval</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Alberto F. De Souza,alberto@lcad.inf.ufes.br</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>9e8ea1d4-1246-4b44-86aa-3eaeeec9bc0c</GUID>
        <Name>Biologically Inspired Stereoscopic Vision Model in C+CUDA</Name>
        <ShortDescription>Most of the depth perception processing is done in the visual cortex, mainly in the primary (V1) and medial temporal (MT) areas. In this work, we modeled the neural architecture of the V1 and MT cortices using as building blocks previous models of cortical cells and log-polar mapping. A sequential implementation of our model can build a tridimensional representation of the external world using stereoscopic image pairs obtained from a pair of fronto-parallel cameras. A C+CUDA parallel implementation is almost 60 times faster and allows real-time 3D reconstruction.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/571_800px-3d-hallysson_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/571_800px-3d-hallysson_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Universidade Federal do Espirito Santo</OrganizationName>
        <OrganizationURL>http://www.ufes.br</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>08</ReleaseMonth>
        <ReleaseDay>03</ReleaseDay>
        <ReleaseDateDisplay>08/03/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>57</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="alberto@lcad.inf.ufes.br">Alberto F. De Souza et al.</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.lcad.inf.ufes.br/wiki/index.php/Estudos_em_programa%C3%A7%C3%A3o_paralela_em_CUDA#Vis.C3.A3o_Est.C3.A9reo_em_C.2BCUDA_2">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Computer Vision</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Alberto F. De Souza,alberto@lcad.inf.ufes.br</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>ddb099b1-959c-4bf2-9254-ba51143125d4</GUID>
        <Name>ACCELERATING SPHERICAL HARMONIC TRANSFORMS ON THE NVIDIA GPU</Name>
        <ShortDescription>The Spherical Harmonic Transform is a critical computational kernel of the dynamics algorithms for numerical weather prediction and climate modeling. As atmospheric models push towards higher resolutions it has become necessary to accelerate this computationally intensive transform. Previous work has made attempts to parallelize and optimize the transform [1] [2] [3] [4], but none have exploited the advantages of NVIDIA's General Purpose Graphics Processing Unit (GPGPU), a very recent SIMD type architecture. This paper describes a CPU-GPU type implementation for computation of the Spherical Harmonic Transform. The implementation shows a gain in terms of computation time and a low error rate, when compared to the implementation discussed in [1].</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/570_soman_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/570_soman_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Department of Electrical Engineering University of Wisconsin, Madison, Wisconsin, USA</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2008</ReleaseYear>
        <ReleaseMonth>12</ReleaseMonth>
        <ReleaseDay>31</ReleaseDay>
        <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>42</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="">Vikrant Soman</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://homepages.cae.wisc.edu/~ece734/project/s09/soman_rpt.pdf">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Computational Fluid Dynamics</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Spherical Harmonic Transform, GPU, Parallel,Vikrant Soman</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>e22a4a2f-cf43-499a-8777-3570c85b9e60</GUID>
        <Name>CULATools</Name>
        <ShortDescription>CULA is EM Photonics' GPU-accelerated numerical linear algebra library that contains a growing list of LAPACK functions.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/569_cula-logo_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/569_cula-logo_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Commercial</OrganizationType>
        <OrganizationName>CULATools</OrganizationName>
        <OrganizationURL>http://www.culatools.com/</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>09</ReleaseMonth>
        <ReleaseDay>30</ReleaseDay>
        <ReleaseDateDisplay>09/30/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email=""></Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.culatools.com/">Application</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>200</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword></Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>3355d528-c9f0-4e35-a07e-da8ea95ddc35</GUID>
        <Name>Scalable Split Primitives for the GPU</Name>
        <ShortDescription>Fast Split and Sort Implementation for millions of input elements and supporting 32-128 bit key values</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/568_splitSort_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/568_splitSort_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>CVIT, IIIT Hyderabad</OrganizationName>
        <OrganizationURL>http://cvit.iiit.ac.in</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>07</ReleaseMonth>
        <ReleaseDay>15</ReleaseDay>
        <ReleaseDateDisplay>07/15/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType>Open source</SoftwareLicenseType>
        <Authors>
           <Author email="skp@research.iiit.ac.in">Suryakant Patidar</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://cvit.iiit.ac.in/projects/gpuproject/SplitSort/SplitSort.pdf">Paper</ContentType>
           <ContentType url="http://cvit.iiit.ac.in/projects/gpuproject/SplitSort/splitSort-v0.1.tar.gz">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Libraries</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Sort, Split,Suryakant Patidar,skp@research.iiit.ac.in</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>2f279ff5-7168-4acf-822b-72fd98b2cd76</GUID>
        <Name>FindCUDA.cmake</Name>
        <ShortDescription>Building on the open source project CMake, developers can now integrate CUDA C compilation directly into their Visual Studio, Makefile or XCode build systems. File level dependencies are supported, as well as many other features designed to help CUDA C files build as part of the native system. Starting with CMake 2.8, FindCUDA.cmake is part of the standard distribution.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/567_CMake-logo-high-res_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/567_CMake-logo-high-res_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Commercial</OrganizationType>
        <OrganizationName>NVIDIA Corp.</OrganizationName>
        <OrganizationURL>http://www.nvidia.com</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>09</ReleaseMonth>
        <ReleaseDay>30</ReleaseDay>
        <ReleaseDateDisplay>09/30/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType>Open source</SoftwareLicenseType>
        <Authors>
           <Author email="jbigler@nvidia.com">James Bigler</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.cmake.org/">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Programming Tools</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Build, CMake, Visual Studio, Makefile, XCode,James Bigler,jbigler@nvidia.com</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>46bb452f-bc32-4e3e-a9f8-ef2b42c975db</GUID>
        <Name>Cognitive developmental approach towards the realization of human-like visual scene understanding</Name>
        <ShortDescription>How do we humans understand visual scenes so easily and quickly? It is difficult to answer the question. However, human babies naturally acquire the ability to do it. Thus, imitating typical actions of babies would be promising for acquiring the ability of human-like visual scene understanding. Based on the above discussion, we propose a new framework of human-like visual scene understanding based on a cognitive developmental approach, and construct a prototype system that recognizes already known objects, detects and registers unknown objects in near real-time with CUDA technologies.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/566_poster2_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/566_poster2_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType></OrganizationType>
        <OrganizationName>NTT Communication Science Laboratories</OrganizationName>
        <OrganizationURL>http://www.kecl.ntt.co.jp</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>09</ReleaseMonth>
        <ReleaseDay>27</ReleaseDay>
        <ReleaseDateDisplay>09/27/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="akisato@ieee.org">Akisato Kimura</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.brl.ntt.co.jp/people/akisato/saliency4.html">Multimedia</ContentType>
           <ContentType url="http://www.brl.ntt.co.jp/people/akisato/saliency4.html">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Signal Processing</ApplicationType>
           <ApplicationType>Video &amp; Audio</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Cognitive developmental approach, visual scene understanding, saliency, video segmentation, CUDA,Akisato Kimura,akisato@ieee.org</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>2fc9604a-e291-46e4-9324-b90b1ec110b0</GUID>
        <Name>Stochastic Lagrangian Particle Model for Air Pollution</Name>
        <ShortDescription>The Graphics Processing Unit (GPU) is a powerful tool for parallel computing. In the past years the performance and capabilities of GPUs have increased, and the Compute Unified Device Architecture (CUDA) - a parallel computing architecture - has been developed by NVIDIA to utilize this performance in general purpose computations. Here we show for the first time a possible application of GPU for environmental studies serving as a basis for decision-making strategies. A stochastic Lagrangian particle model has been developed on CUDA to estimate the transport and the transformation of the radionuclides from a single point source during an accidental release. Our results show that parallel implementation achieves typical acceleration values in the order of 80-120 times compared to CPU using a single-threaded implementation on a 2.33 GHz desktop computer. Only very small differences have been found between the results obtained from GPU and CPU simulations, which are comparable with the effect of stochastic transport phenomena in atmosphere. The relatively high speedup with no additional costs to maintain this parallel architecture could result in a wide usage of GPU for diversified environmental applications in the near future.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/565_plume_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/565_plume_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Eotvos Lorand University</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>09</ReleaseMonth>
        <ReleaseDay>21</ReleaseDay>
        <ReleaseDateDisplay>09/21/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>120</SpeedUp>
        <SoftwareLicenseType>Open source</SoftwareLicenseType>
        <Authors>
           <Author email="mofi@elte.hu">Ferenc Molnar Jr.</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://nimbus.elte.hu/~cuda/">Application</ContentType>
           <ContentType url="http://nimbus.elte.hu/~cuda/">Paper</ContentType>
           <ContentType url="http://nimbus.elte.hu/~cuda/">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Computational Fluid Dynamics</ApplicationType>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Video card, Parallel computing, CUDA, Environmental application, Air pollution,Ferenc Molnar Jr.,mofi@elte.hu</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>bd8ebdba-8c09-413c-8c09-8cd67ec51ea5</GUID>
        <Name>SCGPSim: A fast SystemC Simulator on GPUs</Name>
        <ShortDescription>A SystemC simulator on GPUs</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/564_poster_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/564_poster_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>FERMAT Lab, Virginia Tech, Blacksburg, USA</OrganizationName>
        <OrganizationURL>http://www.fermat.ece.vt.edu/</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>10</ReleaseMonth>
        <ReleaseDay>01</ReleaseDay>
        <ReleaseDateDisplay>10/01/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>100</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="knmahesh@vt.edu">Mahesh Nanjundappa</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://filebox.vt.edu/users/knmahesh/index_files/nvidia_poster.pdf">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Electronic Design Automation</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>GPGPU, EDA, Parallel Simulation, SystemC,Mahesh Nanjundappa,knmahesh@vt.edu</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>6e6bb696-0ae8-49c7-b75f-982182e43b7e</GUID>
        <Name>Flowcart</Name>
        <ShortDescription>Flowball is an interactive game using dense optical flow computed in realtime on a Geforce GTX 280. We provide a video and optical flow libraries...</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/563_cuda_zone_flowcart_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/563_cuda_zone_flowcart_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Institute for Computer Graphics and Vision, Graz University of Technology</OrganizationName>
        <OrganizationURL>http://www.icg.tugraz.at/</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>09</ReleaseMonth>
        <ReleaseDay>02</ReleaseDay>
        <ReleaseDateDisplay>09/02/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="info@gpu4vision.org">Wolfgang Paier</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://gpu4vision.icg.tugraz.at/index.php?content=processing.php#pub36">Application</ContentType>
           <ContentType url="http://www.youtube.com/watch?v=Gnuscp3stF8">Multimedia</ContentType>
           <ContentType url="http://gpu4vision.icg.tugraz.at/index.php?content=processing.php#pub36">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Game Physics</ApplicationType>
           <ApplicationType>Graphics</ApplicationType>
           <ApplicationType>Video &amp; Audio</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Wolfgang Paier,info@gpu4vision.org</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>ccbd6aa9-f5a3-4310-bc01-4463d114ba04</GUID>
        <Name>CUDA Accelerated Sparse Field Level Set Segmentation of Large Medical Data Sets</Name>
        <ShortDescription>Segmentation of large medical volumes is an important task in diagnostic medicine. Computer assisted level set segmentation techniques have been shown to improve the accuracy of difficult segmentation tasks. We present a novel GPU accelerated level set segmentation algorithm that avoids redundant computations by only processing those voxels near the propagating level set surface. We evaluate the speed and accuracy of our algorithm by performing various segmentation tasks on a noisy magnetic resonance image (MRI) generated from the BrainWeb phantom dataset. We compare the performance of our algorithm to that of the previous best GPU and CPU algorithms. Compared to the previous best GPU algorithm, our algorithm reduces the total number of processed voxels by 16 times with a negligible effect on segmentation accuracy. Our algorithm converges 9 times faster than the previous best GPU algorithm and 360 times faster than the previous best CPU algorithm on identical hardware. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/562_level_set_growth_3D_3_images_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/562_level_set_growth_3D_3_images_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>University of Calgary</OrganizationName>
        <OrganizationURL>http://www.ucalgary.ca/</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>10</ReleaseMonth>
        <ReleaseDay>01</ReleaseDay>
        <ReleaseDateDisplay>10/01/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>360</SpeedUp>
        <SoftwareLicenseType>Commercial</SoftwareLicenseType>
        <Authors>
           <Author email="mlrobert@ucalgary.ca">Mike Roberts</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://pages.cpsc.ucalgary.ca/~mlrobert/data/Mike_Roberts_et_al_GTC.pdf">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>MedicalImaging</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>segmentation, level set, sparse field, narrow band,Mike Roberts,mlrobert@ucalgary.ca</Keyword>
        </Keywords>
     </Application>

     <Application>
        <GUID>b19208d8-3dcc-4b57-b8aa-993ed8261989</GUID>
        <Name>GPU accelerated Maximum Intensity Projection </Name>
        <ShortDescription>The "Maximum Intensity Projection" (MIP) is a computer visualization method in medicine that uses 3D data, e.g. CT or MRI, and computes a 2D view from a certain viewpoint.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/603_mip_filter2_small.gif</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/603_mip_filter2_large.gif</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Heidelberg University / Heilbronn University </OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2008</ReleaseYear>
        <ReleaseMonth>12</ReleaseMonth>
        <ReleaseDay>31</ReleaseDay>
        <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="crurik@ix.urz.uni-heidelberg.de">Clas Rurik</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://dev.spectratic.de/mip/index_en.html">Multimedia</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>MedicalImaging</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Clas Rurik,crurik@ix.urz.uni-heidelberg.de</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>5ad29b38-3310-42a2-830e-f315c5103602</GUID>
        <Name>Stochastic Lagrangian Particle Model for Air Pollution</Name>
        <ShortDescription>The Graphics Processing Unit (GPU) is a powerful tool for parallel computing. In the past years the performance and capabilities of GPUs have increased, and the Compute Unified Device Architecture (CUDA) - a parallel computing architecture - has been developed by NVIDIA to utilize this performance in general purpose computations. Here we show for the first time a possible application of GPU for environmental studies serving as a basis for decision-making strategies. A stochastic Lagrangian particle model has been developed on CUDA to estimate the transport and the transformation of the radionuclides from a single point source during an accidental release. Our results show that parallel implementation achieves typical acceleration values in the order of 80-120 times compared to CPU using a single-threaded implementation on a 2.33 GHz desktop computer. Only very small differences have been found between the results obtained from GPU and CPU simulations, which are comparable with the effect of stochastic transport phenomena in atmosphere. The relatively high speedup with no additional costs to maintain this parallel architecture could result in a wide usage of GPU for diversified environmental applications in the near future.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/602_plume_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/602_plume_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Eotvos Lorand University</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>09</ReleaseMonth>
        <ReleaseDay>21</ReleaseDay>
        <ReleaseDateDisplay>09/21/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>120</SpeedUp>
        <SoftwareLicenseType>Open source</SoftwareLicenseType>
        <Authors>
           <Author email="mofi@elte.hu">Ferenc Molnar Jr.</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://nimbus.elte.hu/~cuda/">Application</ContentType>
           <ContentType url="http://nimbus.elte.hu/~cuda/">Paper</ContentType>
           <ContentType url="http://nimbus.elte.hu/~cuda/">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Computational Fluid Dynamics</ApplicationType>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Video card, Parallel computing, CUDA, Environmental application, Air pollution,Ferenc Molnar Jr.,mofi@elte.hu</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>358bc116-6b7d-4598-a11a-bdad6cbd8e30</GUID>
        <Name>On the utility of graphics cards to perform massively parallel simulation of advanced Monte Carlo methods </Name>
        <ShortDescription>We present a case-study on the utility of graphics cards to perform massively parallel simulation of advanced Monte Carlo methods. Graphics cards, containing multiple Graphics Processing Units (GPUs), are self-contained parallel computational devices that can be housed in conventional desktop and laptop computers. For certain classes of Monte Carlo algorithms they offer massively parallel simulation, with the added advantage over conventional distributed multi-core processors that they are cheap, easily accessible, easy to maintain, easy to code, dedicated local devices with low power consumption. On a canonical set of stochastic simulation examples including population-based Markov chain Monte Carlo methods and Sequential Monte Carlo methods, we find speedups from 35 to 500 fold over conventional single-threaded computer code. Our findings suggest that GPUs have the potential to facilitate the growth of statistical modelling into complex data rich domains through the availability of cheap and accessible many-core computation. We believe the speedup we observe should motivate wider use of parallelizable simulation methods and greater methodological attention to their design. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/601_montecarlo_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/601_montecarlo_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Oxford-Man Institute</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>05</ReleaseMonth>
        <ReleaseDay>14</ReleaseDay>
        <ReleaseDateDisplay>05/14/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>500</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="lee@stats.ox.ac.uk">Anthony Lee</Author>
           <Author email="">Christopher Yau</Author>
           <Author email="">Michael B. Giles</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://arxiv.org/abs/0905.2441">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Sequential Monte Carlo, Population-Based Markov Chain Monte Carlo, General Purpose Computation on Graphics Processing Units, Many-Core Architecture, Stochastic Simulation, Parallel Processing,Anthony Lee,Christopher Yau,Michael B. Giles,lee@stats.ox.ac.uk</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>ed8e3b35-7db8-4c89-8cf7-a9366ce84bbe</GUID>
        <Name>FOLKI-GPU Optical Flow</Name>
        <ShortDescription>A very fast implementation of Optical flow (25fps for full HD res)</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/600_onera_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/600_onera_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName>ONERA</OrganizationName>
        <OrganizationURL>http://www.onera.fr/english.php</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>07</ReleaseMonth>
        <ReleaseDay>24</ReleaseDay>
        <ReleaseDateDisplay>07/24/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType>Open source</SoftwareLicenseType>
        <Authors>
           <Author email="aurelien.plyer@onera.fr">Aurelien Plyer</Author>
           <Author email="">Guy Le Besnerais</Author>
           <Author email="">Frederic Champagnat</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.onera.fr/dtim-en/gpu-for-image/folkigpu.php">Multimedia</ContentType>
           <ContentType url="http://www.onera.fr/dtim-en/gpu-for-image/folkigpu.php">Paper</ContentType>
           <ContentType url="http://www.onera.fr/dtim-en/gpu-for-image/folkigpu.php">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Imaging</ApplicationType>
           <ApplicationType>Video &amp; Audio</ApplicationType>
           <ApplicationType>computer vision</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>optical flow motion,Aurelien Plyer,Guy Le Besnerais,Frederic Champagnat,aurelien.plyer@onera.fr</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>aa156ca7-4c87-4d17-89d0-e51569250645</GUID>
        <Name>A Fast High Quality Pseudo Random Number Generator for NVIDIA CUDA</Name>
        <ShortDescription>Previously either due to hardware GPU limits or older versions of software, careful implementation of PRNGs was required to make good use of the limited numerical precision available on graphics cards. Newer nVidia G80 and Tesla hardware support double precision. This is available to high level programmers via CUDA. This allows a much simpler C++ implementation of Park-Miller random numbers, which provides a four fold speed up compared to an earlier GPU implementation. Code is available via ftp.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/599_graph_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/599_graph_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Department of Computer Science, CREST centre, Kings College, London</OrganizationName>
        <OrganizationURL>http://www.cs.ucl.ac.uk</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>01</ReleaseMonth>
        <ReleaseDay>01</ReleaseDay>
        <ReleaseDateDisplay>01/01/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="Wi11iam.Langdon@kcl.ac.uk">W. B. Langdon</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.cs.ucl.ac.uk/staff/W.Langdon/ftp/papers/langdon_2009_CIGPU.pdf">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Programming Tools</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>W. B. Langdon,Wi11iam.Langdon@kcl.ac.uk</Keyword>
        </Keywords>
     </Application>

     <Application>
        <GUID>cbe71302-afb1-4776-bb98-80fd8651b466</GUID>
        <Name>JUMP FLOODING ALGORITHM ON GRAPHICS HARDWARE AND ITS APPLICATIONS</Name>
        <ShortDescription>The graphics processing unit (GPU) has been developing at a very fast pace these few years. More and more researches have been done to utilize the ever increasing computability power of the GPU on general-purpose computations. This thesis proposes a new GPU algorithm - jump flooding algorithm (JFA). JFA is a new paradigm of communication between pixels on the GPU. It can quickly propagate the information of certain pixels to the others. The speed of JFA is exponentially faster than that of the standard flooding algorithm, and is approximately independent to the input size.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/597_progress_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/597_progress_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType></OrganizationType>
        <OrganizationName></OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2008</ReleaseYear>
        <ReleaseMonth>12</ReleaseMonth>
        <ReleaseDay>31</ReleaseDay>
        <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="">RONG GUODONG</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.comp.nus.edu.sg/~tants/jfa/rong-guodong-phd-thesis.pdf">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Imaging</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>RONG GUODONG</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>91755150-16a7-4570-9a2e-2b2e921d2baf</GUID>
        <Name>Many-Core Algorithms for Statistical Phylogenetics</Name>
        <ShortDescription>Statistical phylogenetics is computationally intensive, resulting in considerable attention meted on techniques for parallelization. Codon-based models allow for independent rates of synonymous and replacement substitutions and have the potential to more adequately model the process of protein coding sequence evolution with a resulting increase in phylogenetic accuracy. Unfortunately, due to the high number of codon states, computational burden has largely thwarted phylogenetic reconstruction under codon models, particularly at the genomic-scale. Here we describe novel algorithms and methods for evaluating phylogenies under arbitrary molecular evolutionary models on Graphics Processing Units (GPUs), making use of the large number of processing cores to efficiently parallelize calculations even for large state-size models. Results: 

We implement the approach in an existing Bayesian framework and apply the algorithms to estimating the phylogeny of 62 complete mitochondrial genomes of carnivores under a 60-state codon model. We see a near 90-fold speed increase over an optimized CPU-based computation and a >140-fold increase over the currently available implementation, making this the first practical use of codon models for phylogenetic inference over whole mitochondrial or microorganism genomes. 
</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/596_Phylogenetics_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/596_Phylogenetics_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType></OrganizationType>
        <OrganizationName>Department of Biomathematics, University of California, Los Angeles</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>04</ReleaseMonth>
        <ReleaseDay>15</ReleaseDay>
        <ReleaseDateDisplay>04/15/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>140</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="">Marc A. Suchard</Author>
           <Author email="">Andrew Rambaut</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://tree.bio.ed.ac.uk/publications/390/">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType></ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Marc A. Suchard,Andrew Rambaut</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>d753c609-c7e3-4ddc-b2c9-d054e3ab46dd</GUID>
        <Name>Speed Up SVM Algorithm for Massive Classification Tasks</Name>
        <ShortDescription>We present a new parallel and incremental Support Vector Machine (SVM) algorithm for the classification of very large datasets on graphics processing units (GPUs). SVM and kernel related methods have shown to build accurate models but the learning task usually needs a quadratic program so that this task for large datasets requires large memory capacity and long time. We extend a recent Least Squares SVM (LS-SVM) proposed by Suykens and Vandewalle for building incremental and parallel algorithm. The new algorithm uses graphics processors to gain high performance at low cost. Numerical test results on UCI and Delve dataset repositories showed that our parallel incremental algorithm using GPUs is about 70 times faster than a CPU implementation and often significantly faster (over 1000 times) than state-of-the-art algorithms like LibSVM, SVM-perf and CB-SVM.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/595_svm_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/595_svm_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>IRISA Symbiose, Campus de Beaulieu, 35042 Rennes Cedex, France</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2008</ReleaseYear>
        <ReleaseMonth>09</ReleaseMonth>
        <ReleaseDay>30</ReleaseDay>
        <ReleaseDateDisplay>09/30/2008</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="dtnghi@cit.ctu.edu.vn">Thanh-Nghi Do</Author>
           <Author email="vhnguyen@irisa.fr">Van-Hoa Nguyen</Author>
           <Author email="francois.poulet@irisa.fr">Francois Poulet</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.springerlink.com/content/rl34206jgr886444/">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Thanh-Nghi Do,Van-Hoa Nguyen,Francois Poulet,dtnghi@cit.ctu.edu.vn,vhnguyen@irisa.fr,francois.poulet@irisa.fr</Keyword>
        </Keywords>
     </Application>

     <Application>
        <GUID>040861ed-61a2-410f-907e-65e4a23b33a3</GUID>
        <Name>Visualizing Multiwavelength Astrophysical Data</Name>
        <ShortDescription>With recent advances in the measurement technology for allsky astrophysical imaging, our view of the sky is no longer limited to the tiny visible spectral range over the 2D Celestial sphere. We now can access a third dimension corresponding to a broad electromagnetic spectrum with a wide range of allsky surveys; these surveys span frequency bands including long wavelength radio, microwaves, very short X-rays, and gamma rays. These advances motivate us to study and examine multiwavelength visualization techniques to maximize our capabilities to visualize and exploit these informative image data sets. In this work, we begin with the processing of the data themselves, uniformizing the representations and units of raw data obtained from varied detector sources. Then we apply tools to map, convert, color-code, and format the multiwavelength data in forms useful for applications. We explore different visual representations for displaying the data, including such methods as textured image stacks, the horseshoe representation, and GPU-based volume visualization. A family of visual tools and analysis methods are introduced to explore the data, including interactive data mapping on the graphics processing unit (GPU), the mini-map explorer, and GPU-based interactive feature analysis. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/593_title_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/593_title_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>The Hong Kong University of Science and Technology</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2008</ReleaseYear>
        <ReleaseMonth>12</ReleaseMonth>
        <ReleaseDay>01</ReleaseDay>
        <ReleaseDateDisplay>12/01/2008</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="">Hongwei Li</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www2.computer.org/portal/web/csdl/doi/10.1109/TVCG.2008.182">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Imaging</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Hongwei Li</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>489165bd-a529-412c-bb5e-0230b77d02f9</GUID>
        <Name>A GPU based real-time software correlation system for the Murchison Widefield Array prototype.</Name>
        <ShortDescription>Modern graphics processing units (GPUs) are inexpensive commodity hardware that offer Tflop/s theoretical computing capacity. GPUs are well suited to many compute-intensive tasks including digital signal processing. We describe the implementation and performance of a GPU-based digital correlator for radio astronomy. The correlator is implemented using the NVIDIA CUDA development environment. We evaluate three design options on two generations of NVIDIA hardware. The different designs utilize the internal registers, shared memory and multiprocessors in different ways. We find that optimal performance is achieved with the design that minimizes global memory reads on recent generations of hardware. The GPU-based correlator outperforms a single-threaded CPU equivalent by a factor of 60 for a 32 antenna array, and runs on commodity PC hardware. The extra compute capability provided by the GPU maximises the correlation capability of a PC while retaining the fast development time associated with using standard hardware, networking and programming languages. In this way, a GPU-based correlation system represents a middle ground in design space between high performance, custom built hardware and pure CPU-based software correlation. The correlator was deployed at the Murchison Widefield Array 32 antenna prototype system where it ran in real-time for extended periods. We briefly describe the data capture, streaming and correlation system for the prototype array.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/592_bar_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/592_bar_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Harvard-Smithsonian Center for Astrophysics</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2008</ReleaseYear>
        <ReleaseMonth>12</ReleaseMonth>
        <ReleaseDay>31</ReleaseDay>
        <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="rwayth@cfa.harvard.edu">Randall B. Wayth</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://mwatelescope.org/info/documents_7_1910002104.pdf">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Science</ApplicationType>
           <ApplicationType>Signal Processing</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Randall B. Wayth,rwayth@cfa.harvard.edu</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>01e2e4a7-8b67-47c9-80b4-c4b0b40c66e7</GUID>
        <Name>Asymptotic theorems of sequential estimation-adjusted urn models</Name>
        <ShortDescription>The Generalized Pólya Urn (GPU) is a popular urn model which is widely used in many disciplines. In particular, it is extensively used in treatment allocation schemes in clinical trials. In this paper, we propose a sequential estimation-adjusted urn model (a nonhomogeneous GPU) which has a wide spectrum of applications. Because the proposed urn model depends on sequential estimations of unknown parameters, the derivation of asymptotic properties is mathematically intricate and the corresponding results are unavailable in the literature. We overcome these hurdles and establish the strong consistency and asymptotic normality for both the patient allocation and the estimators of unknown parameters, under some widely satisfied conditions. These properties are important for statistical inferences and they are also useful for the understanding of the urn limiting process. A superior feature of our proposed model is its capability to yield limiting treatment proportions according to any desired allocation target. The applicability of our model is illustrated with a number of examples. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/591_formula_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/591_formula_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Zhejiang University</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2006</ReleaseYear>
        <ReleaseMonth>03</ReleaseMonth>
        <ReleaseDay>14</ReleaseDay>
        <ReleaseDateDisplay>03/14/2006</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="">Li-X. Zhang</Author>
           <Author email="">Feifang Hu</Author>
           <Author email="">Siu Hung Cheung</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://arxiv.org/abs/math/0603329">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Li-X. Zhang,Feifang Hu,Siu Hung Cheung</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>8fd8d414-8e27-4d33-b9b7-ec084a06aeb4</GUID>
        <Name>High Performance Direct Gravitational N-body Simulations</Name>
        <ShortDescription>We present the results of gravitational direct $N$-body simulations using the commercial graphics processing units (GPU) NVIDIA Quadro FX1400 and GeForce 8800GTX, and compare the results with GRAPE-6Af special purpose hardware. The force evaluation of the $N$-body problem was implemented in Cg using the GPU directly to speed-up the calculations. The integration of the equations of motions were, running on the host computer, implemented in C using the 4th order predictor-corrector Hermite integrator with block time steps. We find that for a large number of particles ($N \gtrsim 10^4$) modern graphics processing units offer an attractive low cost alternative to GRAPE special purpose hardware. A modern GPU continues to give a relatively flat scaling with the number of particles, comparable to that of the GRAPE. Using the same time step criterion the total energy of the $N$-body system was conserved better than to one in $10^6$ on the GPU, which is only about an order of magnitude worse than obtained with GRAPE. For $N \gtrsim 10^6$ the GeForce 8800GTX was about 20 times faster than the host computer. Though still about an order of magnitude slower than GRAPE, modern GPU's outperform GRAPE in their low cost, long mean time between failure and the much larger onboard memory; the GRAPE-6Af holds at most 256k particles whereas the GeForce 8800GTX can hold 9 million particles in memory. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/590_graph_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/590_graph_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Section Computational Science, University of Amsterdam, Amsterdam, The Netherlands</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>02</ReleaseMonth>
        <ReleaseDay>23</ReleaseDay>
        <ReleaseDateDisplay>02/23/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="">Simon Portegies Zwart</Author>
           <Author email="">Robert Belleman</Author>
           <Author email="">Peter Geldof</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://arxiv.org/abs/cs/0702135">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Simon Portegies Zwart,Robert Belleman,Peter Geldof</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>abb3c32a-1e92-4553-9a7a-812aaa364adb</GUID>
        <Name>Graphic processors to speed-up simulations for the design of high performance solar receptors</Name>
        <ShortDescription>Graphics Processing Units (GPUs) are now powerful and flexible systems adapted and used for other purposes than graphics calculations (General Purpose computation on GPU -- GPGPU). We present here a prototype to be integrated into simulation codes that estimate temperature, velocity and pressure to design next generations of solar receptors. Such codes will delegate to our contribution on GPUs the computation of heat transfers due to radiations. We use Monte-Carlo line-by-line ray-tracing through finite volumes. This means data-parallel arithmetic transformations on large data structures. Our prototype is inspired on the source code of GPUBench. Our performances on two recent graphics cards (Nvidia 7800GTX and ATI RX1800XL) show some speed-up higher than 400 compared to CPU implementations leaving most of CPU computing resources available. As there were some questions pending about the accuracy of the operators implemented in GPUs, we start this report with a survey and some contributed tests on the various floating point units available on GPUs. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/589_model_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/589_model_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>ELIAUS, UPVD</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2007</ReleaseYear>
        <ReleaseMonth>03</ReleaseMonth>
        <ReleaseDay>06</ReleaseDay>
        <ReleaseDateDisplay>03/06/2007</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>420</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="firstname.lastname@univ-perp.fr">Sylvain Collange</Author>
           <Author email="">Marc Daumas</Author>
           <Author email="">David Defour</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://arxiv.org/abs/cs/0703028">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Graphics</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Sylvain Collange,Marc Daumas,David Defour,firstname.lastname@univ-perp.fr</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>003d7f3b-356c-4876-81b4-c207d76b6bf2</GUID>
        <Name>nHD</Name>
        <ShortDescription>nHD is a multi-GPU 2nd order full Godunov three-dimensional uniform-mesh Euler equations solver for calorically ideal, compressible gas. nHD uses CUDA with MPI and runs on a cluster of multi-GPU machines to accelerate computational hydrodynamics calculations. Full Godunov method solves the hydrodynamic equations by discretizing the fluid and calculating the nonlinear evolution of the discretized distribution, using the analytic solutions for Riemann problems. Thus full Godunov method can resolve arbitrary severe shocks with minimum artificial dissipation and oscillation, and is the irreplaceable method for simulations of compressible fluid, where shocks and vacuums are naturally generated.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/588_nHD7_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/588_nHD7_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Department of Physics, Kyoto University</OrganizationName>
        <OrganizationURL>http://www.scphys.kyoto-u.ac.jp/index_e.html</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>09</ReleaseMonth>
        <ReleaseDay>20</ReleaseDay>
        <ReleaseDateDisplay>09/20/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>173</SpeedUp>
        <SoftwareLicenseType>Open source</SoftwareLicenseType>
        <Authors>
           <Author email="muranushi@gmail.com">Takayuki Muranushi</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://code.google.com/p/astro-attic/wiki/NHDIntroduction">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Computational Fluid Dynamics</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Computational Hydrodynamics, Full Godunov Method,Takayuki Muranushi,muranushi@gmail.com</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>9369afee-d78a-4b20-a092-c689a4a40301</GUID>
        <Name>SCELib3.0</Name>
        <ShortDescription>SCELib is a computer program which implements the Single Center Expansion (SCE) method to describe molecular electronic densities and the interaction potentials between a charged projectile (electron or positron) and a target molecular system. The first version (CPC Catalog identifier ADMG_v1_0) was submitted to the CPC Program Library in 2000, and version 2.0 (ADMG_v2_0) was submitted in 2004. We here announce the new release 3.0 which presents additional features with respect to the previous versions, aimed at significantly enhancing its capabilities to deal with larger molecular systems. SCELib 3.0 allows for ab initio effective core potential (ECP) calculations of the molecular wavefunctions to be used in the SCE method in addition to the standard all-electron description of the molecule. The list of supported architectures has been updated and the code has been ported to platforms based on accelerating coprocessors, such as the NVIDIA GPGPU and the new parallel model adopted is able to efficiently run on a mixed many-core computing system. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/587_Ribose_toc_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/587_Ribose_toc_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName>CASPUR, Consortium for Supercomputing in Research</OrganizationName>
        <OrganizationURL>http://www.caspur.it</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>07</ReleaseMonth>
        <ReleaseDay>25</ReleaseDay>
        <ReleaseDateDisplay>07/25/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>177</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="n.sanna@caspur.it">Nico Sanna</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.sciencedirect.com/science?_ob=ArticleURL&amp;_udi=B6TJ5-4WVF6SC-1&amp;_user=2814622&amp;_rdoc=1&amp;_fmt=&amp;_orig=search&amp;_sort=d&amp;_docanchor=&amp;view=c&amp;_acct=C000058858&amp;_version=1&amp;_urlVersion=0&amp;_userid=2814622&amp;md5=6b3f73ac472d1df2a6fcf78cda6525b2">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Nico Sanna,n.sanna@caspur.it</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>9a45c9f4-df80-4c98-b6b4-98def8807dd4</GUID>
        <Name>Black holes on GPUs</Name>
        <ShortDescription>This paper describes a parallel implementation of Monte Carlo simulations using the post-Newtonian equations of motion to model black holes. We use these simulations to investigate the phase space of binary black hole systems.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/586_blackhole_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/586_blackhole_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>University of Maryland</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>08</ReleaseMonth>
        <ReleaseDay>27</ReleaseDay>
        <ReleaseDateDisplay>08/27/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>50</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="tiglio@umd.edu">Frank Herrmann</Author>
           <Author email="">John Silberholz</Author>
           <Author email="">Matias Bellone / Gustavo Guerberoff / Manuel Tiglio</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://arXiv.org/pdf/0908.3889v2">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Life Sciences</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Frank Herrmann,John Silberholz,Matias Bellone,Gustavo Guerberoff,Manuel Tiglio,tiglio@umd.edu</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>16c382b3-7218-4288-a261-523470b8c535</GUID>
        <Name>GPU accelerated analysis of financial markets</Name>
        <ShortDescription>The compute unified device architecture is an almost conventional programming approach for managing computations on a graphics processing unit (GPU) as a data-parallel computing device. With a maximum number of 240 cores in combination with a high memory bandwidth, a recent GPU offers resources for computational physics. We apply this technology to methods of fluctuation analysis, which includes determination of the scaling behavior of a stochastic process and the equilibrium autocorrelation function. Additionally, the recently introduced pattern formation conformity (Preis T et al 2008 Europhys. Lett. 82 68005), which quantifies pattern-based complex short-time correlations of a time series, is calculated on a GPU and analyzed in detail. Results are obtained up to 84 times faster than on a current central processing unit core. When we apply this method to high-frequency time series of the German BUND future, we find significant pattern-based correlations on short time scales.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/585_financial_markets_small.gif</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/585_financial_markets_large.gif</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Johannes Gutenberg University Mainz</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>09</ReleaseMonth>
        <ReleaseDay>16</ReleaseDay>
        <ReleaseDateDisplay>09/16/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>80</SpeedUp>
        <SoftwareLicenseType>Open source</SoftwareLicenseType>
        <Authors>
           <Author email="preis@uni-mainz.de">Tobias Preis</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.tobiaspreis.de/">Multimedia</ContentType>
           <ContentType url="http://www.tobiaspreis.de/">Paper</ContentType>
           <ContentType url="http://www.tobiaspreis.de/">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Finance</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Tobias Preis,preis@uni-mainz.de</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>c462ebc4-646d-4eaf-9714-144678d49528</GUID>
        <Name>Fast recursive filters for simulating nonlinear dynamic systems</Name>
        <ShortDescription>A fast and accurate computational scheme for simulating nonlinear dynamic systems is presented. The scheme assumes that the system can be represented by a combination of components of only two different types: first-order low-pass filters and static nonlinearities. The parameters of these filters and nonlinearities may depend on system variables, and the topology of the system may be complex, including feedback. Several examples taken from neuroscience are given: phototransduction, photopigment bleaching, and spike generation according to the Hodgkin-Huxley equations. The scheme uses two slightly different forms of autoregressive filters, with an implicit delay of zero for feedforward control and an implicit delay of half a sample distance for feedback control. On a fairly complex model of the macaque retinal horizontal cell it computes, for a given level of accuracy, 1-2 orders of magnitude faster than 4th-order Runge-Kutta. The computational scheme has minimal memory requirements, and is also suited for computation on a stream processor, such as a GPU (Graphical Processing Unit). </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/584_nuclear_small.gif</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/584_nuclear_large.gif</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Netherlands Institute for Neuroscience</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2007</ReleaseYear>
        <ReleaseMonth>04</ReleaseMonth>
        <ReleaseDay>11</ReleaseDay>
        <ReleaseDateDisplay>04/11/2007</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="">J. H. van Hateren</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://arxiv.org/abs/0704.1362">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Imaging</ApplicationType>
           <ApplicationType>Life Sciences</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>J. H. van Hateren</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>4b683456-f4de-488d-b8f6-6e9a8607538f</GUID>
        <Name>N-Body Simulations on GPUs</Name>
        <ShortDescription>Commercial graphics processors (GPUs) have high compute capacity at very low cost, which makes them attractive for general purpose scientific computing. In this paper we show how graphics processors can be used for N-body simulations to obtain improvements in performance over current generation CPUs. We have developed a highly optimized algorithm for performing the O(N^2) force calculations that constitute the major part of stellar and molecular dynamics simulations. In some of the calculations, we achieve sustained performance of nearly 100 GFlops on an ATI X1900XTX. The performance on GPUs is comparable to specialized processors such as GRAPE-6A and MDGRAPE-3, but at a fraction of the cost. Furthermore, the wide availability of GPUs has significant implications for cluster computing and distributed computing efforts like Folding@Home. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/583_nbody_small.gif</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/583_nbody_large.gif</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Stanford University</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2007</ReleaseYear>
        <ReleaseMonth>06</ReleaseMonth>
        <ReleaseDay>20</ReleaseDay>
        <ReleaseDateDisplay>06/20/2007</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="pande@stanford.edu">Erich Elsen</Author>
           <Author email="">V. Vishal</Author>
           <Author email="">Mike Houston</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://arxiv.org/abs/0706.3060">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Life Sciences</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Erich Elsen,V. Vishal,Mike Houston,pande@stanford.edu</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>e5271230-c663-4fb6-bf23-997f7563256e</GUID>
        <Name>High Performance Direct Gravitational N-body Simulations</Name>
        <ShortDescription>We present the results of gravitational direct $N$-body simulations using the Graphics Processing Unit (GPU) on a commercial NVIDIA GeForce 8800GTX designed for gaming computers. The force evaluation of the $N$-body problem is implemented in ``Compute Unified Device Architecture'' (CUDA) using the GPU to speed-up the calculations. We tested the implementation on three different $N$-body codes: two direct $N$-body integration codes, using the 4th order predictor-corrector Hermite integrator with block time-steps, and one Barnes-Hut treecode, which uses a 2nd order leapfrog integration scheme. The integration of the equations of motions for all codes is performed on the host CPU. We find that for $N > 512$ particles the GPU outperforms the GRAPE-6Af, if some softening in the force calculation is accepted. Without softening and for very small integration time steps the GRAPE still outperforms the GPU. We conclude that modern GPUs offer an attractive alternative to GRAPE-6Af special purpose hardware. Using the same time-step criterion, the total energy of the $N$-body system was conserved better than to one in $10^6$ on the GPU, only about an order of magnitude worse than obtained with GRAPE-6Af. For $N \gtrsim 10^5$ the 8800GTX outperforms the host CPU by a factor of about 100 and runs at about the same speed as the GRAPE-6Af. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/582_nbody_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/582_nbody_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Section Computational Science, University of Amsterdam, Amsterdam, The Netherlands</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2007</ReleaseYear>
        <ReleaseMonth>07</ReleaseMonth>
        <ReleaseDay>06</ReleaseDay>
        <ReleaseDateDisplay>07/06/2007</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="">Robert G. Belleman</Author>
           <Author email="">Jeroen Bedorf</Author>
           <Author email="">Simon Portegies Zwart </Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://arxiv.org/abs/0707.0438">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Robert G. Belleman,Jeroen Bedorf,Simon Portegies Zwart </Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>a89487d4-5b55-47ad-9a1c-38363d7c0e04</GUID>
        <Name>Developing and Deploying Advanced Algorithms to Novel Supercomputing Hardware</Name>
        <ShortDescription>The objective of our research is to demonstrate the practical usage and orders of magnitude speedup of real-world applications by using alternative technologies to support high performance computing. Currently, the main barrier to the widespread adoption of this technology is the lack of development tools and case studies that typically impede non-specialists that might otherwise develop applications that could leverage these technologies. By partnering with the Innovative Systems Laboratory at the National Center for Supercomputing, we have obtained access to several novel technologies, including several Field-Programmable Gate Array (FPGA) systems, NVidia Graphics Processing Units (GPUs), and the STI Cell BE platform. Our goal is to not only demonstrate the capabilities of these systems, but to also serve as guides for others to follow in our path. To date, we have explored the efficacy of the SRC-6 MAP-C and MAP-E and SGI RASC Athena and RC100 reconfigurable computing platforms in supporting a two-point correlation function which is used in a number of different scientific domains. In a brute force test, the FPGA based single-processor system has achieved an almost two orders of magnitude speedup over a single-processor CPU system. We are now developing implementations of this algorithm on other platforms, including one using a GPU. Given the considerable efforts of the cosmology community in optimizing these classes of algorithms, we are currently working to implement an optimized version of the basic family of correlation functions by using tree-based data structures. Finally, we are also exploring other algorithms, such as instance-based classifiers, power spectrum estimators, and higher-order correlation functions that are also commonly used in a wide range of scientific disciplines.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/581_tesla_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/581_tesla_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>National Center for Supercomputing Applications, University of Illinois at Urbana-Champaign</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2007</ReleaseYear>
        <ReleaseMonth>11</ReleaseMonth>
        <ReleaseDay>21</ReleaseDay>
        <ReleaseDateDisplay>11/21/2007</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>25</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="rb@astro.uiuc.edu">Robert J. Brunner</Author>
           <Author email="">Volodymyr V. Kindratenko</Author>
           <Author email="">Adam D. Myers</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://arxiv.org/abs/0711.3414">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Robert J. Brunner,Volodymyr V. Kindratenko,Adam D. Myers,rb@astro.uiuc.edu</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>19b10e9a-f467-4538-8587-8594b128eeda</GUID>
        <Name>Fast k Nearest Neighbor Search </Name>
        <ShortDescription>The recent improvements of graphics processing units (GPU) offer to the computer vision community a powerful processing platform. Indeed, a lot of highly-parallelizable computer vision problems can be significantly accelerated using GPU architecture. Among these algorithms, the k nearest neighbor search (KNN) is a well-known problem linked with many applications such as classification, estimation of statistical properties, etc. The main drawback of this task lies in its computation burden, as it grows polynomially with the data size. In this paper, we show that the use of the NVIDIA CUDA API accelerates the search for the KNN up to a factor of 120.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/580_dots_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/580_dots_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName></OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>04</ReleaseMonth>
        <ReleaseDay>09</ReleaseDay>
        <ReleaseDateDisplay>04/09/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>120</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="">Vincent Garcia and Eric Debreuve and Michel Barlaud</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://arxiv.org/abs/0804.1448">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Vincent Garcia and Eric Debreuve and Michel Barlaud</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>8890dacc-2905-41ac-a0bd-4efc292db999</GUID>
        <Name>A multiphysics and multiscale software environment for modeling astrophysical systems</Name>
        <ShortDescription>We present MUSE, a software framework for combining existing computational tools for different astrophysical domains into a single multiphysics, multiscale application. MUSE facilitates the coupling of existing codes written in different languages by providing inter-language tools and by specifying an interface between each module and the framework that represents a balance between generality and computational efficiency. This approach allows scientists to use combinations of codes to solve highly-coupled problems without the need to write new codes for other domains or significantly alter their existing codes. MUSE currently incorporates the domains of stellar dynamics, stellar evolution and stellar hydrodynamics for studying generalized stellar systems. We have now reached a "Noah's Ark" milestone, with (at least) two available numerical solvers for each domain. MUSE can treat multi-scale and multi-physics systems in which the time- and size-scales are well separated, like simulating the evolution of planetary systems, small stellar associations, dense stellar clusters, galaxies and galactic nuclei. In this paper we describe three examples calculated using MUSE: the merger of two galaxies, the merger of two evolving stars, and a hybrid N-body simulation. In addition, we demonstrate an implementation of MUSE on a distributed computer which may also include special-purpose hardware, such as GRAPEs or GPUs, to accelerate computations. The current MUSE code base is publicly available as open source at http://muse.li/.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/579_sidexside_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/579_sidexside_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>University of Amsterdam, Amsterdam, The Netherlands</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2008</ReleaseYear>
        <ReleaseMonth>07</ReleaseMonth>
        <ReleaseDay>12</ReleaseDay>
        <ReleaseDateDisplay>07/12/2008</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="">Simon Portegies Zwart</Author>
           <Author email="">Steve McMillan</Author>
           <Author email="">Stefan Harfst </Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://arxiv.org/abs/0807.1996">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Simon Portegies Zwart,Steve McMillan,Stefan Harfst </Keyword>
        </Keywords>
     </Application>

      <Application>
        <GUID>0ba0bc17-1da4-46b9-8af0-b885bd619e74</GUID>
        <Name>Accelerating Scientific Computations with Mixed Precision Algorithms</Name>
        <ShortDescription>On modern architectures, the performance of 32-bit operations is often at least twice as fast as the performance of 64-bit operations. By using a combination of 32-bit and 64-bit floating point arithmetic, the performance of many dense and sparse linear algebra algorithms can be significantly enhanced while maintaining the 64-bit accuracy of the resulting solution. The approach presented here can apply not only to conventional processors but also to other technologies such as Field Programmable Gate Arrays (FPGA), Graphical Processing Units (GPU), and the STI Cell BE processor. Results on modern processor architectures and the STI Cell BE are presented. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/578_c_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/578_c_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Department of Mathematics, University of Coimbra, Coimbra, Portugal</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2008</ReleaseYear>
        <ReleaseMonth>08</ReleaseMonth>
        <ReleaseDay>20</ReleaseDay>
        <ReleaseDateDisplay>08/20/2008</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>15</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="">Marc Baboulin</Author>
           <Author email="">Alfredo Buttari</Author>
           <Author email="">Jack Dongarra</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://arxiv.org/abs/0808.2794">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Marc Baboulin,Alfredo Buttari,Jack Dongarra</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>cfb65cc8-2394-4a3f-8658-4d117f3a3953</GUID>
        <Name>Parallel GPU Implementation of Iterative PCA Algorithms</Name>
        <ShortDescription>Principal component analysis (PCA) is a key statistical technique for multivariate data analysis. For large data sets the common approach to PCA computation is based on the standard NIPALS-PCA algorithm, which unfortunately suffers from loss of orthogonality, and therefore its applicability is usually limited to the estimation of the first few components. Here we present an algorithm based on Gram-Schmidt orthogonalization (called GS-PCA), which eliminates this shortcoming of NIPALS-PCA. Also, we discuss the GPU (Graphics Processing Unit) parallel implementation of both NIPALS-PCA and GS-PCA algorithms. The numerical results show that the GPU parallel optimized versions, based on CUBLAS (NVIDIA) are substantially faster (up to 12 times) than the CPU optimized versions based on CBLAS (GNU Scientific Library). </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/577_pca_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/577_pca_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Institute for Biocomplexity and Informatics, University of Calgary</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>11</ReleaseMonth>
        <ReleaseDay>07</ReleaseDay>
        <ReleaseDateDisplay>11/07/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>12</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="">M. Andrecut</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://arxiv.org/abs/0811.1081">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>M. Andrecut</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>9e8271a2-4d5e-4e08-868d-fb8c0e0eb80a</GUID>
        <Name>Recent algorithm and machine developments for lattice QCD</Name>
        <ShortDescription>I review recent machine trends and algorithmic developments for dynamical lattice QCD simulations with the HMC algorithm for Wilson-type fermions. The topics include the trend toward multi-core processors and general purpose GPU (GPGPU) computing, and improvements on the quark determinant preconditioning, molecular dynamics integrator, and quark solvers. I also discuss the prospect on the use of these techniques on the forthcoming petaflops machines. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/576_ps_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/576_ps_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Graduate School of Science, Hiroshima University, Higashi-Hiroshima, Hiroshima 739-8526, Japan.</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>11</ReleaseMonth>
        <ReleaseDay>11</ReleaseDay>
        <ReleaseDateDisplay>11/11/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="ishikawa@theo.phys.sci.hiroshima-u.ac.jp">Ken-Ichi Ishikawa</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://arxiv.org/abs/0811.1661">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Ken-Ichi Ishikawa,ishikawa@theo.phys.sci.hiroshima-u.ac.jp</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>58941ce0-0754-418a-9235-d94fbd05b96f</GUID>
        <Name>Interactive Visualization of Billion Point Cosmological Simulations</Name>
        <ShortDescription>Despite the recent advances in graphics hardware capabilities, a brute force approach is incapable of interactively displaying terabytes of data. We have implemented a system that uses hierarchical level-of-detailing for the results of cosmological simulations, in order to display visually accurate results without loading in the full dataset (containing over 10 billion points). The guiding principle of the program is that the user should not be able to distinguish what they are seeing from a full rendering of the original data. Furthermore, by using a tree-based system for levels of detail, the size of the underlying data is limited only by the capacity of the IO system containing it. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/575_space_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/575_space_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>California Institute of Technology, California Ave, 91126, Pasadena, CA</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2008</ReleaseYear>
        <ReleaseMonth>11</ReleaseMonth>
        <ReleaseDay>13</ReleaseDay>
        <ReleaseDateDisplay>11/13/2008</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="">Tamas Szalay</Author>
           <Author email="">Volker Springel</Author>
           <Author email="">Gerard Lemson</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://arxiv.org/ftp/arxiv/papers/0811/0811.2055.pdf">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Imaging</ApplicationType>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Tamas Szalay,Volker Springel,Gerard Lemson</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>8b031bf9-1fc2-4a6b-a94f-fc9d93433d19</GUID>
        <Name>Parallel Algorithm for Solving Kepler's Equation on Graphics Processing Units: Application to Analysis of Doppler Exoplanet Searches</Name>
        <ShortDescription>We present the results of a highly parallel Kepler equation solver using the Graphics Processing Unit (GPU) on a commercial nVidia GeForce 280GTX and the "Compute Unified Device Architecture" programming environment. We apply this to evaluate a goodness-of-fit statistic (e.g., chi^2) for Doppler observations of stars potentially harboring multiple planetary companions (assuming negligible planet-planet interactions). We tested multiple implementations using single precision, double precision, pairs of single precision, and mixed precision arithmetic. We find that the vast majority of computations can be performed using single precision arithmetic, with selective use of compensated summation for increased precision. However, standard single precision is not adequate for calculating the mean anomaly from the time of observation and orbital period when evaluating the goodness-of-fit for real planetary systems and observational data sets. Using all double precision, our GPU code outperforms a similar code using a modern CPU by a factor of over 60. Using mixed-precision, our GPU code provides a speed-up factor of over 600, when evaluating N_sys > 1024 models planetary systems each containing N_pl = 4 planets and assuming N_obs = 256 observations of each system. We conclude that modern GPUs also offer a powerful tool for repeatedly evaluating Kepler's equation and a goodness-of-fit statistic for orbital models when presented with a large parameter space. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/574_KeplersEquation_small.gif</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/574_KeplersEquation_large.gif</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Department of Astronomy, University of Florida</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2008</ReleaseYear>
        <ReleaseMonth>12</ReleaseMonth>
        <ReleaseDay>16</ReleaseDay>
        <ReleaseDateDisplay>12/16/2008</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>600</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="">Eric B. Ford</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://arxiv.org/abs/0812.2976">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>gravitation,planetary systems,methods: numerical,techniques:radial velocities,Eric B. Ford</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>c8752f5a-9c9d-4f67-9779-6c0ffbd62c22</GUID>
        <Name>Differential Equations for Monte Carlo Recycling and a GPU-Optimized Normal Quantile</Name>
        <ShortDescription>This article presents differential equations and solution methods for the functions of the form $A(z) = F^{-1}(G(z))$, where $F$ and $G$ are cumulative distribution functions. Such functions allow the direct recycling of samples from one distribution into samples from another. The method may be developed analytically for certain special cases, and illuminate the idea that it is a more precise form of the traditional Cornish-Fisher expansion. In this manner the model risk of distributional risk may be assessed free of the Monte Carlo noise associated with resampling. The method may also be regarded as providing both analytical and numerical bases for doing more precise Cornish-Fisher transformations. Examples are given of equations for converting normal samples to Student t, and converting exponential to hyperbolic, variance gamma and normal. In the case of the normal distribution, the change of variables employed allows the sampling to take place to good accuracy based on a single rational approximation over a very wide range of the sample space. The avoidance of any branching statement is of use in optimal GPU computations, and we give example of branch-free normal quantiles that offer performance improvements in a GPU environment, while retaining the precision characteristics of well-known methods. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/573_montecarlo_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/573_montecarlo_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Department of Mathematics, King's College, The Strand, London WC2R 2LS, England</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>01</ReleaseMonth>
        <ReleaseDay>06</ReleaseDay>
        <ReleaseDateDisplay>01/06/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="william.shaw@kcl.ac.uk">William T. Shaw</Author>
           <Author email="">Nick Brickman</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://arxiv.org/abs/0901.0638">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>William T. Shaw,Nick Brickman,william.shaw@kcl.ac.uk</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>a8956c98-3e65-4a84-8ed9-2b2c84becf99</GUID>
        <Name>Nodal Discontinuous Galerkin Methods</Name>
        <ShortDescription>Discontinuous Galerkin (DG) methods for the numerical solution of partial differential equations have enjoyed considerable success because they are both flexible and robust: They allow arbitrary unstructured geometries and easy control of accuracy without compromising simulation stability. Lately, another property of DG has been growing in importance: The majority of a DG operator is applied in an element-local way, with weak penalty-based element-to-element coupling. The resulting locality in memory access is one of the factors that enables DG to run on off-the-shelf, massively parallel graphics processors (GPUs). In addition, DG's high-order nature lets it require fewer data points per represented wavelength and hence fewer memory accesses, in exchange for higher arithmetic intensity. Both of these factors work significantly in favor of a GPU implementation of DG. Using a single US$400 Nvidia GTX 280 GPU, we accelerate a solver for Maxwell's equations on a general 3D unstructured grid by a factor of 40 to 60 relative to a serial computation on a current-generation CPU. In many cases, our algorithms exhibit full use of the device's available memory bandwidth. Example computations achieve and surpass 200 gigaflops/s of net application-level floating point work. In this article, we describe and derive the techniques used to reach this level of performance. In addition, we present comprehensive data on the accuracy and runtime behavior of the method. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/572_plane_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/572_plane_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Division of Applied Mathematics, Brown University, Providence, RI 02912</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>01</ReleaseMonth>
        <ReleaseDay>08</ReleaseDay>
        <ReleaseDateDisplay>01/08/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>60</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="andreas@brown.edu">Andreas Klockner</Author>
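           <!-- NOTE(review): the address on the next entry has the local part "kloeckner", which matches co-author Andreas Klockner rather than Tim Warburton; confirm the attribution. -->
           <Author email="kloeckner@brown.edu">Tim Warburton</Author>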
           <Author email="">Jeffrey Bridge</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://arxiv.org/abs/0901.1024">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Andreas Klockner,Tim Warburton,Jeffrey Bridge,andreas@brown.edu,kloeckner@brown.edu</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>f93b62b6-b6af-497e-83a8-865af31c8d7a</GUID>
        <Name>Parallelizing Hash-based Data Carving</Name>
        <ShortDescription>The ability to detect fragments of deleted image files and to reconstruct these image files from all available fragments on disk is a key activity in the field of digital forensics. Although reconstruction of image files from the file fragments on disk can be accomplished by simply comparing the content of sectors on disk with the content of known files, this brute-force approach can be time consuming. This paper presents results from research into the use of Graphics Processing Units (GPUs) in detecting specific image file byte patterns in disk clusters. Unique identifying pattern for each disk sector is compared against patterns in known images. A pattern match indicates the potential presence of an image and flags the disk sector for further in-depth examination to confirm the match. The GPU-based implementation outperforms the software implementation by a significant margin. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/571_g80_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/571_g80_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>ELIAUS University of Perpignan</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>01</ReleaseMonth>
        <ReleaseDay>09</ReleaseDay>
        <ReleaseDateDisplay>01/09/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="sylvain.collange@univ-perp.fr">Sylvain Collange</Author>
           <Author email="">Yoginder Dandass</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://arxiv.org/abs/0901.1307">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Imaging</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Sylvain Collange,Yoginder Dandass,sylvain.collange@univ-perp.fr</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>0cf53113-4ca6-4571-ad43-030fb84f5f1e</GUID>
        <Name>ACEMD: Accelerating bio-molecular dynamics in the microsecond time-scale</Name>
        <ShortDescription>The high arithmetic performance and intrinsic parallelism of recent graphical processing units (GPUs) can offer a technological edge for molecular dynamics simulations. ACEMD is a production-class bio-molecular dynamics (MD) simulation program designed specifically for GPUs which is able to achieve supercomputing scale performance of 40 nanoseconds/day for all-atom protein systems with over 23,000 atoms. We illustrate the characteristics of the code, its validation and performance. We also run a microsecond-long trajectory for an all-atom molecular system in explicit TIP3P water on a single workstation computer equipped with just 3 GPUs. This performance on cost effective hardware allows ACEMD to reach microsecond timescales routinely with important implications in terms of scientific applications. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/570_biomoleculardynamics_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/570_biomoleculardynamics_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Information and Communications Technologies, Imperial College London, South Kensington, London, SW7 2AZ, UK</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>02</ReleaseMonth>
        <ReleaseDay>05</ReleaseDay>
        <ReleaseDateDisplay>02/05/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>19</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="m.j.harvey@imperial.ac.uk">M. J. Harvey</Author>
           <Author email="">G. Giupponi</Author>
           <Author email="">G. De Fabritiis</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://arxiv.org/abs/0902.0827">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Life Sciences</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>M. J. Harvey,G. Giupponi,G. De Fabritiis,m.j.harvey@imperial.ac.uk</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>f16b23b6-991b-410d-b339-b4815a000f00</GUID>
        <Name>GPUs for data processing in the MWA</Name>
        <ShortDescription>The MWA is a next-generation radio interferometer under construction in remote Western Australia. The data rate from the correlator makes storing the raw data infeasible, so the data must be processed in real-time. The processing task is of order ~10 TFLOPS. The remote location of the MWA limits the power that can be allocated to computing. We describe the design and implementation of elements of the MWA real-time data processing system which leverage the computing abilities of modern graphics processing units (GPUs). The matrix algebra and texture mapping capabilities of GPUs are well suited to the majority of tasks involved in real-time calibration and imaging. Considerable performance advantages over a conventional CPU-based reference implementation are obtained. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/569_wma_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/569_wma_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Harvard-Smithsonian Center for Astrophysics, Cambridge, MA, USA</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>02</ReleaseMonth>
        <ReleaseDay>05</ReleaseDay>
        <ReleaseDateDisplay>02/05/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="">S. Ord</Author>
           <Author email="">L. Greenhill</Author>
           <Author email="">R. Wayth</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://arxiv.org/PS_cache/arxiv/pdf/0902/0902.0915v1.pdf">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>S. Ord,L. Greenhill,R. Wayth</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>8e6979ff-1fb0-4e7c-b28c-9bea0987988c</GUID>
        <Name>The semi-classical spectrum and the Birkhoff normal form</Name>
        <ShortDescription>The purposes of this note are: 1) to propose a direct and "elementary" proof of the main result proved by Guillemin-Paul-Uribe [GPU], namely that the semi-classical spectrum near a global minimum of the classical Hamiltonian determines the whole semi-classical Birkhoff normal form (denoted the BNF) in the non-resonant case. 2) to present in the completely resonant case a similar problem. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/568_normal_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/568_normal_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName>Institut Fourier, Unite mixte de recherche</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>02</ReleaseMonth>
        <ReleaseDay>17</ReleaseDay>
        <ReleaseDateDisplay>02/17/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="yves.colin-de-verdiere@ujf-grenoble.fr">Yves Colin De Verdiere</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://arxiv.org/PS_cache/arxiv/pdf/0902/0902.2470v1.pdf">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Yves Colin De Verdiere,yves.colin-de-verdiere@ujf-grenoble.fr</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>31397807-ebad-4ce4-a822-4f66cfe8d3ca</GUID>
        <Name>SAPPORO: A way to turn your graphics cards into a GRAPE-6</Name>
        <ShortDescription>We present Sapporo, a library for performing high-precision gravitational N-body simulations on NVIDIA Graphical Processing Units (GPUs). Our library mimics the GRAPE-6 library, and N-body codes currently running on GRAPE-6 can switch to Sapporo by a simple relinking of the library. The precision of our library is comparable to that of GRAPE-6, even though internally the GPU hardware is limited to single precision arithmetics. This limitation is effectively overcome by emulating double precision for calculating the distance between particles. The performance loss of this operation is small ( 20 percent) compared to the advantage of being able to run at high precision. We tested the library using several GRAPE-6-enabled N-body codes, in particular with Starlab and phiGRAPE. We measured peak performance of 800 Gflop/s for running with 10^6 particles on a PC with four commercial G92 architecture GPUs (two GeForce 9800GX2). As a production test, we simulated a 32k Plummer model with equal mass stars well beyond core collapse. The simulation took 41 days, during which the mean performance was 113 Gflop/s. The GPU did not show any problems from running in a production environment for such an extended period of time. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/567_cpu_gpu_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/567_cpu_gpu_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Astronomical Institute "Anton Pannekoek", University of Amsterdam</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>02</ReleaseMonth>
        <ReleaseDay>25</ReleaseDay>
        <ReleaseDateDisplay>02/25/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="egaburov@strw.leidenuniv.nl">Evghenii Gaburov</Author>
           <Author email="">Stefan Harfst</Author>
           <Author email="">Simon Portegies Zwart</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://arxiv.org/abs/0902.4463">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Evghenii Gaburov,Stefan Harfst,Simon Portegies Zwart,egaburov@strw.leidenuniv.nl</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>ca427734-eeff-4c03-ae7e-3230ad448d64</GUID>
        <Name>Density Functional Theory calculation on many-cores hybrid CPU-GPU architectures</Name>
        <ShortDescription>The implementation of a full electronic structure calculation code on a hybrid parallel architecture with Graphic Processing Units (GPU) is presented. The code which is on the basis of our implementation is a GNU-GPL code based on Daubechies wavelets. It shows very good performances, systematic convergence properties and an excellent efficiency on parallel computers. Our GPU-based acceleration fully preserves all these properties. In particular, the code is able to run on many cores which may or may not have a GPU associated. It is thus able to run on parallel and massive parallel hybrid environment, also with a non-homogeneous ratio CPU/GPU. With double precision calculations, we may achieve considerable speedup, between a factor of 20 for some operations and a factor of 6 for the whole DFT code. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/566_graph_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/566_graph_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName>European Synchrotron Radiation Facility</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>04</ReleaseMonth>
        <ReleaseDay>09</ReleaseDay>
        <ReleaseDateDisplay>04/09/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>20</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="luigi.genovese@esrf.fr">Luigi Genovese</Author>
           <Author email="">Matthieu Ospici</Author>
           <Author email="">Thierry Deutsch</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://arxiv.org/PS_cache/arxiv/pdf/0904/0904.1543v1.pdf">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Luigi Genovese,Matthieu Ospici,Thierry Deutsch,luigi.genovese@esrf.fr</Keyword>
        </Keywords>
     </Application>

     <Application>
        <GUID>cea6c291-c024-4ea7-9cdb-af965bd49771</GUID>
        <Name>Accelerator-Oriented Algorithm Transformation for Temporal Data Mining</Name>
        <ShortDescription>Temporal data mining algorithms are becoming increasingly important in many application domains including computational neuroscience, especially the analysis of spike train data. While application scientists have been able to readily gather multi-neuronal datasets, analysis capabilities have lagged behind, due to both lack of powerful algorithms and inaccessibility to powerful hardware platforms. The advent of GPU architectures such as Nvidia's GTX 280 offers a cost-effective option to bring these capabilities to the neuroscientist's desktop. Rather than port existing algorithms onto this architecture, we advocate the need for algorithm transformation, i.e., rethinking the design of the algorithm in a way that need not necessarily mirror its serial implementation strictly. We present a novel implementation of a frequent episode discovery algorithm by revisiting "in-the-large" issues such as problem decomposition as well as "in-the-small" issues such as data layouts and memory access patterns. This is non-trivial because frequent episode discovery does not lend itself to GPU-friendly data-parallel mapping strategies. Applications to many datasets and comparisons to CPU as well as prior GPU implementations showcase the advantages of our approach.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/564_oriented_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/564_oriented_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Department of Computer Science, Virginia Tech</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>05</ReleaseMonth>
        <ReleaseDay>13</ReleaseDay>
        <ReleaseDateDisplay>05/13/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>431</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="patnaik@vt.edu">Debprakash Patnaik</Author>
           <Author email="ponce@vt.edu">Sean P. Ponce</Author>
           <Author email="yongcao@vt.edu">Yong Cao</Author>
           <Author email="">Naren Ramakrishnan</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://arxiv.org/abs/0905.2203">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Life Sciences</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Debprakash Patnaik, Sean P. Ponce, Yong Cao, Naren Ramakrishnan</Keyword>
        </Keywords>
     </Application>

     <Application>
        <GUID>aa89b4c8-9abd-4459-adc4-00bfbb8021f7</GUID>
        <Name>Solving $k$-Nearest Vector Problem on Multiple Graphics Processors</Name>
        <ShortDescription>In a recommendation system, customers' preferences are encoded into vectors, and finding the nearest vectors to each vector is an essential part. We define this part of problem as a $k$-nearest vector problem and give an effective algorithm to solve it on multiple graphics processor units (GPUs). By an experiment, we show that when the size of the problem is large, an implementation of the algorithm on two GPUs runs more than 260 times faster than a single core implementation on a latest CPU. We also show that our algorithm scales well with respect to the number of GPUs.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/563_k_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/563_k_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName>Nihon Unisys, Ltd.</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>01</ReleaseMonth>
        <ReleaseDay>01</ReleaseDay>
        <ReleaseDateDisplay>01/01/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>260</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="">Kimikazu Kato</Author>
           <Author email="">Tikara Hosino</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://arxiv.org/abs/0906.0231">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Kimikazu Kato, Tikara Hosino </Keyword>
        </Keywords>
     </Application>

     <Application>
        <GUID>d4976711-d460-44f9-bfa5-ce9ca5d4c44e</GUID>
        <Name>Elemental Accelerator</Name>
        <ShortDescription>Elemental Accelerator is a video processing solution designed to add power and performance to the Adobe Premiere Pro CS4 workflow. Coupled with NVIDIA Quadro series video cards, Elemental Accelerator harnesses the power of the graphics processing unit (GPU) to perform high-speed video encoding and deliver dramatic time savings over conventional CPU-only encoding solutions. Elemental Accelerator performs GPU-accelerated conversion of commonly distributed digital video formats to H.264/AVC output ready for upload to the web or burning to Blu-ray disc. Elemental Accelerator also supports high-speed MPEG-2 encoding for DVD or digital broadcast. By executing demanding processing tasks on the GPU, Elemental Accelerator not only speeds video transcoding, it frees CPU resources to perform other tasks, resulting in a faster, more efficient video editing and production environment.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/562_accelerator_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/562_accelerator_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Commercial</OrganizationType>
        <OrganizationName>Elemental</OrganizationName>
        <OrganizationURL>http://elementaltechnologies.com/</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>07</ReleaseMonth>
        <ReleaseDay>10</ReleaseDay>
        <ReleaseDateDisplay>07/10/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>7</SpeedUp>
        <SoftwareLicenseType>Commercial</SoftwareLicenseType>
        <Authors>
           <Author email="">Elemental</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://elementaltechnologies.com/products/accelerator">Application</ContentType>
           <ContentType url="http://elementaltechnologies.com/products/accelerator">Multimedia</ContentType>
           <ContentType url="http://www.nvidia.com/adobeplugins">BUY NOW</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Video &amp; Audio</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword></Keyword>
        </Keywords>
     </Application>

     <Application>
        <GUID>14428008-4747-47d1-bcd5-f59bdb8230ec</GUID>
        <Name>Towards Flow Cytometry Data Clustering on Graphics Processing Units</Name>
        <ShortDescription>Like many modern techniques for scientific analysis, flow cytometry produces massive amounts of data that must be analyzed and clustered intelligently to be useful. Current manual binning techniques are cumbersome and limited in both the quality and quantity of analysis produced. To address the quality of results, a new framework applying two different sets of clustering algorithms and inference methods are implemented. The two methods investigated are fuzzy c-means and minimum description length inference and k-medoids with BIC. These approaches lend themselves to large scale parallel processing. To address the computational demands, the Nvidia CUDA framework and Tesla architecture are utilized. The resulting performance demonstrated 1-2 orders of magnitude improvement over an equivalent sequential version. The quality of results is promising and motivates further research and development in this direction.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/561_flow_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/561_flow_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Rochester Institute of Technology, Rochester, NY</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2008</ReleaseYear>
        <ReleaseMonth>12</ReleaseMonth>
        <ReleaseDay>31</ReleaseDay>
        <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>159</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="">Jeremy Espenshade</Author>
           <Author email="">Doug Roberts</Author>
           <Author email="">James Cavenaugh</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://cyberaide.googlecode.com/svn/trunk/papers/08-cuda-biostat/vonLaszewski-08-cuda-biostat.pdf">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Jeremy Espenshade,Doug Roberts,James Cavenaugh</Keyword>
        </Keywords>
     </Application>

     <Application>
        <GUID>d2033a62-e770-435d-a243-a38ca5a2ac58</GUID>
        <Name>Search Pipeline for Gravitational Waves from Coalescing Binaries of Compact Objects</Name>
        <ShortDescription>We report a novel application of graphics processing units (GPUs) for the purpose of accelerating the search pipelines for gravitational waves from coalescing binaries of compact objects. A speed-up of 16 fold has been achieved compared with a single central processing unit (CPU). We show that substantial improvements are possible and discuss the reduction in CPU count required for the detection of inspiral sources afforded by the use of GPUs. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/560_pipeline_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/560_pipeline_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>School of Computer Science and Engineering, The University of Western Australia</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>07</ReleaseMonth>
        <ReleaseDay>23</ReleaseDay>
        <ReleaseDateDisplay>07/23/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>16</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="">Shin Kee Chung</Author>
           <Author email="">Linqing Wen</Author>
           <Author email="">David Blair</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://arxiv.org/PS_cache/arxiv/pdf/0906/0906.4175v1.pdf">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Shin Kee Chung, Linqing Wen, David Blair, Kipp Cannon, Amitava Datta</Keyword>
        </Keywords>
     </Application>

     <Application>
        <GUID>e441e930-0ee4-41c2-9fa0-6d18c307ea30</GUID>
        <Name>Neuroblastoma</Name>
        <ShortDescription>Accelerating dataflow application through the coordination of CPU and GPU</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/559_neuro_results_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/559_neuro_results_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName>UFMG</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>09</ReleaseMonth>
        <ReleaseDay>04</ReleaseDay>
        <ReleaseDateDisplay>09/04/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>90</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="george@dcc.ufmg.br">George Teodor</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://bmi.osu.edu/~cialab/neuroblastoma.php">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>MedicalImaging</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>George Teodor,george@dcc.ufmg.br</Keyword>
        </Keywords>
     </Application>

     <Application>
        <GUID>46b9fa44-59f6-4987-a80c-87339387925f</GUID>
        <Name>Abe</Name>
        <ShortDescription>Abe is a different type of search, searching for images with images. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/558_logo_s_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/558_logo_s_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Commercial</OrganizationType>
        <OrganizationName>Quad Streaming</OrganizationName>
        <OrganizationURL>http://www.quadstreaming.com/</OrganizationURL>
        <ReleaseYear>2010</ReleaseYear>
        <ReleaseMonth>02</ReleaseMonth>
        <ReleaseDay>08</ReleaseDay>
        <ReleaseDateDisplay>02/08/2010</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>10</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="office@quadstreaming.com">Quad</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.quadstreaming.com/imagelist.html">Application</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Imaging</ApplicationType>
           <ApplicationType>Image search</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Quad,office@quadstreaming.com</Keyword>
        </Keywords>
     </Application>

     <Application>
        <GUID>d84233c9-2041-4233-818b-e72e7813b115</GUID>
        <Name>GPU Satellite Image Processing</Name>
        <ShortDescription>Using CUDA and Tesla, PCI Geomatics has optimized code for orthorectification and pansharpening of high-resolution satellite imagery in the GeoImaging Accelerator (GXL)</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/557_GXL_Server_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/557_GXL_Server_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Commercial</OrganizationType>
        <OrganizationName>PCI Geomatics</OrganizationName>
        <OrganizationURL>http://www.pcigeomatics.com</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>03</ReleaseMonth>
        <ReleaseDay>02</ReleaseDay>
        <ReleaseDateDisplay>03/02/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>2</SpeedUp>
        <SoftwareLicenseType>Commercial</SoftwareLicenseType>
        <Authors>
           <Author email="piekny@pcigeomatics.com">David Piekny</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.pcigeomatics.com/index.php?option=com_content&amp;view=article&amp;id=24&amp;Itemid=5#GXL">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Imaging</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>David Piekny,piekny@pcigeomatics.com</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>8daa1fae-c95a-42a0-8d4a-82aab0b0d346</GUID>
        <Name>FlaCuda encoder</Name>
        <ShortDescription>Opensource CUDA-enabled FLAC encoder</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/556_flacuda_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/556_flacuda_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName></OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>09</ReleaseMonth>
        <ReleaseDay>10</ReleaseDay>
        <ReleaseDateDisplay>09/10/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>3</SpeedUp>
        <SoftwareLicenseType>Open source</SoftwareLicenseType>
        <Authors>
           <Author email="gchudov@gmail.com">Gregory S. Chudov</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.cuetools.net/doku.php/flacuda">Application</ContentType>
           <ContentType url="http://www.cuetools.net/doku.php/flacuda">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Video &amp; Audio</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Gregory S. Chudov,</Keyword>
        </Keywords>
     </Application>

     <Application>
        <GUID>ef913ea0-85cb-4e53-a970-00b879982728</GUID>
        <Name>Large Integer/polynomial multiplication on GPU</Name>
        <ShortDescription>The paper describes the first implementation of large integer and/or polynomial multiplication using the number theoretic transform on GPU with 24-bit primes. The efficient 24-bit modular reduction is performed in floating-point arithmetic. Our algorithm exploits fused-multiply add (FMA) capabilities of the graphics hardware. DOI: http://dx.doi.org/10.1007/978-3-642-03644-6_11 </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/555_mul_image_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/555_mul_image_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Max Planck Institute for Informatics</OrganizationName>
        <OrganizationURL>http://www.mpi-inf.mpg.de</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>08</ReleaseMonth>
        <ReleaseDay>21</ReleaseDay>
        <ReleaseDateDisplay>08/21/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="asm@mpi-sb.mpg.de">Pavel Emeliyanenko</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.mpi-inf.mpg.de/~emeliyan/poly_mul.pdf">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Pavel Emeliyanenko,asm@mpi-sb.mpg.de</Keyword>
        </Keywords>
     </Application>

     <Application>
        <GUID>82150290-d681-44c6-a606-35c9565949a8</GUID>
        <Name>A Parallel Annealing Method for Automatic Color Cervigram Image Segmentation</Name>
        <ShortDescription>The accurate and automatic segmentation of tissue regions in cervigram images can aid in the identification and classification of precancerous regions. We implement and analyze four GPU (Graphics Processing Unit) based clustering algorithms: K-means, mean shift, deterministic annealing, and spatially coherent deterministic annealing. From our results, we propose a novel parallel algorithm using the CUDA programming language for digital cervigram segmentation and clustering. The first step of our fully automatic method is to compute the number of modes in the feature space of a color cervigram image using the mean shift clustering algorithm. Next, we use the number of modes in a novel spatially coherent deterministic annealing optimization technique to produce an approximate optimal solution for the clustering problem. Our GPU based methods perform approximately 38x (deterministic annealing),
134x (mean shift), and 276x (spatially coherent deterministic annealing) faster than an equivalent CPU solution. Our implementation decreases the computational time of an annealing method on a 1280x872 pixel image from 5 hours 3 minutes to 72.12 seconds, enabling the use of this optimization method in clinical settings and on large cervigram datasets.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/554_edkim_miccaigpuimage_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/554_edkim_miccaigpuimage_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Lehigh University</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>08</ReleaseMonth>
        <ReleaseDay>15</ReleaseDay>
        <ReleaseDateDisplay>08/15/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>276</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="edk208@lehigh.edu">Edward Kim</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.cse.lehigh.edu/~idealab/papers/edkim_miccaigpu2009.pdf">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>MedicalImaging</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Edward Kim,edk208@lehigh.edu</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>30159376-0082-49a8-9716-a55f0b2fb707</GUID>
        <Name>Predicting Lightning in Protoplanetary Discs</Name>
        <ShortDescription>We study the role of dust-dust collisional charging in protoplanetary discs. Although in some cases the charge densities for different species differ by 20 orders of magnitude, we transformed algorithm so that it gives sufficiently precise solutions using only single precision floats. This made the program run faster on GPGPUs, allowing us to survey wide range of parameter space in high resolution. As a result, we found that as dust condensate, the charge distribution experience four phases. At one of these phases the electrostatic field grows as fourth power of dust density and lightning takes place.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/553_lightning-here_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/553_lightning-here_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Theoretical Astrophysics Group, Department of Physics, Kyoto University</OrganizationName>
        <OrganizationURL>http://www-tap.scphys.kyoto-u.ac.jp/</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>08</ReleaseMonth>
        <ReleaseDay>11</ReleaseDay>
        <ReleaseDateDisplay>08/11/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>140</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="muranushi@gmail.com">Takayuki Muranushi</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://arxiv.org/abs/0908.1575">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Takayuki Muranushi,muranushi@gmail.com</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>2c8af116-762a-4968-b59c-bdc1328b7461</GUID>
        <Name>Optimization of FTLE Calculation</Name>
        <ShortDescription>We calculate the Finite-Time Lyapunov Exponent (FTLE) for several fluid flows and find that CUDA helps us immensely.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/552_rlw_vortex_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/552_rlw_vortex_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>California Institute of Technology</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>08</ReleaseMonth>
        <ReleaseDay>14</ReleaseDay>
        <ReleaseDateDisplay>08/14/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>1000</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="raymondj@caltech.edu">Raymond Jimenez</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.its.caltech.edu/~raymondj/LCS">Application</ContentType>
           <ContentType url="http://www.its.caltech.edu/~raymondj/LCS">Multimedia</ContentType>
           <ContentType url="http://www.its.caltech.edu/~raymondj/LCS">Paper</ContentType>
           <ContentType url="http://www.its.caltech.edu/~raymondj/LCS">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Computational Fluid Dynamics</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Raymond Jimenez,raymondj@caltech.edu</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>d508073b-38bf-4fc7-b99b-1ad6ff71b868</GUID>
        <Name>CBDA: Cyclotron Beam Dynamics Analysis</Name>
        <ShortDescription>Software for the Accelerator Physics</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/551_demo2_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/551_demo2_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName>JINR</OrganizationName>
        <OrganizationURL>http://www.jinr.ru</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>07</ReleaseMonth>
        <ReleaseDay>01</ReleaseDay>
        <ReleaseDateDisplay>07/01/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>60</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="pevgeny@jinr.ru">Perepelkin Evgeny</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://cbda.jinr.ru">Application</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Perepelkin Evgeny,pevgeny@jinr.ru,Cyclotron, Space charge effect, Acceleration </Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>1b0a6a82-8181-4dd3-9477-0e7d523af249</GUID>
        <Name>Efficient Acceleration of Asymmetric Cryptography on GPUs</Name>
        <ShortDescription>We present implementations of large integer modular exponentiation, the core of public-key cryptosystems such as RSA, on a DirectX 10 compliant GPU. We present high performance modular exponentiation implementations based on integers represented in both standard radix form and residue number system form. We show how a GPU implementation of a 1024-bit RSA decrypt primitive can outperform a comparable CPU implementation by up to 4 times and also improve the performance of previous GPU implementations by decreasing latency by up to 7 times and doubling throughput. We present how an adaptive approach to modular exponentiation involving implementations based on both a radix and a residue number system gives the best all-around performance on the GPU both in terms of latency and throughput. We also highlight the usage criteria necessary to allow the GPU to reach peak performance on public key cryptographic operations.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/550_graph_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/550_graph_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Trinity College Dublin, Ireland</OrganizationName>
        <OrganizationURL>http://www.tcd.ie/</OrganizationURL>
        <ReleaseYear>2008</ReleaseYear>
        <ReleaseMonth>12</ReleaseMonth>
        <ReleaseDay>01</ReleaseDay>
        <ReleaseDateDisplay>12/01/2008</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>4</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="harrisoo@cs.tcd.ie">Owen Harrison</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="https://www.cs.tcd.ie/publications/tech-reports/reports.08/TCD-CS-2008-60.pdf">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Owen Harrison,harrisoo@cs.tcd.ie</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>4eb88033-a856-44e6-aa9c-f8143e624219</GUID>
        <Name>StandardModel on GPU</Name>
        <ShortDescription>This project is a GPU port of the "Standard Model of Visual Cortex" (CBCL, MIT, by Riesenhuber M., Poggio T., Serre T., Wolf L.) </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/549_logo_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/549_logo_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName></OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>08</ReleaseMonth>
        <ReleaseDay>10</ReleaseDay>
        <ReleaseDateDisplay>08/10/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>100</SpeedUp>
        <SoftwareLicenseType>Open source</SoftwareLicenseType>
        <Authors>
           <Author email="spiglerg@gmail.com">Giacomo Spigler</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://code.google.com/p/standardmodelgpu">Application</ContentType>
           <ContentType url="http://code.google.com/p/standardmodelgpu">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Graphics</ApplicationType>
           <ApplicationType>Science</ApplicationType>
           <ApplicationType>Signal Processing</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Giacomo Spigler,spiglerg@gmail.com</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>a927fa37-9635-48c1-8b51-1f237dec4035</GUID>
        <Name>Cuda Jpeg Decoder</Name>
        <ShortDescription>jpeg decoder on GPU</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/548_screenshot_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/548_screenshot_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Commercial</OrganizationType>
        <OrganizationName>2U</OrganizationName>
        <OrganizationURL>http://www.2uinfotech.com</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>08</ReleaseMonth>
        <ReleaseDay>13</ReleaseDay>
        <ReleaseDateDisplay>08/13/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>10</SpeedUp>
        <SoftwareLicenseType>Open source</SoftwareLicenseType>
        <Authors>
           <Author email="rados82@gmail.com">Ramazan Dincer</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="https://sourceforge.net/projects/cudajpegdecoder/">Application</ContentType>
           <ContentType url="https://sourceforge.net/projects/cudajpegdecoder/">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Imaging</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Ramazan Dincer,rados82@gmail.com</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>36253542-171f-491b-8646-f224a5694e8f</GUID>
        <Name>Hyperspectral unmixing on NVidia GPUs</Name>
        <ShortDescription>Hyperspectral images are now routinely used in several Earth observation and planetary exploration missions. These images can be seen as high-dimensional data cubes with three dimensions: two of which represent the spatial domain, while the third one comprises hundreds of spectral bands collected at different wavelengths. As a result, each pixel is represented by a spatial localization and a spectral signature which provides very detailed information about its composition. One of the main problems in the analysis of hyperspectral data cubes is the problem of mixed pixels, which arise when the spatial resolution of the sensor is not enough to separate spectrally distinct materials. In this case, several spectrally pure signatures (endmembers) are combined into the same (mixed) pixel. Hyperspectral unmixing techniques comprise two stages: 1) automatic identification of spectral endmembers; and 2) estimation of the fractional abundance of each endmember in each pixel. The unmixing process is quite computationally expensive, mainly due to the extremely high dimensionality of hyperspectral data cubes. In this work, we develop a computationally efficient implementation of the full hyperspectral unmixing chain using different endmember extraction and fractional abundance estimation algorithms. The proposed methodology has been implemented, using the compute device unified architecture (CUDA), on an NVidia GeForce 8800 GTX GPU, achieving speedups in the order of 25x when compared to an optimized implementation of the same code in a dual-core CPU.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/547_hyperspectralcube_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/547_hyperspectralcube_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Technology of Computers and Communications, University of Extremadura</OrganizationName>
        <OrganizationURL>http://www.umbc.edu/rssipl/people/aplaza</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>08</ReleaseMonth>
        <ReleaseDay>12</ReleaseDay>
        <ReleaseDateDisplay>08/12/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>25</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="aplaza@unex.es">Antonio Plaza</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.umbc.edu/rssipl/people/aplaza/Unmixing_CUDA.pdf">Application</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Imaging</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Antonio Plaza,aplaza@unex.es</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>31d6fa65-0651-4b66-8e96-75dda84f13f6</GUID>
        <Name>Tracking as Segmentation of Spatial-Temporal Volumes</Name>
        <ShortDescription>In this work, we interpret tracking as segmentation of spatial-temporal volumes. Segmentation is done by a variational approach using anisotropic weighted Total Variation (TV) regularization. All major parts of this approach are computed on the GPU using CUDA</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/546_cuda_zone_emmcvpr_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/546_cuda_zone_emmcvpr_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Graz University of Technology, Institute for Computer Graphics and Vision</OrganizationName>
        <OrganizationURL>http://www.gpu4vision.org</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>08</ReleaseMonth>
        <ReleaseDay>12</ReleaseDay>
        <ReleaseDateDisplay>08/12/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="info@gpu4vision.org">Markus Unger</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.youtube.com/watch?v=8x3JOzi-hvA">Multimedia</ContentType>
           <ContentType url="http://gpu4vision.icg.tugraz.at/index.php?content=segmentation.php#pub35">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Science</ApplicationType>
           <ApplicationType>Video &amp; Audio</ApplicationType>
           <ApplicationType>Computer Vision</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Markus Unger,info@gpu4vision.org</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>6217e565-1518-4cfd-9644-e7206c32a1a5</GUID>
        <Name>Performance Comparison of Single-Precision SPICE Model-Evaluation on FPGA, GPU, Cell, and Multi-core Processors</Name>
        <ShortDescription>Automated code generation and performance tuning techniques for concurrent architectures such as GPUs, Cell and FPGAs can provide integer factor speedups over multi-core processor organizations for data-parallel, floating-point computation in SPICE Model-Evaluation. Our Verilog AMS compiler produces code for parallel evaluation of non-linear circuit models suitable for use in SPICE simulations where the same model is evaluated several times for all the devices in the circuit. Our compiler uses architecture specific parallelization strategies (OpenMP for multi-core, PThreads for Cell, CUDA for GPU, statically scheduled VLIW for FPGA) when producing code for these different architectures. We automatically explore different implementation configurations (e.g. unroll factor, vector length) using our performance-tuner to identify the best possible configuration for each architecture. We demonstrate speedups of 3--131x for an NVIDIA 9600 GT GPU over a 3 GHz Intel Xeon 5160 implementation for a variety of single-precision device models. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/545_ic_logo_basic_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/545_ic_logo_basic_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>U. Penn. Implementation of Computation Lab</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>08</ReleaseMonth>
        <ReleaseDay>31</ReleaseDay>
        <ReleaseDateDisplay>08/31/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>131</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="nachiket@ieee.org">Nachiket Kapre</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://ic.ese.upenn.edu/abstracts/spice_fpl2009.html">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Electronic Design Automation</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Nachiket Kapre,nachiket@ieee.org</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>d0e01d7c-0f0f-4dcb-8b28-e0f98e87f914</GUID>
        <Name>Single Pass Depth Peeling via CUDA Renderer</Name>
        <ShortDescription>Multi-fragment effects play important roles on many graphics applications, which require operations on more than one fragment per pixel. The classical depth peeling algorithm provides a simple but robust solution by peeling off one layer each pass, but multi rasterizations will become a performance bottleneck for large and complex scenes. Ideally, we prefer to capture and sort multiple fragments in a single pass, which is difficult because the fragments generated in graphics pipeline are not allowed to be scattered to arbitrary positions of the render targets. Compute unified device architecture (CUDA) provides more flexible control over the GPU memory, but accessing of the fragments generated by graphics pipeline is not yet supported. In this work we design a CUDA rasterizer so that many graphics applications can benefit from the free control of GPU memory, especially for the multi-fragment effects. We present two efficient schemes to capture and sort multiple fragments per pixel in a single geometry pass via the atomic operations of CUDA without read-modify-write (RMW) hazards. Experimental results show significant speedup to classical depth peeling, especially for large scenes. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/544_dragon_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/544_dragon_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName>Institute of Software, Chinese Academy of Sciences</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>08</ReleaseMonth>
        <ReleaseDay>10</ReleaseDay>
        <ReleaseDateDisplay>08/10/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>10</SpeedUp>
        <SoftwareLicenseType>Open source</SoftwareLicenseType>
        <Authors>
           <Author email="liuf@ios.ac.cn">Fang Liu</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://hmcen0921.googlepages.com/cudarasterizer">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Graphics</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Fang Liu,liuf@ios.ac.cn</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>b4cb74db-42cd-4def-86f0-65c87bd36187</GUID>
        <Name>FOLKI GPU</Name>
        <ShortDescription>Fast Optical Flow on GPU at video rate for full HD resolution</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/543_icone_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/543_icone_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName>Onera</OrganizationName>
        <OrganizationURL>http://www.onera.fr</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>07</ReleaseMonth>
        <ReleaseDay>23</ReleaseDay>
        <ReleaseDateDisplay>07/23/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>100</SpeedUp>
        <SoftwareLicenseType>Open source</SoftwareLicenseType>
        <Authors>
           <Author email="aurelien.plyer@gmail.com">Aurelien Plyer</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.onera.fr/dtim/gpu-for-image/folkigpu.php">Application</ContentType>
           <ContentType url="http://www.onera.fr/dtim/gpu-for-image/folkigpu.php">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Computational Fluid Dynamics</ApplicationType>
           <ApplicationType>Imaging</ApplicationType>
           <ApplicationType>Signal Processing</ApplicationType>
           <ApplicationType>Video &amp; Audio</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Aurelien Plyer,aurelien.plyer@gmail.com</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>bdb4e756-195f-4243-94f7-b10b7e11e2bd</GUID>
        <Name>Iterative CUDA</Name>
        <ShortDescription>Iterative CUDA is a CUDA-based solver package for large, sparse linear systems.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/542_sparse-city-small_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/542_sparse-city-small_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Brown University</OrganizationName>
        <OrganizationURL>http://brown.edu</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>08</ReleaseMonth>
        <ReleaseDay>01</ReleaseDay>
        <ReleaseDateDisplay>08/01/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>10</SpeedUp>
        <SoftwareLicenseType>Open source</SoftwareLicenseType>
        <Authors>
           <Author email="kloeckner@dam.brown.edu">Andreas Kloeckner</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://mathema.tician.de/software/iterative-cuda">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Computational Fluid Dynamics</ApplicationType>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Libraries</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Andreas Kloeckner,kloeckner@dam.brown.edu,solver,cg,iterative,linear system</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>c620d360-fac3-43d3-a045-9c1aae75ec57</GUID>
        <Name>SARRACUDA: Synthetic Aperture Radar Range-doppler Algorithm using CUDA</Name>
        <ShortDescription>This application is a GPU version of a Synthetic Aperture Radar focusing algorithm. The implemented algorithm is the Range doppler algorithm, one of the most accurate and widely used. Synthetic Aperture Radar (SAR) is an imaging radar for earth observation from satellite and airborne manned/unmanned platforms; it is currently operational in recently launched polar-orbiting platforms such as TerraSAR-X, RadarSAT-2 and Cosmo-SkyMed as well as in previous missions. Applications are tailored to disaster observation and management, mapping of renewable resources, geological mapping, snow/ice mapping and strategic surveillance of military sites. The data stream produced by high resolution SAR systems may exceed 1 Gb/s and the real-time or near real-time processing represents a demanding requirement for on-board or even ground-based processing systems. The remote sensing community and the space agencies spend yearly a considerable amount of time and money to implement efficient and accurate processors for SAR data. Moreover, the scientific community is more and more oriented to a wide range of applications where the first step is the focalization of SAR data. The recent development and diffusion of multicore platforms opens new horizons and breaks barriers in the design of architectures for massively parallel processing of SAR data, without losing resolution and/or accuracy.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/541_sarracuda_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/541_sarracuda_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Universita degli Studi del Sannio</OrganizationName>
        <OrganizationURL>http://www.ing.unisannio.it</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>08</ReleaseMonth>
        <ReleaseDay>05</ReleaseDay>
        <ReleaseDateDisplay>08/05/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>15</SpeedUp>
        <SoftwareLicenseType>Open source</SoftwareLicenseType>
        <Authors>
           <Author email="carmineclemente@gmail.com">Carmine Clemente</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.ing.unisannio.it/labtlc/sarracuda.pdf">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Signal Processing</ApplicationType>
           <ApplicationType>Remote Sensing</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Carmine Clemente,carmineclemente@gmail.com</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>9ca18ade-f66f-4f40-9743-e6ebb760de33</GUID>
        <Name>Libra SDK</Name>
        <ShortDescription>Libra SDK is a scientific developer kit for building simple and fast cross CPU-GPU applications suited for scientific computations. Libra 1.1 SDK includes C/C++ matlab style API, sample programs and documentation. Example code and a downloadable trial version is available from GPU Systems website http://www.gpusystems.com</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/540_logo_bg_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/540_logo_bg_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Commercial</OrganizationType>
        <OrganizationName>GPU Systems</OrganizationName>
        <OrganizationURL>http://www.gpusystems.com</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>06</ReleaseMonth>
        <ReleaseDay>24</ReleaseDay>
        <ReleaseDateDisplay>06/24/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType>Commercial</SoftwareLicenseType>
        <Authors>
           <Author email="marco.hjerpe@gpusystems.com">Marco Hjerpe</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.gpusystems.com/video.html">Multimedia</ContentType>
           <ContentType url="http://www.gpusystems.com/download.aspx">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
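           <ApplicationType>Computational Fluid Dynamics</ApplicationType>
           <ApplicationType>Digital Content Creation</ApplicationType>
           <ApplicationType>Electronic Design Automation</ApplicationType>
           <ApplicationType>Finance</ApplicationType>
           <ApplicationType>Game Physics</ApplicationType>
           <ApplicationType>Graphics</ApplicationType>
           <ApplicationType>Imaging</ApplicationType>
           <ApplicationType>MedicalImaging</ApplicationType>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Life Sciences</ApplicationType>
           <ApplicationType>Libraries</ApplicationType>
           <ApplicationType>Oil &amp; Gas</ApplicationType>
           <ApplicationType>Science</ApplicationType>
           <ApplicationType>Signal Processing</ApplicationType>
           <ApplicationType>Video &amp; Audio</ApplicationType>
           <ApplicationType>matlab programming</ApplicationType>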
        </ApplicationTypes>
        <Keywords>
           <Keyword>Marco Hjerpe,marco.hjerpe@gpusystems.com,CPU,GPU,C++ programming,gpgpu,matlab,CUDA,OpenCL</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>87d90d59-5c3a-4094-a8cf-0e8da5326193</GUID>
        <Name>Real-time optical manipulation of micron sized structures using GPU generated holograms</Name>
        <ShortDescription>Holographic optical tweezers allow the three dimensional, dynamic, multipoint manipulation of micron sized dielectric objects. Exploiting the massive parallel architecture of modern GPUs we can generate highly optimized holograms at video frame rate allowing the interactive micromanipulation of complex structures.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/539_slm_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/539_slm_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>CNR-INFM, CRS-SOFT Dipartimento di Fisica, Universita di Roma La Sapienza</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>07</ReleaseMonth>
        <ReleaseDay>23</ReleaseDay>
        <ReleaseDateDisplay>07/23/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>350</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="">S. Bianchi</Author>
           <Author email="">R. Di Leonardo</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://arxiv.org/PS_cache/arxiv/pdf/0907/0907.4027v1.pdf">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Imaging</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>S. Bianchi,R. Di Leonardo</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>a0d09099-5643-406d-9d4a-9e7053425028</GUID>
        <Name>The Living Application: a Self-Organising System for Complex Grid Tasks </Name>
        <ShortDescription>We present the living application, a method to autonomously manage applications on the grid. During its execution on the grid, the living application makes choices on the resources to use in order to complete its tasks. These choices can be based on the internal state, or on autonomously acquired knowledge from external sensors. By giving limited user capabilities to a living application, the living application is able to port itself from one resource topology to another. The application performs these actions at run-time without depending on users or external workflow tools. We demonstrate this new concept in a special case of a living application: the living simulation. Today, many simulations require a wide range of numerical solvers and run most efficiently if specialized nodes are matched to the solvers. The idea of the living simulation is that it decides itself which grid machines to use based on the numerical solver currently in use. In this paper we apply the living simulation to modelling the collision between two galaxies in a test setup with two specialized computers. This simulation switches at run-time between a GPU-enabled computer in the Netherlands and a GRAPE-enabled machine that resides in the United States, using an oct-tree N-body code whenever it runs in the Netherlands and a direct N-body solver in the United States.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/538_self-organism_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/538_self-organism_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Section Computational Science, University of Amsterdam, Amsterdam, the Netherlands</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>07</ReleaseMonth>
        <ReleaseDay>23</ReleaseDay>
        <ReleaseDateDisplay>07/23/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="djgroen@science.uva.nl">D. Groen</Author>
           <Author email="">S. Harfst</Author>
           <Author email="">S. Portegies Zwart</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://arxiv.org/PS_cache/arxiv/pdf/0907/0907.4036v1.pdf">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Science</ApplicationType>
           <ApplicationType>Signal Processing</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>D. Groen, S. Harfst, S. Portegies Zwart,djgroen@science.uva.nl </Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>057f342d-84b4-4a12-ae35-5208c51ed958</GUID>
        <Name>Synthetic Aperture Radar Back-Projection Algorithm</Name>
        <ShortDescription>Synthetic Aperture Radar (SAR) uses microwaves to create images of the earth. These images provide information not visible to the naked eye, and can be made despite visibility conditions. SAR image formation requires massive amounts of computation and is hard to do in real-time. The best SAR processing algorithm, known as back-projection, is O(N^3) where N is the number of pixels -- which can be many thousands. To reduce computation, suboptimal FFT-based algorithms have been traditionally used despite the various limitations and image degradation effects these algorithms have. The back-projection algorithm is however ideal for a highly parallel processor like NVIDIA's GPGPUs. At the Brigham Young University Microwave Earth Remote Sensing Laboratory we have been able to take advantage of the GPGPUs' massive processing power to reduce the processing time for a 1500X1600 image that took 31 minutes in a well-optimized, single-threaded C implementation, down to 5.6 seconds using one of the four processors of an NVIDIA S1070. This is even faster than many FFT-based algorithms! We hope to continue to build off of this speed up to make further advancements in SAR imaging. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/537_sonar_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/537_sonar_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Brigham Young University Microwave Earth Remote Sensing Laboratory </OrganizationName>
        <OrganizationURL>http://www.mers.byu.edu/SAR.html#YIFSAR</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>08</ReleaseMonth>
        <ReleaseDay>03</ReleaseDay>
        <ReleaseDateDisplay>08/03/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>300</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="long@ee.byu.edu">David G. Long</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.mers.byu.edu/yinsar/images/flightU/2218.2x30.dem.noroll.int.lpno.hgt.mpg">Multimedia</ContentType>
           <ContentType url="http://www.mers.byu.edu/yinsar/index.html">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Signal Processing</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>David G. Long,long@ee.byu.edu</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>069854c1-e5b6-44b1-84a9-eb00831c8fae</GUID>
        <Name>Julia 4D</Name>
        <ShortDescription>Ray tracing of quaternion julia set</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/540_Julia4D_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/540_Julia4D_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName>homemade</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>08</ReleaseMonth>
        <ReleaseDay>02</ReleaseDay>
        <ReleaseDateDisplay>08/02/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="charles.strub@gmail.com">Charles Strub</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://strub.charles.free.fr/cuda/Julia4D.exe">Application</ContentType>
           <ContentType url="http://www.youtube.com/watch?v=QT4LLbyH3qY">Multimedia</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Graphics</ApplicationType>
           <ApplicationType>Numerics</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Charles Strub,charles.strub@gmail.com,Julia 4D quaternion ray tracing</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>7cc11512-08ba-4264-a4c7-fa1c31ae47b2</GUID>
        <Name>cudaseg (Fast Level Set Segmentation of Biomedical Images using Graphics Processing Units )</Name>
        <ShortDescription>In this project we have engineered a parallel level set implementation using the NVIDIA CUDA framework to accelerate image and volume segmentations. The final source code and thesis can be downloaded on this site.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/538_cudasegall_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/538_cudasegall_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>University of Oxford</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>06</ReleaseMonth>
        <ReleaseDay>02</ReleaseDay>
        <ReleaseDateDisplay>06/02/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="">Hormuz Mostofi</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://code.google.com/p/cudaseg/">Application</ContentType>
           <ContentType url="http://www.youtube.com/watch?v=5Bf1zddgySI">Multimedia</ContentType>
           <ContentType url="http://code.google.com/p/cudaseg/">Paper</ContentType>
           <ContentType url="http://code.google.com/p/cudaseg/">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Graphics</ApplicationType>
           <ApplicationType>Imaging</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Hormuz Mostofi,</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>d86e78ef-8ada-44e6-ad42-fc6386e55cc0</GUID>
        <Name>Cholesky Decompositions</Name>
        <ShortDescription>Cholesky factorization for dense matrices, reaching a 450x speedup on a GTX 285</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/536_http_imgload.cgi_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/536_http_imgload.cgi_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType></OrganizationType>
        <OrganizationName>Freelance</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>09</ReleaseMonth>
        <ReleaseDay>05</ReleaseDay>
        <ReleaseDateDisplay>09/05/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>450</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="cyrosly@163.com">lixiuyu</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://forums.nvidia.com/index.php?showtopic=106071">Application</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>lixiuyu,cyrosly@163.com</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>34d84e30-9ec7-42e6-a411-63810d133fc4</GUID>
        <Name>A GPU based GPS software receiver</Name>
        <ShortDescription>Off-the-shelf graphics processing units provide low-cost massive parallel computing performance, which can be utilized for the implementation of a GPS software receiver. In order to realize a real-time capable system the crucial stages of the receiver should be optimized to suit the requirements of a parallel processor. Moreover, the receiver should be capable of providing wider correlation functions and provide easy access to the spectral domain of the signals. Thus, the most suitable correlation algorithm, which forms the core part of each receiver, should be chosen and implemented on the graphics processor. Since the sampling rate of the received signal limits the real-time capabilities of the software radio it is necessary to determine an optimum value, considering that the precision of the observable varies with sampling bandwidth. We are going to discuss details and present our single frequency multi-channel implementation, which is capable of operating in real-time mode. Our implementation differs from other solutions by the wideness of the correlation function and allows simple handling of data in the spectral domain. Comparison with output from a commercial hardware receiver, which shares the antenna with the software radio, confirms the consistency and accuracy of our development.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/535_gpsgpu_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/535_gpsgpu_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName>National Institute of Information and Communications Technology, Japan</OrganizationName>
        <OrganizationURL>http://www.nict.go.jp</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>08</ReleaseMonth>
        <ReleaseDay>08</ReleaseDay>
        <ReleaseDateDisplay>08/08/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="hobiger@nict.go.jp">Thomas Hobiger</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.springerlink.com/openurl.asp?genre=article&amp;id=doi:10.1007/s10291-009-0135-2">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Science</ApplicationType>
           <ApplicationType>Signal Processing</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Thomas Hobiger,hobiger@nict.go.jp</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>ce75850e-89b5-4c0f-af44-1f5f66f91cd1</GUID>
        <Name>A framework for efficient and scalable execution of domain-specific templates on GPUs</Name>
        <ShortDescription>Graphics Processing Units (GPUs) have emerged as important players in the transition of the computing industry from sequential to multi- and many-core computing. We propose a software framework for execution of domain specific parallel templates on GPUs, which simultaneously raises the abstraction level of GPU programming and ensures efficient execution with forward scalability to large data sizes and new GPU platforms. To achieve scalable and efficient GPU execution, our framework focuses on two critical problems that have been largely ignored in previous efforts - processing large data sets that do not fit within the GPU memory, and minimizing data transfers between the host and GPU. Our framework takes domain-specific parallel programming templates that are expressed as parallel operator graphs, and performs operator splitting, offload unit identification, and scheduling of off-loaded computations and data transfers between the host and the GPU, to generate a highly optimized execution plan. Finally, a code generator produces a hybrid CPU/GPU program in accordance with the derived execution plan, that uses lower level frameworks such as CUDA. We have applied the proposed framework to templates from the recognition domain, specifically edge detection kernels and convolutional neural networks that are commonly used in image and video analysis. We present results on two different GPU platforms from NVIDIA (a Tesla C870 GPU computing card and a GeForce 8800 graphics card) that demonstrate 1.7 - 7.8X performance improvements over already accelerated baseline GPU implementations. We also demonstrate scalability to input data sets and application memory footprints of 6GB and 17GB, respectively, on GPU platforms with only 768MB and 1.5GB of memory.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/534_image_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/534_image_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Commercial</OrganizationType>
        <OrganizationName>NEC Labs, Berkeley, Purdue</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>05</ReleaseMonth>
        <ReleaseDay>01</ReleaseDay>
        <ReleaseDateDisplay>05/01/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="narayans@eecs.berkeley.edu">Narayanan Sundaram</Author>
           <Author email="">Anand Raghunathan</Author>
           <Author email="">Srimat T. Chakradhar</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.eecs.berkeley.edu/~narayans/Publications_files/ipdps2009.pdf">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Imaging</ApplicationType>
           <ApplicationType>MedicalImaging</ApplicationType>
           <ApplicationType>machine learning</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Narayanan Sundaram, Anand Raghunathan, and Srimat T. Chakradhar</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>a9aaf71b-fcf4-4cab-a580-5029383afb71</GUID>
        <Name>Massively Parallel Population-Based Monte Carlo Methods</Name>
        <ShortDescription>Implementation of population-based MCMC and a sequential Monte Carlo sampler for inference in a Gaussian mixture model and a particle filter for a factor stochastic volatility state-space model.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/533_b1_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/533_b1_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>University of Oxford</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>05</ReleaseMonth>
        <ReleaseDay>14</ReleaseDay>
        <ReleaseDateDisplay>05/14/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>500</SpeedUp>
        <SoftwareLicenseType>Open source</SoftwareLicenseType>
        <Authors>
           <Author email="lee@stats.ox.ac.uk">Anthony Lee</Author>
           <Author email="yau@stats.ox.ac.uk">Christopher Yau</Author>
           <Author email="mike.giles@maths.ox.ac.uk">Michael B. Giles</Author>
           <Author email="arnaud@cs.ubc.ca">Arnaud Doucet</Author>
           <Author email="cholmes@stats.ox.ac.uk">Christopher C. Holmes</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.oxford-man.ox.ac.uk/gpuss/">Application</ContentType>
           <ContentType url="http://www.oxford-man.ox.ac.uk/gpuss/">Paper</ContentType>
           <ContentType url="http://www.oxford-man.ox.ac.uk/gpuss/">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Statistics</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Anthony Lee,lee@stats.ox.ac.uk</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>f549113c-45f8-4aff-96b4-b89db7abe5bb</GUID>
        <Name>3D Image Deconvolution on GPUs</Name>
        <ShortDescription>A popular approach to solving the inverse problem of image deconvolution is to use iterative methods. Iterative deconvolution can provide better results than simpler methods at a cost of higher computational complexity and processing time. In this work we investigate the use of graphics processing units (GPUs) and CUDA to accelerate the execution of one such iterative algorithm, the Richardson-Lucy (RL) algorithm. We compare performance results for a number of 3D Richardson-Lucy implementations on both the CPU and GPU, showing that our best GPU implementation, using Fourier space convolutions (CUFFT), significantly outperforms our best CPU implementation, which uses a publicly available and highly optimised Fast Fourier Transform (FFT) library. L. Domanski, P. Vallotton, and D. Wang. Two and Three-Dimensional Image Deconvolution on Graphics Hardware. In Anderssen, R.S., R.D. Braddock and L.T.H. Newham (eds) 18th World IMACS/MODSIM Congress, Cairns, Australia, pages 1010--1016, 13-17 July 2009. ISBN: 978-0-9758400-7-8. http://www.mssanz.org.au/modsim09/C5/domanski.pdf </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/532_psfteaser_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/532_psfteaser_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName>Commonwealth Scientific and Industrial Research Organisation</OrganizationName>
        <OrganizationURL>http://www.csiro.au/</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>07</ReleaseMonth>
        <ReleaseDay>13</ReleaseDay>
        <ReleaseDateDisplay>07/13/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="Luke.Domanski@csiro.au">Luke Domanski</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.cmis.csiro.au/iap/">Multimedia</ContentType>
           <ContentType url="http://www.mssanz.org.au/modsim09/C5/domanski.pdf">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Imaging</ApplicationType>
           <ApplicationType>MedicalImaging</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Luke Domanski,Luke.Domanski@csiro.au,image deconvolution, image restoration, microscopy, CUDA, CUFFT</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>0313f2ec-fa80-4682-8fb2-8e855c9f2e66</GUID>
        <Name>PAPER - Accelerating Parallel Evaluations of ROCS</Name>
        <ShortDescription>PAPER is a GPU-accelerated implementation of Gaussian molecular shape overlay (the algorithm in OpenEye ROCS) running on NVIDIA graphics cards. We have demonstrated multiple-order-of-magnitude speedups relative to a CPU-based implementation of the same algorithm, and 5x speedup relative to OpenEye ROCS even on low-end graphics hardware (an NVIDIA 8600GT).</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/531_gpuROCS_thumb_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/531_gpuROCS_thumb_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Department of Computer Science, Stanford University</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>05</ReleaseMonth>
        <ReleaseDay>06</ReleaseDay>
        <ReleaseDateDisplay>05/06/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>35</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="ihaque@cs.stanford.edu">Imran Haque</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.cs.stanford.edu/people/ihaque/">Application</ContentType>
           <ContentType url="http://www.cs.stanford.edu/people/ihaque/papers/gpurocs.pdf">Paper</ContentType>
           <ContentType url="http://www.cs.stanford.edu/people/ihaque/">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Life Sciences</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Imran Haque,ihaque@cs.stanford.edu,paper openeye rocs</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>ffe53df7-0183-443c-a269-710b724d1cb7</GUID>
        <Name>librysq</Name>
        <ShortDescription>librysq is a C/C++ implementation of the Rys quadrature for computing arbitrary electron repulsion integrals on CPU and CUDA GPUs. A FORTRAN interface is provided for compatibility with the existing chemistry packages.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/529_MOS-902-8-400x300_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/529_MOS-902-8-400x300_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName>SourceForge</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>03</ReleaseMonth>
        <ReleaseDay>29</ReleaseDay>
        <ReleaseDateDisplay>03/29/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="andrey_asadchev@users.sourceforge.net">andrey asadchev</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://sourceforge.net/projects/librysq/">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>andrey asadchev,andrey_asadchev@users.sourceforge.net</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>0fc84b69-6d38-4463-adae-0d6d3ad2fdb0</GUID>
        <Name>GPU Flame Fractal Renderer</Name>
        <ShortDescription>Renderer for flam3 cosmic recursive fractal flames implemented on GPU. Requires a CUDA-capable graphics card. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/528_screenshot_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/528_screenshot_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName>SourceForge</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>07</ReleaseMonth>
        <ReleaseDay>24</ReleaseDay>
        <ReleaseDateDisplay>07/24/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="Keldor@users.sourceforge.net">Keldor</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://sourceforge.net/projects/flam4/">Application</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Graphics</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Keldor,Keldor@users.sourceforge.net</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>027d6d57-37b1-4657-9df6-394c24092014</GUID>
        <Name>Combining Molecular Dynamics with Bayesian Analysis To Predict and Evaluate Ligand-Binding Mutations in Influenza Hemagglutinin</Name>
        <ShortDescription>The influenza virus infects people and animals by binding to complex sugar molecules on the surface of the respiratory tract. Bird viruses bind most strongly to bird cell-surface sugars and human viruses bind most strongly to human cell-surface sugars. As the recent swine-origin influenza virus has demonstrated, there is considerable overlap between the binding ability of human and pig viruses to cells of the other host. Changes to this binding affinity are one key component for viruses to make a jump between species, and it is difficult to predict the necessary mutations ahead of time. We would like to predict high-risk mutations to enable better surveillance and early control of potential inter-species transmission events. This work represents a first step in that direction, as we examine mutations to H5N1 avian influenza that alter ligand binding. We use Folding@Home as a powerful computational screen to evaluate mutations that will eventually require experimental testing to verify.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/527_ja904_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/527_ja904_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Departments of Chemistry and Structural Biology, Stanford University</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>07</ReleaseMonth>
        <ReleaseDay>28</ReleaseDay>
        <ReleaseDateDisplay>07/28/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="kasson@stanford.edu">Peter M Kasson</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://pubs.acs.org/doi/abs/10.1021/ja904557w">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Life Sciences</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Peter M Kasson,kasson@stanford.edu,folding@home influenza</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>d1233002-2132-43e2-8527-3bf5159ddf19</GUID>
        <Name>ViVid</Name>
        <ShortDescription>Python framework for video processing and content analysis using CUDA for acceleration. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/525_6702-Water_Life_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/525_6702-Water_Life_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName>SourceForge</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>04</ReleaseMonth>
        <ReleaseDay>18</ReleaseDay>
        <ReleaseDateDisplay>04/18/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="Dennis_Lin@sourceforge.net">Dennis Lin</Author>
           <Author email="">Mert Dikmen</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://sourceforge.net/projects/libvivid/">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Video &amp; Audio</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Dennis Lin,Mert Dikmen,Dennis_Lin@sourceforge.net</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>0b495833-3a0f-45e7-88ab-43b3f28cc0fe</GUID>
        <Name>SSbump Generator</Name>
        <ShortDescription>A GUI interface to a tool for generating SSBumps (Self Shadowed Bump Maps). Includes a CUDA GPU rendering extension. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/524_screenshot_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/524_screenshot_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName>SourceForge</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2008</ReleaseYear>
        <ReleaseMonth>12</ReleaseMonth>
        <ReleaseDay>31</ReleaseDay>
        <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="SARGE@users.sourceforge.net">SARGE</Author>
           <Author email="">ssbumpgenerator</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://sourceforge.net/projects/ssbumpgenerator/">Application</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Imaging</ApplicationType>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>SARGE,ssbumpgenerator,SARGE@users.sourceforge.net</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>2444c3a0-b3b0-4584-9b11-6f566f9030ee</GUID>
        <Name>Open64 Compiler and Tools</Name>
        <ShortDescription>The Open64 Compiler and Tools site is dedicated to the continued development of the former SGI Pro64(TM) compiler for the IA64, x86, CUDA and MIPS architecture. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/523_nvidia-2_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/523_nvidia-2_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName>NVIDIA</OrganizationName>
        <OrganizationURL>http://www.nvidia.com</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>04</ReleaseMonth>
        <ReleaseDay>04</ReleaseDay>
        <ReleaseDateDisplay>04/04/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="adouillet@nvidia.com">Alban Douillet</Author>
           <Author email="">Juergen Ributzka</Author>
           <Author email="">Suneel Jain</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://sourceforge.net/projects/open64/">Application</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Alban Douillet,Juergen Ributzka,Suneel Jain,adouillet@nvidia.com</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>90e8e358-dfd7-493e-b829-36373e4ab5ee</GUID>
        <Name>CUDA-EC</Name>
        <ShortDescription>A fast parallel error correction tool for short reads. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/522_cuda-ec_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/522_cuda-ec_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Nanyang Technological University</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>04</ReleaseMonth>
        <ReleaseDay>07</ReleaseDay>
        <ReleaseDateDisplay>04/07/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="Haixiang_Shi@users.sourceforge.net">Haixiang Shi</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://sourceforge.net/projects/cuda-ec/">Application</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Haixiang Shi,Haixiang_Shi@users.sourceforge.net</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>4c8b5fb1-15cd-4251-bec3-f6de3a414800</GUID>
        <Name>pfsRTtmo</Name>
        <ShortDescription>This project provides realtime implementations of popular HDR tone mapping operators on GeForce 8800 GPUs using the CUDA programming environment. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/521_screenshot_thumb_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/521_screenshot_thumb_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName>SourceForge</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2008</ReleaseYear>
        <ReleaseMonth>07</ReleaseMonth>
        <ReleaseDay>30</ReleaseDay>
        <ReleaseDateDisplay>07/30/2008</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="prkipfer@users.sourceforge.net">Peter Kipfer</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://sourceforge.net/projects/pfsrttmo/">Application</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Imaging</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Peter Kipfer,prkipfer@users.sourceforge.net</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>6de04003-5433-4de2-bf86-c308ac51fd12</GUID>
        <Name>GPU Accelerated Real Time HDR Rendering</Name>
        <ShortDescription>A real-time interactive display was developed to showcase timelapse photos by using motion estimation results to produce unique high-dynamic range images as a function of the viewer's position in front of the display. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/520_ir_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/520_ir_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>University of Toronto</OrganizationName>
        <OrganizationURL>http://www.eyetap.org</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>05</ReleaseMonth>
        <ReleaseDay>05</ReleaseDay>
        <ReleaseDateDisplay>05/05/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="">Raymond Lo</Author>
           <Author email="">Eric Tran</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://glogger.mobi/ago">Multimedia</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Graphics</ApplicationType>
           <ApplicationType>Imaging</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Raymond Lo,Eric Tran</Keyword>
        </Keywords>
     </Application>


     <Application>
        <GUID>63d283aa-d137-4705-899c-cbb174ef07ba</GUID>
        <Name>GPU accelerated dose calculations for radiotherapy</Name>
        <ShortDescription>We developed a ray-tracing algorithm for radiotherapy dose calculations that enables (nearly) real-time calculation of the dose for realistic radiotherapy patient data-sets. This reduces the workload for manual determination of the optimal treatment plan. Besides, it offers a speed up for automated optimization of (advanced) radiotherapy treatment plans and/or re-planning after on-line imaging of the patient.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/519_dosedistro_1e6_ptv_only_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/519_dosedistro_1e6_ptv_only_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Academic Medical Center, University of Amsterdam</OrganizationName>
        <OrganizationURL>http://www.amc.nl/radiotherapie</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>08</ReleaseMonth>
        <ReleaseDay>12</ReleaseDay>
        <ReleaseDateDisplay>08/12/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>10</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="m.degreef@amc.uva.nl">M.de Greef</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.youtube.com/watch?v=iUwTLlumKVc">Multimedia</ContentType>
           <ContentType url="http://www.amc.nl/upload/teksten/radiotherapie/hyperthermie/RayForDose-NVIDIA.pdf">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Life Sciences</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>M.de Greef,m.degreef@amc.uva.nl</Keyword>
        </Keywords>
     </Application>


     <Application>
        <!-- Catalog record for OpenMS (academic, single author).
             NOTE(review): the Paper and Code links below carry the same URL (a journal abstract page);
             confirm whether the Code entry should point at a source download instead. -->
        <GUID>ce36336b-2796-4924-9cb9-a79e4d7992e6</GUID>
        <Name>OpenMS</Name>
        <ShortDescription>An open-source framework for mass spectrometry</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/518_logo_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/518_logo_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Center for Bioinformatics, Saarland University</OrganizationName>
        <OrganizationURL>http://bioinf-www.bioinf.uni-sb.de/</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>01</ReleaseMonth>
        <ReleaseDay>14</ReleaseDay>
        <ReleaseDateDisplay>01/14/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="rene@bioinf.uni-sb.de">Rene Hussong</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://bioinformatics.oxfordjournals.org/cgi/content/abstract/25/15/1937">Paper</ContentType>
           <ContentType url="http://bioinformatics.oxfordjournals.org/cgi/content/abstract/25/15/1937">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Life Sciences</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Rene Hussong,rene@bioinf.uni-sb.de,openms proteomics</Keyword>
        </Keywords>
     </Application>


     <Application>
        <!-- Catalog record for the "parallel for" programming model; its only content link is the
             SourceForge code project, and the Keyword repeats the author name and e-mail. -->
        <GUID>228465d3-3c07-4215-ad0f-8bba8d3f87a8</GUID>
        <Name>parallel for</Name>
        <ShortDescription>A data parallel scientific programming model. Compiles efficiently to different platforms like distributed memory (MPI), shared memory multi-processor (pthreads), Cell BE processor, NVIDIA CUDA, SIMD vectorization (SSE, Altivec), and sequential C++ code. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/517_simd_mimd_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/517_simd_mimd_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName>CISCO</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2008</ReleaseYear>
        <ReleaseMonth>12</ReleaseMonth>
        <ReleaseDay>31</ReleaseDay>
        <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="gwz@cisco.com">GWZ</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://sourceforge.net/projects/parallel-for/">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>GWZ,gwz@cisco.com</Keyword>
        </Keywords>
     </Application>


     <Application>
        <!-- Catalog record for multiDAC; its only content link is the SourceForge project page. -->
        <GUID>2c01c333-373e-49f3-9b2e-41a3d14db455</GUID>
        <Name>multiDAC</Name>
        <ShortDescription>multiDAC is intended to become a user-friendly tool for image and video processing in the field of deformation/movement analysis. It is written in C# with some C routines using CPU/GPU parallelization (e.g. CUDA). </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/516_screenshot_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/516_screenshot_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName>SourceForge</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>07</ReleaseMonth>
        <ReleaseDay>30</ReleaseDay>
        <ReleaseDateDisplay>07/30/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="purzel@users.sourceforge.net">purzel42</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://sourceforge.net/projects/multidac/">Application</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Video &amp; Audio</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>purzel42,purzel@users.sourceforge.net</Keyword>
        </Keywords>
     </Application>


     <Application>
        <!-- Catalog record for CUDA-NN. OrganizationName is kept free of trailing whitespace so it
             compares equal to the same organization name used by other records in this file. -->
        <GUID>feb82039-4088-43ec-9118-1d2a1c80b349</GUID>
        <Name>CUDA-NN</Name>
        <ShortDescription>A parallel version of Neural Networks using CUDA for optimization, data mining, etc.</ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/515_datamining7_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/515_datamining7_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Nanyang Technological University</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2008</ReleaseYear>
        <ReleaseMonth>12</ReleaseMonth>
        <ReleaseDay>31</ReleaseDay>
        <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="Haixiang_Shi@users.sourceforge.net">Haixiang Shi</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://sourceforge.net/projects/cuda-nn/">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Life Sciences</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Haixiang Shi,Haixiang_Shi@users.sourceforge.net</Keyword>
        </Keywords>
     </Application>


     <Application>
        <!-- Catalog record for IllustStudio by CELSYS. The SpeedUp value of 35 matches the
             "35 times faster" figure quoted in the description. -->
        <GUID>f11599cd-b3a5-4b08-a9af-801b67ebd826</GUID>
        <Name>IllustStudio</Name>
        <ShortDescription>IllustStudio is the paint tool which allows users to express pen strokes similar to real ones and to expand their range of expressions. IllustStudio has filters corresponding to CUDA and realizes high-speed filtering process by using GPU calculation. According to our research*, with CUDA enables the processing speed 35 times faster than without CUDA. * According to the ratio of CELSYS. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/514_illuststudio_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/514_illuststudio_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType></OrganizationType>
        <OrganizationName>CELSYS,Inc.</OrganizationName>
        <OrganizationURL>http://www.celsys.co.jp/</OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>07</ReleaseMonth>
        <ReleaseDay>29</ReleaseDay>
        <ReleaseDateDisplay>07/29/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>35</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="info@celsys.co.jp">CELSYS,Inc.</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://www.illuststudio.net/">Application</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Digital Content Creation</ApplicationType>
           <ApplicationType>Graphics</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>CELSYS,Inc.</Keyword>
        </Keywords>
     </Application>


     <Application>
        <!-- Catalog record for CUDA-SVM. "Academia" is the organization type, not a URL; no
             organization URL is known for this record, so OrganizationURL stays empty. -->
        <GUID>12ba6f44-1cdc-4fad-a5de-7d9e052f76dc</GUID>
        <Name>CUDA-SVM</Name>
        <ShortDescription>A fast parallel SVM tool based on CUDA. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/513_svm_small.png</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/513_svm_large.png</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Nanyang Technological University</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2008</ReleaseYear>
        <ReleaseMonth>12</ReleaseMonth>
        <ReleaseDay>31</ReleaseDay>
        <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="Haixiang_Shi@users.sourceforge.net">Haixiang Shi</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://sourceforge.net/projects/cuda-svm/">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Haixiang Shi,Haixiang_Shi@users.sourceforge.net</Keyword>
        </Keywords>
     </Application>


     <Application>
        <!-- Catalog record for CUDA-GA; its only content link is the SourceForge code project. -->
        <GUID>3662fbe9-eeec-4413-b43c-d42054cbfa52</GUID>
        <Name>CUDA-GA</Name>
        <ShortDescription>A fast parallel genetic algorithm using CUDA. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/512_GAArt_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/512_GAArt_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Nanyang Technological University</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2008</ReleaseYear>
        <ReleaseMonth>12</ReleaseMonth>
        <ReleaseDay>31</ReleaseDay>
        <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="Haixiang_Shi@users.sourceforge.net">Haixiang Shi</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://sourceforge.net/projects/cuda-ga/">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
           <ApplicationType>Life Sciences</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Haixiang Shi,Haixiang_Shi@users.sourceforge.net</Keyword>
        </Keywords>
     </Application>


     <Application>
        <!-- Catalog record for CUDA-PSO. The url attribute must not carry surrounding whitespace:
             attribute text is handed to consumers as written, so a trailing space ends up in the link. -->
        <GUID>76057c07-a388-46d9-af21-b9bfcc4453c3</GUID>
        <Name>CUDA-PSO</Name>
        <ShortDescription>A parallel version of Particle Swarm Intelligence (PSO) using nVidia's CUDA. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/511_swarm_intelligence_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/511_swarm_intelligence_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Nanyang Technological University</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2008</ReleaseYear>
        <ReleaseMonth>12</ReleaseMonth>
        <ReleaseDay>31</ReleaseDay>
        <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="Haixiang_Shi@users.sourceforge.net">Haixiang Shi</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://sourceforge.net/projects/cuda-pso/">Code</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Imaging</ApplicationType>
           <ApplicationType>Science</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Haixiang Shi,Haixiang_Shi@users.sourceforge.net</Keyword>
        </Keywords>
     </Application>


     <Application>
        <!-- Catalog record for the GPU MHD simulation paper (two authors, one arXiv Paper link).
             NOTE(review): SpeedUp is 100 while the description reports speedups of 2, 106 and 43;
             confirm which figure this field is meant to carry. -->
        <GUID>f4e8deee-ef9f-4047-bafd-0701a9a1bc27</GUID>
        <Name>Magnetohydrodynamics simulations on graphics processing units</Name>
        <ShortDescription>Magnetohydrodynamics (MHD) simulations based on the ideal MHD equations have become a powerful tool for modeling phenomena in a wide range of applications including laboratory, astrophysical, and space plasmas. In general, high-resolution methods for solving the ideal MHD equations are computationally expensive and Beowulf clusters or even supercomputers are often used to run the codes that implemented these methods. With the advent of the Compute Unified Device Architecture (CUDA), modern graphics processing units (GPUs) provide an alternative approach to parallel computing for scientific simulations. In this paper we present, to the authors' knowledge, the first implementation to accelerate computation of MHD simulations on GPUs. Numerical tests have been performed to validate the correctness of our GPU MHD code. Performance measurements show that our GPU-based implementation achieves speedups of 2 (1D problem with 2048 grids), 106 (2D problem with 1024^2 grids), and 43 (3D problem with 128^3 grids), respectively, compared to the corresponding serial CPU MHD implementation. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/510_GPU_MHD_new_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/510_GPU_MHD_new_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Academia</OrganizationType>
        <OrganizationName>Faculty of IT, Macau University of Science and Technology</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>09</ReleaseMonth>
        <ReleaseDay>01</ReleaseDay>
        <ReleaseDateDisplay>09/01/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp>100</SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="hcwong@ieee.org">Hon-Cheng Wong</Author>
           <Author email="uhwong@must.edu.mo">Un-Hong Wong</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://arxiv.org/abs/0908.4362">Paper</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Science</ApplicationType>
           <ApplicationType>Computational Physics</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>Hon-Cheng Wong,hcwong@ieee.org</Keyword>
        </Keywords>
     </Application>


     <Application>
        <!-- Catalog record for CUDA-ClustalW. The Keyword follows the "author,e-mail" form used by
             the other records, taking the e-mail from the Author element of this record. -->
        <GUID>e8dc2667-cce6-47b8-8fe4-c0e18e14972b</GUID>
        <Name>CUDA-ClustalW</Name>
        <ShortDescription>CUDA-ClustalW is publicly available open-source software for high-speed computation of large MSAs running on CUDA-enabled GPUs based on clustalw-2.0.9. The project has been tested on a GeForce GTX 280 graphics card. </ShortDescription>
        <URL></URL>
        <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/509_p53_Hsap_Mmus_Rnor_Frub_ClustalW_6Kb_angle_800p_small.jpg</BoxArtImageURLLow>
        <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/509_p53_Hsap_Mmus_Rnor_Frub_ClustalW_6Kb_angle_800p_large.jpg</BoxArtImageURLMed>
        <BoxArtImageURLHigh></BoxArtImageURLHigh>
        <OrganizationType>Research</OrganizationType>
        <OrganizationName>SourceForge.net</OrganizationName>
        <OrganizationURL></OrganizationURL>
        <ReleaseYear>2009</ReleaseYear>
        <ReleaseMonth>04</ReleaseMonth>
        <ReleaseDay>01</ReleaseDay>
        <ReleaseDateDisplay>04/01/2009</ReleaseDateDisplay>
        <CompatibleGPU></CompatibleGPU>
        <SpeedUp></SpeedUp>
        <SoftwareLicenseType></SoftwareLicenseType>
        <Authors>
           <Author email="nkcslyc@users.sourceforge.net">nkcslyc</Author>
        </Authors>
        <ContentTypes>
           <ContentType url="http://sourceforge.net/projects/cuda-clustalw/">Application</ContentType>
        </ContentTypes>
        <ApplicationTypes>
           <ApplicationType>Numerics</ApplicationType>
        </ApplicationTypes>
        <Keywords>
           <Keyword>nkcslyc,nkcslyc@users.sourceforge.net</Keyword>
        </Keywords>
     </Application>

    <Application>
      <!-- Catalog record for the CUDA wrapper library (two authors, one SourceForge Code link). -->
      <GUID>49b7c770-57ef-4530-b9ea-ea804d21c7ff</GUID>
      <Name>Cuda_Wrapper</Name>
      <ShortDescription>The CUDA wrapper library provides means for an efficient resource sharing and resource protection on multi-user GPU clusters. It implements the following functionality: 1) Virtualization of the physical GPU devices; 2) Ensuring NUMA affinity for GPUs.</ShortDescription>
      <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/507_numerics_rayleighbenard3d_small.png</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/507_numerics_rayleighbenard3d_large.png</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>University of Illinois at Urbana-Champaign</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2009</ReleaseYear>
      <ReleaseMonth>07</ReleaseMonth>
      <ReleaseDay>21</ReleaseDay>
      <ReleaseDateDisplay>07/21/2009</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
        <Author email="gshi@ncsa.uiuc.edu">Guochun Shi</Author>
        <Author email="">Jeremy Enos</Author>
      </Authors>
      <ContentTypes>
        <ContentType url="http://sourceforge.net/projects/cudawrapper/">Code</ContentType>
      </ContentTypes>
      <ApplicationTypes>
        <ApplicationType>Numerics</ApplicationType>
        <ApplicationType>Libraries</ApplicationType>
      </ApplicationTypes>
      <Keywords>
        <Keyword>Guochun Shi,Jeremy Enos,gshi@ncsa.uiuc.edu</Keyword>
      </Keywords>
    </Application>

    <Application>
      <!-- Catalog record for CUDA Neural Network. Each application type is its own ApplicationType
           element, as in the other records, so per-type matching sees three separate values. -->
      <GUID>54d76ef2-f4b0-4e80-a3c3-ee8338606f13</GUID>
      <Name>CUDA Neural Network</Name>
      <ShortDescription>Implementation of a feed-forward backpropagation artificial neural network using CUDA.</ShortDescription>
      <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/506_neural_network_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/506_neural_network_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Research</OrganizationType>
      <OrganizationName>Sourceforge</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>03</ReleaseDay>
      <ReleaseDateDisplay>12/03/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
        <Author email="pyrevenant@users.sourceforge.net">Pyrevenant</Author>
      </Authors>
      <ContentTypes>
        <ContentType url="http://sourceforge.net/projects/cudann/">Application</ContentType>
      </ContentTypes>
      <ApplicationTypes>
        <ApplicationType>Life Sciences</ApplicationType>
        <ApplicationType>Libraries</ApplicationType>
        <ApplicationType>Science</ApplicationType>
      </ApplicationTypes>
      <Keywords>
        <Keyword>Pyrevenant</Keyword>
      </Keywords>
    </Application>

    <Application>
      <!-- Catalog record for cuda-z; its only content link is the SourceForge project page. -->
      <GUID>eabc31be-c665-455a-95bd-0d0e7dd532ab</GUID>
      <Name>cuda-z</Name>
      <ShortDescription>Simple program that displays information about CUDA-enabled devices. Program is equipped with GPU performance test. </ShortDescription>
      <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/505_CUDA-Z_2_small.png</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/505_CUDA-Z_2_large.png</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Research</OrganizationType>
      <OrganizationName>SourceForge.net</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2009</ReleaseYear>
      <ReleaseMonth>04</ReleaseMonth>
      <ReleaseDay>13</ReleaseDay>
      <ReleaseDateDisplay>04/13/2009</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
        <Author email="andrew_golovnia@users.sourceforge.net">Andriy Golovnya</Author>
      </Authors>
      <ContentTypes>
        <ContentType url="http://sourceforge.net/projects/cuda-z/">Application</ContentType>
      </ContentTypes>
      <ApplicationTypes>
        <ApplicationType>Numerics</ApplicationType>
      </ApplicationTypes>
      <Keywords>
        <Keyword>Andriy Golovnya,andrew_golovnia@users.sourceforge.net</Keyword>
      </Keywords>
    </Application>

    <Application>
      <!-- Catalog record for the troposphere slant delay paper (one IEEE Xplore Paper link).
           NOTE(review): the Keyword names a fourth person (Kondo Tetsuro) who has no Author
           element; confirm whether an author entry is missing. -->
      <GUID>bbf86cd6-59ac-443b-ab02-8ba8ef3bbf60</GUID>
      <Name>Computation of Troposphere Slant Delays on a GPU</Name>
      <ShortDescription>Description (i.e. abstract of the paper): The computation of ray-traced troposphere delays which can be utilized for space geodetic applications is a time-consuming effort when a large number of rays has to be calculated. On the other hand, computation time can be tremendously reduced when algorithms are capable of supporting parallel processing architectures. Thus, by the use of an off-the-shelf graphics processing unit (GPU), it is demonstrated that troposphere slant delays can be computed very efficiently, without loss of accuracy. An adopted ray-tracing algorithm is presented, and results from GPU computations are compared with those obtained from calculations on a standard personal computer's CPU.</ShortDescription>
      <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/504_IEEE_GPU_figureC_new_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/504_IEEE_GPU_figureC_new_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Research</OrganizationType>
      <OrganizationName>National Institute of Information and Communications Technology, Japan</OrganizationName>
      <OrganizationURL>http://www.nict.go.jp</OrganizationURL>
      <ReleaseYear>2009</ReleaseYear>
      <ReleaseMonth>06</ReleaseMonth>
      <ReleaseDay>26</ReleaseDay>
      <ReleaseDateDisplay>06/26/2009</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp>18</SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
        <Author email="hobiger@nict.go.jp">Hobiger Thomas</Author>
        <Author email="">Ichikawa Ryuichi</Author>
        <Author email="">Koyama Yasuhiro</Author>
      </Authors>
      <ContentTypes>
        <ContentType url="http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&amp;arnumber=5129275&amp;isnumber=4358825">Paper</ContentType>
      </ContentTypes>
      <ApplicationTypes>
        <ApplicationType>Geoscience</ApplicationType>
        <ApplicationType>Science</ApplicationType>
      </ApplicationTypes>
      <Keywords>
        <Keyword>Hobiger Thomas, Ichikawa Ryuichi, Koyama Yasuhiro, Kondo Tetsuro</Keyword>
      </Keywords>
    </Application>

    <Application>
      <!-- Catalog record for Cuda ITK; the Keyword element is present but empty.
           NOTE(review): the only content link is a SourceForge project page yet it is labelled
           "Paper"; confirm whether "Code" or "Application" was intended. -->
      <GUID>e97270b7-8c73-45fa-ac57-96e9ab59ca88</GUID>
      <Name>Cuda ITK</Name>
      <ShortDescription>This project shows how to integrate NVIDIA CUDA GPU programming API into ITK (Insight Segmentation and Registration Toolkit) library.</ShortDescription>
      <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/503_226314_small.png</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/503_226314_large.png</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Harvard University</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2009</ReleaseYear>
      <ReleaseMonth>06</ReleaseMonth>
      <ReleaseDay>28</ReleaseDay>
      <ReleaseDateDisplay>06/28/2009</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
        <Author email="wkjeong@seas.harvard.edu">Won-Ki Jeong</Author>
      </Authors>
      <ContentTypes>
        <ContentType url="http://sourceforge.net/projects/cudaitk/">Paper</ContentType>
      </ContentTypes>
      <ApplicationTypes>
        <ApplicationType>Numerics</ApplicationType>
      </ApplicationTypes>
      <Keywords>
        <Keyword></Keyword>
      </Keywords>
    </Application>

    <Application>
      <!-- Catalog record for Phobos; its only content link is a Bitbucket code repository. -->
      <GUID>e9097706-9b7a-4d12-9104-a44bd1952348</GUID>
      <Name>Phobos</Name>
      <ShortDescription>Phobos is a continuous map-reduce framework built upon NVIDIA CUDA</ShortDescription>
      <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/502_1_PHOBOS_461_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/502_1_PHOBOS_461_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>HKUST</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2009</ReleaseYear>
      <ReleaseMonth>01</ReleaseMonth>
      <ReleaseDay>01</ReleaseDay>
      <ReleaseDateDisplay>01/01/2009</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
        <Author email="saven@cse.ust.hk">Wenbin Fang</Author>
      </Authors>
      <ContentTypes>
        <ContentType url="http://bitbucket.org/mansu/phobos/">Code</ContentType>
      </ContentTypes>
      <ApplicationTypes>
        <ApplicationType>Libraries</ApplicationType>
      </ApplicationTypes>
      <Keywords>
        <Keyword>Wenbin Fang,saven@cse.ust.hk</Keyword>
      </Keywords>
    </Application>

    <Application>
      <!-- Catalog record for cudatemplates; the Keyword element is present but empty. -->
      <GUID>212c683f-9d5a-4654-9edc-c3e3fcfe8727</GUID>
      <Name>cudatemplates</Name>
      <ShortDescription>"CUDA Templates" is a collection of C++ template classes and functions which provide a consistent interface to NVidia's "Compute Unified Device Architecture" (CUDA), hiding much of the complexity of the underlying CUDA functions from the programmer.</ShortDescription>
      <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/501_CUDATemplates_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/501_CUDATemplates_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Technische Universität Graz</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
        <Author email="grabner@icg.tu-graz.ac.at">Markus Grabner</Author>
      </Authors>
      <ContentTypes>
        <ContentType url="http://sourceforge.net/projects/cudatemplates/">Application</ContentType>
      </ContentTypes>
      <ApplicationTypes>
        <ApplicationType>Numerics</ApplicationType>
        <ApplicationType>Science</ApplicationType>
      </ApplicationTypes>
      <Keywords>
        <Keyword></Keyword>
      </Keywords>
    </Application>

    <Application>
      <!-- Catalog record for the QDS-based Euler solver paper. The SpeedUp value of 35 mirrors the
           "over 35 times" figure in the description; the Paper link is a PDF on nvidia.com. -->
      <GUID>d321a794-7411-4623-a966-4586c0d149e8</GUID>
      <Name>Application of a Kinetic Theory based solver of the Euler Equations using GPU</Name>
      <ShortDescription>Presented is a modified form of the Quiet Direct Simulation (QDS) method [1] adapted for application of Graphics Processing Units (GPU) for flux calculation. Fluxes between source and destination cells calculated by QDS are flux-vector split and (on a regular Cartesian grid) a function of the source cell alone. The resulting advantage is the rapid calculation of fluxes between cells without the prior exchange of information between them, allowing highly efficient calculation using GPU. Various flow problems have been solved and consistent speed-ups of over 35 times (when compared to an equivalent single CPU code) are reported.</ShortDescription>
      <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/500_kinetic_small.png</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/500_kinetic_large.png</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>National Centre for High Performance Computing, Hsinchu, Taiwan</OrganizationName>
      <OrganizationURL>http://www.nchc.org.tw/en/</OrganizationURL>
      <ReleaseYear>2009</ReleaseYear>
      <ReleaseMonth>05</ReleaseMonth>
      <ReleaseDay>18</ReleaseDay>
      <ReleaseDateDisplay>05/18/2009</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp>35</SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
        <Author email="msmith@nchc.org.tw">Matthew Smith</Author>
      </Authors>
      <ContentTypes>
        <ContentType url="http://www.nvidia.com/content/cudazone/CUDABrowser/downloads/papers/ParCFDRevisedAbs_Final.pdf">Paper</ContentType>
      </ContentTypes>
      <ApplicationTypes>
        <ApplicationType>Computational Fluid Dynamics</ApplicationType>
      </ApplicationTypes>
      <Keywords>
        <Keyword>Matthew Smith,msmith@nchc.org.tw,Quiet Direct Simulation, Kinetic Theory</Keyword>
      </Keywords>
    </Application>

    <Application>
      <!-- Catalog record for the Elegant Mathematics CG solvers: a Commercial organization with an
           "Open source" license type; the Code link is a zip download. -->
      <GUID>426e2b01-84c7-4c8c-a935-c652aee3ba78</GUID>
      <Name>Conjugated Gradient CUDA and CPU solvers for float, double and quad precision</Name>
      <ShortDescription>Free CUDA CG! Take advantage from our full featured 150GFlop/s Conjugated Gradient CUDA and CPU solvers for float, double and quad precision for free.</ShortDescription>
      <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/499_CG_small.png</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/499_CG_large.png</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Commercial</OrganizationType>
      <OrganizationName>Elegant Mathematics Ltd</OrganizationName>
      <OrganizationURL>http://www.elegant-mathematics.com/</OrganizationURL>
      <ReleaseYear>2009</ReleaseYear>
      <ReleaseMonth>01</ReleaseMonth>
      <ReleaseDay>08</ReleaseDay>
      <ReleaseDateDisplay>01/08/2009</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType>Open source</SoftwareLicenseType>
      <Authors>
        <Author email="info@elegant-mathematics.com">Elegant Mathematics Ltd</Author>
      </Authors>
      <ContentTypes>
        <ContentType url="http://www.elegant-mathematics.com/images/EM-Free-CG.zip">Code</ContentType>
      </ContentTypes>
      <ApplicationTypes>
        <ApplicationType>Numerics</ApplicationType>
      </ApplicationTypes>
      <Keywords>
        <Keyword>Elegant Mathematics Ltd,info@elegant-mathematics.com</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>ef371856-7d1b-4d97-ab4c-6e73f9925992</GUID>
      <Name>GAMER: a GPU-Accelerated Adaptive Mesh Refinement Code for Astrophysics</Name>
<ShortDescription>We present the newly developed code, GAMER (GPU-accelerated Adaptive MEsh Refinement code), which has adopted a novel approach to improve the performance of adaptive mesh refinement (AMR) astrophysical simulations by a large factor with the use of the graphic processing unit (GPU). The AMR implementation is based on a hierarchy of grid patches with an oct-tree data structure. We adopt a three-dimensional relaxing TVD scheme for the hydrodynamic solver, and a multi-level relaxation scheme for the Poisson solver. Both solvers have been implemented in GPU, by which hundreds of patches can be advanced in parallel. The computational overhead associated with the data transfer between CPU and GPU is carefully reduced by utilizing the capability of asynchronous memory copies in GPU, and the computing time of the ghost-zone values for each patch is made to diminish by overlapping it with the GPU computations. We demonstrate the accuracy of the code by performing several standard test problems in astrophysics. GAMER is a parallel code that can be run in a multi-GPU cluster system. We measure the performance of the code by performing purely-baryonic cosmological simulations in different hardware implementations, in which detailed timing analyses provide comparison between the computations with and without GPU(s) acceleration. Maximum speed-up factors of 12.19 and 10.47 are demonstrated using 1 GPU with 4096^3 effective resolution and 16 GPUs with 8192^3 effective resolution, respectively.</ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/498_fig18_small.png</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/498_fig18_large.png</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Department of Physics, National Taiwan University</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2009</ReleaseYear>
      <ReleaseMonth>07</ReleaseMonth>
      <ReleaseDay>30</ReleaseDay>
      <ReleaseDateDisplay>07/30/2009</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp>12</SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="b88202011@ntu.edu.tw">Hsi-Yu Schive</Author>
      </Authors>
      <ContentTypes>
              <ContentType url="http://arxiv.org/abs/0907.3390">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Computational Fluid Dynamics</ApplicationType>
	<ApplicationType>Science</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Hsi-Yu Schive,b88202011@ntu.edu.tw</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>ab53f652-f3d5-41b7-ab63-10bbda728871</GUID>
      <Name>Data Parallel Bin-Based Indexing for Answering Queries on Multi-Core Architectures</Name>
      <ShortDescription>The multi-core trend in CPUs and GPUs offers new opportunities for the database community. The increase of cores at exponential rates is likely to affect virtually every server and client in the coming decade, and presents database management systems with a huge, compelling disruption that will radically change how processing is done. This paper presents a new parallel indexing data structure for answering queries that takes full advantage of the increasing thread-level parallelism emerging in multi-core architectures. In our approach, our Data Parallel Bin-based Index Strategy (DP-BIS) first bins the base data, and then partitions and stores the values in each bin as a separate, bin-based data cluster. In answering a query, the procedures for examining the bin numbers and the bin-based data clusters offer the maximum possible level of concurrency; each record is evaluated by a single thread and all threads are processed simultaneously in parallel. We implement and demonstrate the effectiveness of DP-BIS on two multi-core architectures: a multi-core CPU and a GPU. The concurrency afforded by DP-BIS allows us to fully utilize the thread-level parallelism provided by each architecture--for example, our GPU-based DP-BIS implementation simultaneously evaluates over 12,000 records with an equivalent number of concurrently executing threads. In comparing DP-BIS's performance across these architectures, we show that the GPU-based DP-BIS implementation requires significantly less computation time to answer a query than the CPU-based implementation. We also demonstrate in our analysis that DP-BIS provides better overall performance than the commonly utilized CPU and GPU-based projection index. 
Finally, due to data encoding, we show that DP-BIS accesses significantly smaller amounts of data than index strategies that operate solely on a column's base data; this smaller data footprint is critical for parallel processors that possess limited memory resources (e.g. GPUs).
      </ShortDescription> 
      <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/497_960_small.png</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/497_960_large.png</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>University of California at Davis</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2009</ReleaseYear>
      <ReleaseMonth>06</ReleaseMonth>
      <ReleaseDay>02</ReleaseDay>
      <ReleaseDateDisplay>06/02/2009</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp>18</SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="jgosink@ucdavis.edu">Luke J. Gosink</Author>
      </Authors>
      <ContentTypes>
              <ContentType url="http://www.idav.ucdavis.edu/publications/print_pub?pub_id=960">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Data Parallel Database Indexing</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Luke J. Gosink,jgosink@ucdavis.edu</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>47cb4eda-e210-4db8-bb80-d6ae342dd454</GUID>
      <Name>Physical-Space Refraction-Corrected Transmission Ultrasound Computed Tomography Made Computationally Practical</Name>
<ShortDescription>Transmission Ultrasound Computed Tomography (CT) is strongly affected by the acoustic refraction properties of the imaged tissue, and proper modeling and correction of these effects is crucial to achieving high-quality image reconstructions. Excellent results can be obtained when these physics effects are incorporated, but at considerable computational expense. We have used CUDA to conceive a framework that implements refractive Ultrasound CT and meets the interactive demands of clinical practice, without a loss in reconstruction quality.</ShortDescription>
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/497_us_img_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/497_us_img_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Stony Brook University</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>09</ReleaseMonth>
      <ReleaseDay>11</ReleaseDay>
      <ReleaseDateDisplay>09/11/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp>85</SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="mueller@cs.sunysb.edu">Klaus Mueller</Author>
      </Authors>
      <ContentTypes>
              <ContentType url="http://www.cs.sunysb.edu/~mueller/papers/MICCAI08_final_submit.pdf">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>MedicalImaging</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Klaus Mueller,mueller@cs.sunysb.edu</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>b912f9a1-2627-4d4f-9f4f-4da3eff3ca78</GUID>
      <Name>Python Parallel Utilities</Name>
<ShortDescription>NVIDIA CUDA and MPI Python wrappers. These wrappers are written in pure C; no SWIG or Boost is necessary. The CUDA wrapper exposes the CUDA runtime and driver APIs.</ShortDescription>
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/496_smoothed_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/496_smoothed_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Seismic Laboratory for Imaging and Modeling</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="srossross@gmail.com">Sean Ross-Ross</Author>
      </Authors>
      <ContentTypes>
              <ContentType url="http://sourceforge.net/projects/pythonparallelu/">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Programming Tools</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword></Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>c58a8810-432f-4757-a91f-c80faabe20ab</GUID>
      <Name>Signal Integrity Simulations</Name>
<ShortDescription>Agilent Technologies Inc. (NYSE:A) announced its work with NVIDIA to accelerate signal integrity simulations using NVIDIA's Compute Unified Device Architecture (CUDA)-based Graphics Processing Units (GPU). The association is expected to yield the commercial release of a GPU-enabled Advanced Design System (ADS) Transient Convolution Simulator that will allow signal integrity designers to run these simulations dramatically faster than was previously possible.</ShortDescription>
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/495_hyperlinx-eye_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/495_hyperlinx-eye_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Commercial</OrganizationType>
      <OrganizationName>EDA Geek News Staff in Models, Simulations</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>08</ReleaseMonth>
      <ReleaseDay>26</ReleaseDay>
      <ReleaseDateDisplay>08/26/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="contact_us@agilent.com">EDA Geek News Staff in Models, Simulations</Author>
      </Authors>
      <ContentTypes>
              <ContentType url="http://edageek.com/2008/08/26/convolution-simulator/">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Signal Processing</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>contact_us@agilent.com</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>d0dbf768-8c4a-45f6-a6c3-8c38dc100a98</GUID>
      <Name>Applying Modern Soft and Hardware Technologies for Computational Steering Approaches in Computational Fluid Dynamics</Name>
<ShortDescription>In this article we present an educational simulation tool, FlowSim 2007 CUDA edition, a computational steering application for interactive 2D flow simulation based on the Lattice Boltzmann Method. The application combines a comfortable user interface as well as a convenient development platform on the one hand and a high performance flow solver on the other hand. The user interface is implemented using the Microsoft .NET Framework whereas the Lattice Boltzmann kernel is based on the Compute Unified Device Architecture (CUDA) by nVIDIA running on GeForce 8 series featuring G8X GPUs [2]. The gap between the managed intermediate language (IL) code and the hardware specific native code is filled using the recently introduced C++/CLI programming language [1]. We demonstrate that this integrated desktop approach can deliver a performance that exceeds that of a high end PC by at least an order of magnitude. In our conclusion we will focus on extensions to three dimensions and clusters of GPUs. </ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/494_p175_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/494_p175_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Technology Institute at TU Braunschweig</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2007</ReleaseYear>
      <ReleaseMonth>10</ReleaseMonth>
      <ReleaseDay>26</ReleaseDay>
      <ReleaseDateDisplay>10/26/2007</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="j.linxweiler@tu-bs.de">Jan Linxweiler</Author>
         <Author email="">Jonas Tölke</Author>         <Author email="">Manfred Krafczyk</Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://www2.computer.org/portal/web/csdl/doi/10.1109/CW.2007.53">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Computational Fluid Dynamics</ApplicationType>
	<ApplicationType>Science</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Jan Linxweiler,Jonas Tölke,Manfred Krafczyk,</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>a6e5c287-cebe-4a52-b00f-7ec58a5dbdd2</GUID>
      <Name>Computer generated hologram with geometric occlusion using GPU-accelerated depth buffer rasterization for three-dimensional display</Name>
<ShortDescription>We present a method of rapidly producing computer-generated holograms that exhibit geometric occlusion in the reconstructed image. Conceptually, a bundle of rays is shot from every hologram sample into the object volume. We use z buffering to find the nearest intersecting object point for every ray and add its complex field contribution to the corresponding hologram sample. Each hologram sample belongs to an independent operation, allowing us to exploit the parallel computing capability of modern programmable graphics processing units (GPUs). Unlike algorithms that use points or planar segments as the basis for constructing the hologram, our algorithm's complexity is dependent on fixed system parameters, such as the number of ray-casting operations, and can therefore handle complicated models more efficiently. The finite number of hologram pixels is, in effect, a windowing function, and from analyzing the Wigner distribution function of windowed free-space transfer function we find an upper limit on the cone angle of the ray bundle. Experimentally, we found that an angular sampling distance of 0.01° for a 2.66° cone angle produces acceptable reconstruction quality.</ShortDescription>
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/493_h15g_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/493_h15g_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>University of Cambridge, Electrical Engineering Dept.</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2009</ReleaseYear>
      <ReleaseMonth>07</ReleaseMonth>
      <ReleaseDay>17</ReleaseDay>
      <ReleaseDateDisplay>07/17/2009</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="">Rick H.-Y. Chen</Author>
         <Author email="">Timothy D. Wilkinson </Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://www.opticsinfobase.org/ao/abstract.cfm?URI=ao-48-21-4246">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType></ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Rick H.-Y. Chen,Timothy D. Wilkinson </Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>6f121072-b6d3-47ba-9e6b-e872695eaaf8</GUID>
      <Name>Real-Time Fringe Pattern Generation with High Quality</Name>
<ShortDescription>A hologram computation procedure and its GPU implementation are presented. The procedure is based on partitioning. Each segment has an approximate but simpler frequency domain representation. Quality of the results is comparable to Fresnel holograms.</ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/492_3d-scan1_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/492_3d-scan1_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Department of Electrical and Electronics Engineering, Bilkent University</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2009</ReleaseYear>
      <ReleaseMonth>04</ReleaseMonth>
      <ReleaseDay>30</ReleaseDay>
      <ReleaseDateDisplay>04/30/2009</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="hjkang@ee.bilkent.edu.tr">Hoonjong Kang</Author>
         <Author email="">Fahri Yaraş</Author>         <Author email="">Levent Onural</Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://www.opticsinfobase.org/abstract.cfm?URI=DH-2009-DTuB7">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Imaging</ApplicationType>
	<ApplicationType>Science</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Hoonjong Kang, Fahri Yaraş, Levent Onural,hjkang@ee.bilkent.edu.tr</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>922028e2-56a9-45cf-99de-ceaa8d0a5370</GUID>
      <Name>Real-Time Multiple SLM Color Holographic Display Using Multiple GPU Acceleration</Name>
<ShortDescription>A real-time color holographic video display system computes holograms from point cloud of a rigid object by using multi-GPU system and uses three different colored LEDs for reconstruction. Experimental results are satisfactory.</ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/491_slm_small.png</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/491_slm_large.png</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Dept. of Electrical and Electronics Eng., Bilkent University</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2009</ReleaseYear>
      <ReleaseMonth>04</ReleaseMonth>
      <ReleaseDay>30</ReleaseDay>
      <ReleaseDateDisplay>04/30/2009</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="chair@ee.bilkent.edu.tr">Fahri Yaraş</Author>
         <Author email="">Hoonjong Kang</Author>         <Author email="">Levent Onural</Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://www.opticsinfobase.org/abstract.cfm?URI=DH-2009-DWA4">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Imaging</ApplicationType>
	<ApplicationType>Science</ApplicationType>
	<ApplicationType>Video &amp; Audio</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Fahri Yaraş, Hoonjong Kang,Levent Onural,</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>8bbd6e15-496a-4bb3-9053-b3811821e510</GUID>
      <Name>Fast Hardware-Accelerated Volume Rendering of CT Scans</Name>
<ShortDescription>As CT scanning is a very common medical imaging method, we propose new hardware-based algorithms using GPU (Graphical Processor Unit) programming for rapid visualization. Firstly, 3D volumes are constructed from CT scans. Then volume rendering is used to display anatomical structures via algorithms founded on improved ray casting and 2D textures. Our methods achieve interactive rendering rates and require an ordinary PC with an off-the-shelf graphics card. We expect our approach to be useful to medical practitioners for handling modern, large-scale medical datasets.</ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/490_ct_head_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/490_ct_head_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Zhejiang University</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2007</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>01</ReleaseDay>
      <ReleaseDateDisplay>12/01/2007</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="zgpan@cad.zju.edu.cn">Ronghua Liang</Author>
         <Author email="">Zhigeng Pan</Author>         <Author email="">Meleagros Krokos</Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://www.opticsinfobase.org/jdt/abstract.cfm?URI=jdt-4-4-431">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>MedicalImaging</ApplicationType>
	<ApplicationType>Life Sciences</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Ronghua Liang, Zhigeng Pan, Meleagros Krokos,zgpan@cad.zju.edu.cn</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>f10e6a41-90c1-4b44-8a9a-a427e74974f8</GUID>
      <Name>GPU-Based Acceleration Method for Coherent Holographic Stereogram Calculation</Name>
<ShortDescription>In this paper, we show an acceleration method of the coherent holographic stereogram calculation by means of the GPU, and demonstrate the performance gain up to a factor of over 10 compared with CPU-based computing.</ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/489_mobius_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/489_mobius_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Department of Electrical and Electronics Engineering, Bilkent University</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>03</ReleaseMonth>
      <ReleaseDay>16</ReleaseDay>
      <ReleaseDateDisplay>03/16/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp>10</SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="hjkang@ee.bilkent.edu.tr">Hoonjong Kang</Author>
         <Author email="">Takeshi Yamaguchi</Author>         <Author email="">Hiroshi Yoshikawa</Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://www.opticsinfobase.org/abstract.cfm?URI=DH-2008-DWA4">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Imaging</ApplicationType>
	<ApplicationType>Science</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Hoonjong Kang, Takeshi Yamaguchi,Hiroshi Yoshikawa,hjkang@ee.bilkent.edu.tr</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>f9c1b8ad-db0d-429f-8863-03ded9a69dab</GUID>
      <Name>Atmospheric wavefront phase recovery by use of specialized hardware: graphical processing units and field-programmable gate arrays</Name>
<ShortDescription>To achieve the wavefront phase-recovery stage of an adaptive-optics loop computed in real time for 32x32 or a greater number of subpupils in a Shack-Hartmann sensor, we present here, for what is to our knowledge the first time, preliminary results that we obtained by using innovative techniques: graphical processing units (GPUs) and field-programmable gate arrays (FPGAs). We describe the stream-computing paradigm of the GPU and adapt a zonal algorithm to take advantage of the parallel computational power of the GPU. We also present preliminary results we obtained by use of FPGAs on the same algorithm. GPUs have proved to be a promising technique, but FPGAs are already a feasible solution to adaptive-optics real-time requirements, even for a large number of subpupils.</ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/488_08_06a_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/488_08_06a_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>University of La Laguna, Spain</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2004</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2004</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="tpc3dtvcon09@tnt.uni-hannover.de">Jose G. Marichal-Hernandez</Author>
         <Author email="">Luis F. Rodriguez-Ramos</Author>         <Author email="">Fernando Rosa</Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://www.opticsinfobase.org/ao/abstract.cfm?URI=ao-44-35-7587">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Imaging</ApplicationType>
	<ApplicationType>Science</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Jose G. Marichal-Hernandez, Luis F. Rodriguez-Ramos, Fernando Rosa,tpc3dtvcon09@tnt.uni-hannover.de</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>8a5f261c-09d8-42a5-8726-7100bdde85c8</GUID>
      <Name>Acceleration method of computing a compensated phase-added stereogram</Name>
<ShortDescription>We have implemented experimental code to compute a compensated phase-added stereogram (CPAS), which was proposed in a previous paper, on a graphic processing unit (GPU). In this paper, we show an acceleration method for CPAS computation by means of the GPU and compare the computation time between CPU-based and GPU-based calculations, which are programmed in our laboratories. In addition, we demonstrate their reconstructed images. As a result, we could achieve a performance gain of a factor of over 33 compared with a CPU-based computing environment and digital holograms can be displayed at 30 frames per second with 15,000 points.</ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/487_stereo_small.png</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/487_stereo_large.png</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Department of Electrical and Electronics Engineering, Bilkent University</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>10</ReleaseMonth>
      <ReleaseDay>24</ReleaseDay>
      <ReleaseDateDisplay>10/24/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp>33</SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="hjkang@ee.bilkent.edu.tr">Hoonjong Kang</Author>
         <Author email="">Takeshi Yamaguchi</Author>         <Author email="">Hiroshi Yoshikawa</Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://www.opticsinfobase.org/ao/abstract.cfm?URI=ao-47-31-5784">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Imaging</ApplicationType>
	<ApplicationType>Science</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Hoonjong Kang, Takeshi Yamaguchi, Hiroshi Yoshikawa, hjkang@ee.bilkent.edu.tr</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>784de106-a069-4478-8a7e-92a0aed3649b</GUID>
      <Name>Hologram synthesis for photorealistic reconstruction</Name>
<ShortDescription>Computation of diffraction patterns, and thus holograms, of scenes with photorealistic properties is a highly complicated and demanding process. An algorithm, based primarily on computer graphics methods, for computing full-parallax diffraction patterns of complicated surfaces with realistic texture and reflectivity properties is proposed and tested. The algorithm is implemented on single-CPU, multiple-CPU and GPU platforms. An alternative algorithm, which implements reduced occlusion diffraction patterns for much faster but somewhat lower quality results, is also developed and tested. The algorithms allow GPU-aided calculations and easy parallelization. Both numerical and optical reconstructions are conducted. The results indicate that the presented algorithms compute diffraction patterns that provide successful photorealistic reconstructions; the computation times are acceptable especially on the GPU implementations.</ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/486_image018_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/486_image018_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>JOSA A</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>11</ReleaseMonth>
      <ReleaseDay>24</ReleaseDay>
      <ReleaseDateDisplay>11/24/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="mjandakiv@zcu.cz ">Martin Janda</Author>
         <Author email="">Ivo Hanak</Author>         <Author email="">Levent Onural</Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://www.opticsinfobase.org/josaa/abstract.cfm?URI=josaa-25-12-3083">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Imaging</ApplicationType>
	<ApplicationType>Numerics</ApplicationType>
	<ApplicationType>Science</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Martin Janda,Ivo Hanak, Levent Onural,mjandakiv@zcu.cz </Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>9d2fbd2e-e241-478f-90a1-f40cb04ed084</GUID>
      <Name>Real-time digital holographic microscopy</Name>
<ShortDescription>Digital holographic microscopy (DHM) is a well-known powerful method allowing both the amplitude and phase of a specimen to be simultaneously observed. In order to obtain a reconstructed image from a hologram, numerous calculations for the Fresnel diffraction are required. The Fresnel diffraction can be accelerated by the FFT (Fast Fourier Transform) algorithm. However, real-time reconstruction from a hologram is difficult even if we use a recent central processing unit (CPU) to calculate the Fresnel diffraction by the FFT algorithm. In this paper, we describe a real-time DHM system using a graphic processing unit (GPU) with many stream processors, which allows use as a highly parallel processor. The computational speed of the Fresnel diffraction using the GPU is faster than that of recent CPUs. The real-time DHM system can obtain reconstructed images from holograms whose size is 512x512 grids in 24 frames per second.</ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/485_holo_small.png</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/485_holo_large.png</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Graduate School of Science and Engineering, Yamagata University</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2009</ReleaseYear>
      <ReleaseMonth>07</ReleaseMonth>
      <ReleaseDay>23</ReleaseDay>
      <ReleaseDateDisplay>07/23/2009</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="shimo@yz.yamagata-u.ac.jp">Tomoyoshi Shimobaba</Author>
         <Author email="">Yoshikuni Sato</Author>         <Author email="">Junya Miura</Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://www.opticsinfobase.org/oe/viewmedia.cfm?URI=oe-16-16-11776-2">Multimedia</ContentType>        <ContentType url="http://www.opticsinfobase.org/DirectPDFAccess/C7FF6525-BDB9-137E-C6A86018F3729D3B_170078.pdf?da=1&amp;id=170078&amp;seq=0&amp;CFID=50857756&amp;CFTOKEN=72234862">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Imaging</ApplicationType>
	<ApplicationType>Science</ApplicationType>
	<ApplicationType>Numerics</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Tomoyoshi Shimobaba,Yoshikuni Sato,Junya Miura,shimo@yz.yamagata-u.ac.jp</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>5068f09c-0433-4ba5-9565-f77fbc04d4c8</GUID>
      <Name>Real-time liquid-crystal atmosphere turbulence simulator</Name>
<ShortDescription>To generate time-evolving atmosphere turbulence in real time, a phase-generating method for our liquid-crystal (LC) atmosphere turbulence simulator (ATS) is derived based on the Fourier series (FS) method. A real matrix expression for generating turbulence phases is given and calculated with a graphic processing unit (GPU), the GeForce 8800 Ultra. A liquid crystal on silicon (LCOS) with 256x256 pixels is used as the turbulence simulator. The total time to generate a turbulence phase is about 7.8 ms for calculation and readout with the GPU. A parallel processing method of calculating and sending a picture to the LCOS is used to improve the simulating speed of our LC ATS. Therefore, the real-time turbulence phase-generation frequency of our LC ATS is up to 128 Hz. To our knowledge, it is the highest speed used to generate a turbulence phase in real time.</ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/484_simulator_small.png</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/484_simulator_large.png</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Changchun Institute of Optics, Fine Mechanics and Physics</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2009</ReleaseYear>
      <ReleaseMonth>04</ReleaseMonth>
      <ReleaseDay>17</ReleaseDay>
      <ReleaseDateDisplay>04/17/2009</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="hulifa@ciomp.ac.cn">Lifa Hu</Author>
         <Author email="">Li Xuan</Author>         <Author email="">Dayu Li</Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://www.opticsinfobase.org/DirectPDFAccess/C7F65974-BDB9-137E-CDAD5DFA4848D180_179198.pdf?da=1&amp;id=179198&amp;seq=0&amp;CFID=50857756&amp;CFTOKEN=72234862">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Numerics</ApplicationType>
	<ApplicationType>Science</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Lifa Hu,Li Xuan,Dayu Li,hulifa@ciomp.ac.cn</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>dfaea93f-1724-4e37-b329-5ee4848f3988</GUID>
      <Name>GPU-assisted high-resolution, real-time 3-D shape measurement</Name>
<ShortDescription>This paper describes a Graphics Processing Unit (GPU)-assisted real-time three-dimensional shape measurement system. Our experiments demonstrated that the absolute coordinates calculation and rendering speed of a GPU is more than four times faster than that of a dual CPU workstation with the same graphics card. By implementing the GPU into our system, we realized simultaneous absolute coordinate acquisition, reconstruction and display at 30 frames per second with a resolution of approximately 266K points per frame. Moreover, a 2+1 phase-shifting algorithm was employed to alleviate the measurement error caused by motion. Applications of the system include medical imaging, manufacturing, entertainment, and security.</ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/483_face_small.png</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/483_face_large.png</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Mathematics Department, Harvard University</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2006</ReleaseYear>
      <ReleaseMonth>10</ReleaseMonth>
      <ReleaseDay>02</ReleaseDay>
      <ReleaseDateDisplay>10/02/2006</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp>4</SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="szhang77@gmail.com">Song Zhang</Author>
         <Author email="">Dale Royer</Author>         <Author email="">Shing-Tung Yau</Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://www.opticsinfobase.org/DirectPDFAccess/C7F26E3D-BDB9-137E-C08106FE8B922265_114589.pdf?da=1&amp;id=114589&amp;seq=0&amp;CFID=50857756&amp;CFTOKEN=72234862">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Imaging</ApplicationType>
	<ApplicationType>Numerics</ApplicationType>
	<ApplicationType>Science</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Song Zhang,Dale Royer,Shing-Tung Yau,szhang77@gmail.com</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>6ab09d58-6ea6-4025-8af8-e1925bef8dce</GUID>
      <Name>Computer generated holography</Name>
<ShortDescription>We have applied the graphics processing unit (GPU) to computer generated holograms (CGH) to overcome the high computational cost of CGH and have compared the speed of a GPU implementation to a standard CPU implementation. The calculation speed of a GPU (GeForce 6600, nVIDIA) was found to be about 47 times faster than that of a personal computer with a Pentium 4 processor. Our system can realize real-time reconstruction of a 64-point 3-D object at video rate using a liquid-crystal display of resolution 800x600.</ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/482_computer-generated-hologram_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/482_computer-generated-hologram_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Department of Medical System Engineering Chiba University</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp>47</SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="masudanb@faculty.chiba-u.jp">Nobuyuki Masuda</Author>
         <Author email="">Tomoyoshi Ito</Author>         <Author email="">Takashi Tanaka</Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://www.opticsinfobase.org/DirectPDFAccess/C7ECFB3F-BDB9-137E-C2871E529D1CC8EF_87556.pdf?da=1&amp;id=87556&amp;seq=0&amp;CFID=50857756&amp;CFTOKEN=72234862">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Imaging</ApplicationType>
	<ApplicationType>Numerics</ApplicationType>
	<ApplicationType>Video &amp; Audio</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Nobuyuki Masuda,Tomoyoshi Ito,Takashi Tanaka,masudanb@faculty.chiba-u.jp</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>028ff6b3-3641-497e-8515-37370d59d3c3</GUID>
      <Name>Flow visualization and flow cytometry with holographic video microscopy</Name>
<ShortDescription>The video stream captured by an in-line holographic microscope can be analyzed on a frame-by-frame basis to track individual colloidal particles' three-dimensional motions with nanometer resolution, and simultaneously to measure their sizes and refractive indexes. Through a combination of hardware acceleration and software optimization, this analysis can be carried out in near real time with off-the-shelf instrumentation. An efficient particle identification algorithm automates initial position estimation with sufficient accuracy to enable unattended holographic tracking and characterization. This technique's resolution for particle size is fine enough to detect molecular-scale coatings on the surfaces of colloidal spheres, without requiring staining or fluorescent labeling. We demonstrate this approach to label-free holographic flow cytometry by detecting the binding of avidin to biotinylated polystyrene spheres.</ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/481_laser_small.png</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/481_laser_large.png</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Department of Physics and Center for Soft Matter Research, New York University, New York</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2009</ReleaseYear>
      <ReleaseMonth>07</ReleaseMonth>
      <ReleaseDay>17</ReleaseDay>
      <ReleaseDateDisplay>07/17/2009</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="david.grier@nyu.edu">Fook Chiong Cheong</Author>
         <Author email="">Bo Sun</Author>         <Author email="">Remi Dreyfus</Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://www.opticsinfobase.org/DirectPDFAccess/C7E741C2-BDB9-137E-CFB8F0BC703CFDBD_183673.pdf?da=1&amp;id=183673&amp;seq=0&amp;CFID=50857756&amp;CFTOKEN=72234862">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Imaging</ApplicationType>
	<ApplicationType>Science</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Fook Chiong Cheong,Bo Sun,Remi Dreyfus,david.grier@nyu.edu</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>8dba8b26-2a21-43a9-945b-ec9f04d5ff5d</GUID>
      <Name>A QAP Solver with CUDA GPU Computing Architecture</Name>
<ShortDescription>This application solves the quadratic assignment problem (QAP) [1]. In QAP, we are given l locations and l facilities and the task is to assign the facilities to the locations to minimize the cost. We chose QAP for the following reasons: First, problem sizes of QAPs in real life problems are relatively small compared with other problems in permutation domains such as the traveling salesman problem (TSP) and the scheduling problem. This enables us to use the shared memory of a GPU effectively. Second, QAP is one of the most difficult problems among problems in permutation domains. Thus, QAP is a good test bed to evaluate an optimization algorithm.</ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/480_qap03_small.png</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/480_qap03_large.png</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Graduate School of Science, Osaka Prefecture University</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="fujimoto@mi.s.osakafu-u.ac.jp">Noriyuki Fujimoto</Author>
         <Author email="">Shigeyoshi Tsutsui</Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://www.gpgpgpu.com/gecco2009/9.pdf">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Numerics</ApplicationType>
	<ApplicationType>Science</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Noriyuki Fujimoto,Shigeyoshi Tsutsui,fujimoto@mi.s.osakafu-u.ac.jp</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>ca10a525-1ae8-4b4e-9720-2243074cb32e</GUID>
      <Name>A GPU Accelerated Evolutionary Computer Vision System</Name>
<ShortDescription>We have used the graphics processing unit (GPU) of the graphics card to create an evolutionary image processing system which is able to learn how to detect a user-specified object in an image. The system receives an image sequence as input. The user only has to tell the system where this object is located. This is done by using the mouse pointer. The user simply moves the mouse over the desired object and then presses the mouse button as long as the object is located under the mouse pointer. The user follows this object over several frames while keeping the mouse button pressed. As this is being done, the system evolves a population of image processing algorithms by exploiting the power of the GPU at interactive rates. Our system is the first GPU accelerated evolutionary image processing system (Figure 1) which allows the automatic creation of object detection algorithms [2]. This is the first step towards building fully adaptive evolutionary vision systems [1].</ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/479_ducks_small.png</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/479_ducks_large.png</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Universitat Tubingen</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp>45</SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="marc.ebner@wsii.uni-tuebingen.de">Eberhard Karls</Author>
      </Authors>
      <ContentTypes>
              <ContentType url="http://www.gpgpgpu.com/gecco2009/8.pdf">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Imaging</ApplicationType>
	<ApplicationType>Numerics</ApplicationType>
	<ApplicationType>Science</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Eberhard Karls,marc.ebner@wsii.uni-tuebingen.de</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>49ece6ac-6aa1-492c-8ceb-6a748939c306</GUID>
      <Name>GPU-based Acceleration of the Genetic Algorithm</Name>
<ShortDescription>Genetic algorithms (GAs) are a stochastic optimization method inspired by natural evolution. Because of their parallel nature, they have been parallelized many times. Graphic Processing Units (GPU) were originally targeted for rasterization of graphics primitives. Today GPUs are more likely fast multi-core processors capable of performing complex mathematical tasks. There are many ways to exploit GPUs' potential for general purpose computation (GPGPU). One option is to employ Compute Unified Device Architecture (CUDA) framework.</ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/478_voronoi_knauss_oesterle_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/478_voronoi_knauss_oesterle_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Brno University of Technology, Bozetechova 2</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp>2600</SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="xpospi45@stud.t.vutbr.cz">Petr Pospichal</Author>
         <Author email="">Jiri Jaros</Author>      </Authors>
      <ContentTypes>
            </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Numerics</ApplicationType>
	<ApplicationType>Science</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Petr Pospichal,Jiri Jaros,xpospi45@stud.t.vutbr.cz</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>4aa192d6-70c5-4040-9fa2-7ee690f988dc</GUID>
      <Name>Parallel Ant System for Traveling Salesman Problem</Name>
<ShortDescription>Ant Colony Optimization (ACO) is a meta-heuristic introduced in 1991 by Dorigo et al. on TSP problem (Dorigo, 1992). This algorithm is inspired by the natural behavior of real ants. Ants usually communicate via pheromone trail, i.e. an ant would lay down some amount of pheromone on the passed path. An ant's tendency to choose a specific path is positively correlated to the intensity of trail. The pheromone trail evaporates over time, if no pheromone is laid down by other ants. If many ants lay down pheromone on specific path, the intensity would attract more ants toward this path. Although ACO has outstanding performance on TSP problem, it spends huge execution time in large scale TSP problem. However, ACO has highly parallelizable structure (Talbi, Roux, Fonlupt, &amp; Robillard, 1999; Stutzle, 1998). In this work, we choose NVIDIA's CUDA programming model and Tesla C1060 as platform to implement our Parallel ACO.</ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/477_AntLines_small.png</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/477_AntLines_large.png</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Taiwan Evolutionary Intelligence Laboratory (TEIL) Department of Electrical Engineering, National Taiwan University</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp>21</SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="r97921039@ntu.edu.tw">Ying-Shiuan You</Author>
      </Authors>
      <ContentTypes>
              <ContentType url="http://www.gpgpgpu.com/gecco2009/4.pdf">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Numerics</ApplicationType>
	<ApplicationType>Science</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Ying-Shiuan You,r97921039@ntu.edu.tw</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>8fc9a55c-d66e-4870-8536-634fad8c6d4a</GUID>
      <Name>StarPU</Name>
<ShortDescription>StarPU is a unified runtime system that offers support for heterogeneous multicore architectures (CPUs, GPUs, Cell's SPUs, ...). Its unified execution model is tightly coupled with a high-level data management library and provides a convenient way to develop and tune powerful scheduling algorithms. StarPU therefore makes it possible to actually get the benefits of hybrid systems in a portable fashion.</ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/476_starpu-lu-dag_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/476_starpu-lu-dag_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Research</OrganizationType>
      <OrganizationName>INRIA</OrganizationName>
      <OrganizationURL>http://www.inria.fr</OrganizationURL>
      <ReleaseYear>2009</ReleaseYear>
      <ReleaseMonth>07</ReleaseMonth>
      <ReleaseDay>06</ReleaseDay>
      <ReleaseDateDisplay>07/06/2009</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType>Open source</SoftwareLicenseType>
      <Authors>
         <Author email="cedric.augonnet@inria.fr">Cedric Augonnet</Author>
      </Authors>
      <ContentTypes>
        <ContentType url="http://runtime.bordeaux.inria.fr/StarPU/">Application</ContentType>        <ContentType url="http://runtime.bordeaux.inria.fr/StarPU/">Code</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Libraries</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>runtime system, task scheduling, data management, portability</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>8d5ab3be-6289-4b09-8e1f-db37f6d927b0</GUID>
      <Name>Optimization of Primality Testing Methods</Name>
<ShortDescription>Modern fast primality testing uses a combination of Strong Probable Prime (SPRP) rejection tests. We find more powerful combinations by intensive search of the vast domain of SPRP test configurations. Evolutionary guidance using previous promising results boosts search speed. We implement the entire search on the GPU with the CUDA programming language resulting in 65-time speedup over a CPU search. This project has already found a test an order of magnitude more powerful than the best previously known.</ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/474_rabin_miller_1_small.PNG</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/474_rabin_miller_1_large.PNG</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType></OrganizationType>
      <OrganizationName>Academia</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp>65</SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="comments@worley.com">Steve Worley</Author>
      </Authors>
      <ContentTypes>
              <ContentType url="http://www.gpgpgpu.com/gecco2009/6.pdf">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Numerics</ApplicationType>
	<ApplicationType>Science</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Steve Worley,comments@worley.com</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>b8b898ab-530a-4119-9976-a20a5fdc492b</GUID>
      <Name>Particle Swarm Optimization</Name>
<ShortDescription>The increasing interest of researchers in using low cost GPUs for applications requiring intensive parallel computing is due to the ability of these devices to solve parallelizable problems much faster than traditional sequential processors. The first applications of evolutionary algorithms (EAs) on GPUs have been developed to solve specific image processing problems; at the beginning they were using textures rendering for the encoding and evaluation of individuals and most of the times tasks like pseudo random numbers generation and other evolutionary operations were executed on CPU. This project presents an approach for the implementation of PSO algorithms on GPUs which, by means of the nVIDIA CUDA TM environment, avoids the use of textures as data structures and performs all evolution on the GPU, reducing as much as possible the exchange of data with the CPU.</ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/473_phase_small.png</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/473_phase_large.png</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Dipartimento di Ingegneria dell'Informazione, Universita degli Studi di Parma</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp>50</SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="mussi@ce.unipr.it">Luca Mussi</Author>
         <Author email="cagnoni@ce.unipr.it">Stefano Cagnoni</Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://www.gpgpgpu.com/gecco2009/1.pdf">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Numerics</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Luca Mussi,,Stefano Cagnoni,mussi@ce.unipr.it,cagnoni@ce.unipr.it</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>d23dc8ee-f770-49ab-ab28-abe32d6d2d10</GUID>
      <Name>Video Game Tools Used For Defense Needs</Name>
<ShortDescription>Video gaming computers and video game consoles available today typically contain a graphics processing unit (GPU), which is very efficient at manipulating and displaying computer graphics. However, the unit's highly parallel structure also makes it more efficient than a general-purpose central processing unit for a range of complex calculations important to defense applications.</ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/472_commandandconquer-775336_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/472_commandandconquer-775336_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Georgia Institute of Technology Research News</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2009</ReleaseYear>
      <ReleaseMonth>06</ReleaseMonth>
      <ReleaseDay>24</ReleaseDay>
      <ReleaseDateDisplay>06/24/2009</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp>350</SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="joy.daniell@ap.gatech.edu">Georgia Institute of Technology Research News</Author>
      </Authors>
      <ContentTypes>
              <ContentType url="http://www.huliq.com/11/82678/video-game-tools-used-defense-needs">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Game Physics</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Video Game </Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>1f321cbb-73a3-4321-8858-cf8f5d246fe1</GUID>
      <Name>Using Evolutionary Computing on Consumer Graphics Hardware for Epistasis Analysis in Human Genetics</Name>
<ShortDescription>Biological systems are both complex and robust. Because of this, epistasis, or gene-gene interactions, are thought to be a ubiquitous component of common human diseases. Unfortunately, due to the non-linear nature of these interactions, detecting and characterizing epistasis requires algorithms which are combinatorial in complexity. One such algorithm is Multifactor Dimensionality Reduction (MDR). Expert knowledge guided evolutionary computing wrappers around MDR have previously been shown to be a powerful way to efficiently analyze datasets for interactions. Evolutionary computing can effectively address some of the challenges these datasets present. Unfortunately examining the statistical significance of results requires permutation testing, which increases the computation requirements by a factor of 1000. Here we implement an expert knowledge guided ant system on graphics processing units (GPUs) and show that the GPU implementation makes the rigorous statistical analysis of large datasets practical.</ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/471_karyotype_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/471_karyotype_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Dartmouth Medical School</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2009</ReleaseYear>
      <ReleaseMonth>07</ReleaseMonth>
      <ReleaseDay>24</ReleaseDay>
      <ReleaseDateDisplay>07/24/2009</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="nicholas.a.sinnott-armstrong@dartmouth.edu">Nicholas A. Sinnott-Armstrong</Author>
         <Author email="">Casey S. Greene</Author>         <Author email="">Jason H. Moore</Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://www.vizworld.com/2009/07/gpgpu-accelerated-epistatis-analysis-in-human-genetics/">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Life Sciences</ApplicationType>
	<ApplicationType>Science</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Nicholas A.Sinnott-Armstrong,Casey S. Greene,Jason H. Moore,Epistasis Analysis, Consumer app, human genetics</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>42f1a769-33b3-403b-9fbe-3b0e593fc18a</GUID>
      <Name>High performance discrete Fourier transforms </Name>
<ShortDescription>We present novel algorithms for computing discrete Fourier transforms with high performance on GPUs. We present hierarchical, mixed radix FFT algorithms for both power-of-two and non-power-of-two sizes. Our hierarchical FFT algorithms efficiently exploit shared memory on GPUs using a Stockham formulation. We reduce the memory transpose overheads in hierarchical algorithms by combining the transposes into a block-based multi-FFT algorithm. For non-power-of-two sizes, we use a combination of mixed radix FFTs of small primes and Bluestein's algorithm. We use modular arithmetic in Bluestein's algorithm to improve the accuracy. We implemented our algorithms using the NVIDIA CUDA API and compared their performance with NVIDIA's CUFFT library and an optimized CPU-implementation (Intel's MKL) on a high-end quad-core CPU. On an NVIDIA GPU, we obtained performance of up to 300 GFlops, with typical performance improvements of 2--4x over CUFFT and 8--40x improvement over MKL for large sizes. </ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/470_FourierTransforms_1_new_small.png</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/470_FourierTransforms_1_new_large.png</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Commercial</OrganizationType>
      <OrganizationName>Microsoft Corporation</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp>40</SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="nagag@microsoft.com">Naga K. Govindaraju</Author>
         <Author email="">Brandon Lloyd</Author>         <Author email="">Yuri Dotsenko</Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://portal.acm.org/citation.cfm?id=1413370.1413373&amp;dl=ACM&amp;coll=ACM">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Numerics</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Naga K. Govindaraju,Brandon Lloyd,Yuri Dotsenko,Algorithms, Design, Experimentation, Measurement, Performance </Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>5a9e5a89-919c-4735-872b-a0670bb94480</GUID>
      <Name>How GPUs can outperform ASICs for fast LDPC decoding</Name>
<ShortDescription>Due to huge computational requirements, powerful Low-Density Parity-Check (LDPC) error correcting codes, discovered in the early 1960s, have only recently been adopted by emerging communication standards. LDPC decoders are supported by VLSI technology, which delivers good parallel computational power with excellent throughputs, but at the expense of significant costs. In this work, we propose an alternative flexible LDPC decoder that exploits data-parallelism for simultaneous multicodeword decoding, supported by multithreading on CUDA-based graphics processing units (GPUs). The ratio of arithmetic operations per memory access is low for the efficient min-sum LDPC decoding algorithm proposed, which causes a bottleneck due to memory latency and data collisions. We propose runtime data realignment to allow coalesced parallel memory accesses to be performed by distinct threads inside the same warp. The memory access patterns of LDPC codes are random, which does not admit the simultaneous use of coalescence in both read and write operations of the decoding process. To overcome this problem we have developed a data mapping transformation which allows new addresses to be contiguously accessed for one of the mentioned memory access types. Our implementation shows throughputs above 100Mbps and BER curves that compare well with ASIC solutions. </ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/469_QPPldpcgraph_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/469_QPPldpcgraph_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>University of Coimbra, Coimbra, Portugal </OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="gff@deec.uc.pt">Gabriel Falcao</Author>
         <Author email="">Vitor Silva</Author>         <Author email="">Leonel Sousa </Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://portal.acm.org/citation.cfm?id=1542275.1542330&amp;coll=ACM&amp;dl=ACM&amp;CFID=46384459&amp;CFTOKEN=29518040">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Numerics</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Gabriel Falcao,Leonel Sousa,Vitor Silva</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>24d1eddb-c861-4eae-8746-e0bb6eb9c3f3</GUID>
      <Name>High performance genetic programming </Name>
<ShortDescription>The availability of low cost powerful parallel graphics cards has stimulated the port of Genetic Programming (GP) on Graphics Processing Units (GPUs). Our work focuses on the possibilities offered by Nvidia G80 GPUs when programmed in the CUDA language. We compare two parallelization schemes that evaluate several GP programs in parallel. We show that the fine grain distribution of computations over the elementary processors greatly impacts performances. We also present memory and representation optimizations that further enhance computation speed, up to 2.8 billion GP operations per second. The code has been developed with the well known ECJ library. </ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/468_mutation_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/468_mutation_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Universite Lille Nord de France, Calais, France</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="poty@lil.univ-littoral.fr">Denis Robilliard</Author>
         <Author email="">Virginie Marion</Author>         <Author email="">Cyril Fonlupt </Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://portal.acm.org/citation.cfm?id=1555284.1555299&amp;coll=ACM&amp;dl=ACM&amp;CFID=46384459&amp;CFTOKEN=29518040">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Numerics</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Denis Robilliard,Virginie Marion,Cyril Fonlupt,poty@lil.univ-littoral.fr,genetic algorithms, genetic programming, parallel processing </Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>27faebfe-cded-4c12-b4b5-88c50c12807c</GUID>
      <Name>A game loop architecture for the GPU used as a math coprocessor in real-time applications</Name>
<ShortDescription>This article concerns the use of a graphics processor unit (GPU) as a math co-processor in real-time applications in special games and physics simulations. To validate this approach, we present a new game loop architecture that employs GPUs for general-purpose computations (GPGPUs). A critical issue here is the process distribution between the CPU and the GPU. The architecture consists of a model for distribution, and our implementation offers many advantages in comparison to other approaches without the GPGPU stage. This architecture can be used either by a general-purpose language such as the Compute Unified Device Architecture (CUDA), or shader languages such as the High-Level Shader Language (HLSL) and the OpenGL Shading Language (GLSL). Although the architecture proposed here aims at supporting mathematics and physics on the GPU, it is possible to adapt any kind of generic computation. This article discusses the model implementation in an open-source game engine and presents the results of using this platform. </ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/467_Minna-de-Puzloop-1_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/467_Minna-de-Puzloop-1_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Instituto de Computacao, Universidade Federal Fluminense, Brazil</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="esteban@inf.puc-rio.br">Marcelo P. M. Zamith</Author>
         <Author email="">Esteban W. G. Clua</Author>         <Author email="">Aura Conci</Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://portal.acm.org/citation.cfm?id=1394021.1394035&amp;coll=ACM&amp;dl=ACM&amp;CFID=46384459&amp;CFTOKEN=29518040">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Game Physics</ApplicationType>
	<ApplicationType>Numerics</ApplicationType>
	<ApplicationType>Science</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Marcelo P. M. Zamith,Esteban W. G. Clua,Aura Conci,esteban@inf.puc-rio.br,Game loop, real-time physics </Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>3ac00157-b33b-4de3-87e4-b079e50c6f8a</GUID>
      <Name>A hardware redundancy and recovery mechanism for reliable scientific computation</Name>
<ShortDescription>General purpose computation on graphics processors (GPGPU) has rapidly evolved since the introduction of commodity programmable graphics hardware. With the appearance of GPGPU computation-oriented APIs such as AMD's Close to the Metal (CTM) and NVIDIA's Compute Unified Device Architecture (CUDA), we begin to see GPU vendors putting financial stakes into this non-graphics, one-time niche market. Major supercomputing installations are building GPGPU clusters to take advantage of massively parallel floating point capabilities, and Folding@Home has even released a GPU port of its protein folding distributed computation client. But in order for GPGPU to truly become important to the supercomputing community, vendors will have to address the heretofore unimportant reliability concerns of graphics processors. We present a hardware redundancy-based approach to reliability for general purpose computation on GPUs that requires minimal change to existing GPU architectures. Upon detecting an error, the system invokes an automatic recovery mechanism that only recomputes erroneous results. Our results show that our technique imposes less than a 1.5 x performance penalty and saves energy for GPGPU but is completely transparent to general graphics and does not affect the performance of the games that drive the market. </ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/466_cuda-nbody-example_small.png</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/466_cuda-nbody-example_large.png</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>University of Virginia </OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="skadron@cs.virginia.edu">Jeremy W. Sheaffer</Author>
         <Author email="">David P. Luebke</Author>         <Author email="">Kevin Skadron </Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://portal.acm.org/citation.cfm?id=1280094.1280104&amp;coll=ACM&amp;dl=ACM&amp;CFID=46384459&amp;CFTOKEN=29518040">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Numerics</ApplicationType>
	<ApplicationType>Science</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Jeremy W. Sheaffer,David P. Luebke,Kevin Skadron </Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>04b954f1-d86f-409f-a6ad-d3ac9b072663</GUID>
      <Name>Accelerated Pathfinding</Name>
<ShortDescription>In the past few years the graphics programmable processor (GPU) has evolved into an increasingly convincing computational resource for non graphics applications. The GPU is especially well suited to address problem sets expressed as data parallel computation with the same program executed on many data elements concurrently. In pursuing a scalable navigation planning approach for many thousands of agents in crowded game scenes, developers became more attracted to decomposable movement algorithms that lend to explicit parallelism. Pathfinding is one key computational intelligence action in games that is typified by intense search over sparse graph data structures. This paper describes an efficient GPU implementation of parallel global pathfinding using the CUDA programming environment, and demonstrates GPU performance scale advantage in executing an inherently irregular and divergent algorithm. </ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/465_image006_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/465_image006_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Commercial</OrganizationType>
      <OrganizationName>NVIDIA Corporation</OrganizationName>
      <OrganizationURL>http://www.nvidia.com</OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="ableiweiss@nvidia.com">Avi Bleiweiss</Author>
      </Authors>
      <ContentTypes>
              <ContentType url="http://portal.acm.org/citation.cfm?id=1413957.1413968&amp;coll=ACM&amp;dl=ACM&amp;CFID=46384459&amp;CFTOKEN=29518040">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Numerics</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Avi Bleiweiss,ableiweiss@nvidia.com</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>2b0d90a1-b0dc-4834-833f-c633cdf8bd9b</GUID>
      <Name>BSGP: bulk-synchronous GPU programming</Name>
<ShortDescription>We present BSGP, a new programming language for general purpose computation on the GPU. A BSGP program looks much the same as a sequential C program. Programmers only need to supply a bare minimum of extra information to describe parallel processing on GPUs. As a result, BSGP programs are easy to read, write, and maintain. Moreover, the ease of programming does not come at the cost of performance. A well-designed BSGP compiler converts BSGP programs to kernels and combines them using optimally allocated temporary streams. In our benchmark, BSGP programs achieve similar or better performance than well-optimized CUDA programs, while the source code complexity and programming time are significantly reduced. To test BSGP's code efficiency and ease of programming, we implemented a variety of GPU applications, including a highly sophisticated X3D parser that would be extremely difficult to develop with existing GPU programming languages.</ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/464_6_small.JPG</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/464_6_large.JPG</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Tsinghua University </OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="rforur@microsoft.com">Qiming Hou</Author>
         <Author email="">Kun Zhou</Author>         <Author email="">Baining Guo </Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://portal.acm.org/citation.cfm?id=1399504.1360618&amp;coll=ACM&amp;dl=ACM&amp;CFID=46384459&amp;CFTOKEN=29518040">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Numerics</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Qiming Hou,Kun Zhou,Baining Guo </Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>0294e1a5-493d-432d-a5f8-66ca43222dc6</GUID>
      <Name>High performance discrete Fourier transforms </Name>
<ShortDescription>We present novel algorithms for computing discrete Fourier transforms with high performance on GPUs. We present hierarchical, mixed radix FFT algorithms for both power-of-two and non-power-of-two sizes. Our hierarchical FFT algorithms efficiently exploit shared memory on GPUs using a Stockham formulation. We reduce the memory transpose overheads in hierarchical algorithms by combining the transposes into a block-based multi-FFT algorithm. For non-power-of-two sizes, we use a combination of mixed radix FFTs of small primes and Bluestein's algorithm. We use modular arithmetic in Bluestein's algorithm to improve the accuracy. We implemented our algorithms using the NVIDIA CUDA API and compared their performance with NVIDIA's CUFFT library and an optimized CPU-implementation (Intel's MKL) on a high-end quad-core CPU. On an NVIDIA GPU, we obtained performance of up to 300 GFlops, with typical performance improvements of 2--4x over CUFFT and 8--40x improvement over MKL for large sizes. </ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/463_fc100_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/463_fc100_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Commercial</OrganizationType>
      <OrganizationName>Microsoft Corporation</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp>40</SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="nagag@microsoft.com">Naga K. Govindaraju</Author>
         <Author email="">Brandon Lloyd</Author>         <Author email="">Yuri Dotsenko</Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://portal.acm.org/citation.cfm?id=1413370.1413373&amp;coll=ACM&amp;dl=ACM&amp;CFID=46384459&amp;CFTOKEN=29518040">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Numerics</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Naga K. Govindaraju,Brandon Lloyd,Yuri Dotsenko,Algorithms, Design, Experimentation, Measurement, Performance </Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>a0694fe3-9fac-4d87-9684-16832426b768</GUID>
      <Name>Wave field synthesis for 3D audio: architectural prospectives</Name>
<ShortDescription>In this paper, we compare the architectural perspectives of the Wave Field Synthesis (WFS) 3D-audio algorithm mapped on three different platforms: a General Purpose Processor (GPP), a Graphics Processor Unit (GPU) and a Field Programmable Gate Array (FPGA). Previous related work reveals that, up to now, WFS sound systems are based on standard PCs. However, on one hand, contemporary GPUs consist of many multiprocessors that can process data concurrently. On the other hand, recent FPGAs provide huge level of parallelism, and reasonably high performance potentials, which can be exploited very efficiently by smart designers. Furthermore, new parallel programming environments, such as the Compute Unified Device Architecture (CUDA) from NVidia and the Stream from ATI, give to the researchers full access to the GPU resources. We use the CUDA to map the WFS kernel on a GeForce 8600GT GPU. Additionally, we implement a reconfigurable and scalable hardware accelerator for the same kernel, and map it onto Virtex4 FPGAs. We compare both architectural approaches against a baseline GPP implementation on a Pentium D at 3.4 GHz. Our conclusion is that in highly demanding WFS-based audio systems, a low-cost GeForce 8600GT desktop GPU can achieve a speedup of up to 8x comparing to a modern Pentium D implementation. An FPGA-based WFS hardware accelerator consisting of a single rendering unit (RU), can provide a speedup of up 10x comparing to the Pentium D approach. It can fit into small FPGAs and consumes approximately 3 Watts. Furthermore, cascading multiple RUs into a larger FPGA, can boost processing throughput up to more than two orders of magnitude higher than a GPP-based implementation and an order of magnitude better than a low-cost GPU one. </ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/462_wfs-objetos_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/462_wfs-objetos_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Delft University of Technology, Delft, Netherlands</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp>10</SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="D.Theodoropoulos@tudelft.nl">Dimitris Theodoropoulos</Author>
         <Author email="">Catalin Bogdan Ciobanu</Author>         <Author email="">Georgi Kuzmanov </Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://portal.acm.org/citation.cfm?id=1531743.1531764&amp;coll=ACM&amp;dl=ACM&amp;CFID=46384459&amp;CFTOKEN=29518040">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Numerics</ApplicationType>
	<ApplicationType>Video &amp; Audio</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Dimitris Theodoropoulos,Catalin Bogdan Ciobanu,Georgi Kuzmanov </Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>b74a6976-b873-458c-acfe-6057b5eedf72</GUID>
      <Name>A compiler framework for optimization of affine loop nests</Name>
<ShortDescription>GPUs are a class of specialized parallel architectures with tremendous computational power. The new Compute Unified Device Architecture (CUDA) programming model from NVIDIA facilitates programming of general purpose applications on their GPUs. However, manual development of high-performance parallel code for GPUs is still very challenging. In this paper, a number of issues are addressed towards the goal of developing a compiler framework for automatic parallelization and performance optimization of affine loop nests on GPGPUs: 1) approach to program transformation for efficient data access from GPU global memory, using a polyhedral compiler model of data dependence abstraction and program transformation; 2) determination of optimal padding factors for conflict-minimal data access from GPU shared memory; and 3) model-driven empirical search to determine optimal parameters for unrolling and tiling. Experimental results on a number of kernels demonstrate the effectiveness of the compiler optimization approaches developed. </ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/461_180px-Polytope_model_unskewed.svg_small.png</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/461_180px-Polytope_model_unskewed.svg_large.png</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>The Ohio State University, Columbus, OH, USA </OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="baskaran@cse.ohio-state.edu">Muthu Manikandan Baskaran</Author>
         <Author email="">Uday Bondhugula </Author>         <Author email="">Sriram Krishnamoorthy</Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://portal.acm.org/citation.cfm?id=1375527.1375562&amp;coll=ACM&amp;dl=ACM&amp;CFID=46384459&amp;CFTOKEN=29518040">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Numerics</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Muthu Manikandan Baskaran,Uday Bondhugula,Sriram Krishnamoorthy </Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>bd6d474d-c2bb-423c-b00f-fe9a2fedb280</GUID>
      <Name>Single-particle 3d reconstruction from cryo-electron microscopy images </Name>
<ShortDescription>Single-particle 3D reconstruction from cryo-electron microscopy (cryo-EM) images is a kernel application of biological molecules analysis, as the computational requirement of which is now beyond PetaFlop for a high-resolution 3D structure. In this paper, we quantitatively analyze the workload, computational intensity and memory performance of the application, parallelize it on an emerging multicore architecture GPU-CUDA. Further we apply a percolation technique to decouple computation with memory operations and orchestrate thread-data mapping to reduce the overhead off-chip memory operations. Finally we tested our optimization strategy on a popular open-source package EMAN to GPU-CUDA, which achieves a relative speedup of about 10X to the original CPU-only EMAN. The experimental results also show that the proposed percolation programming greatly improves utilization of memory bandwidth and floating-point units. </ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/460_kouzouseiri_image_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/460_kouzouseiri_image_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Chinese Academy of Science, Beijing, China</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp>10</SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="tgm@ncic.ac.cn">Guangming Tan </Author>
         <Author email="">Ziyu Guo</Author>         <Author email="">Mingyu Chen</Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://portal.acm.org/citation.cfm?id=1542275.1542329&amp;coll=ACM&amp;dl=ACM&amp;CFID=46384459&amp;CFTOKEN=29518040">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Imaging</ApplicationType>
	<ApplicationType>MedicalImaging</ApplicationType>
	<ApplicationType>Life Sciences</ApplicationType>
	<ApplicationType>Science</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Guangming Tan,Ziyu Guo,Mingyu Chen</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>0d938d85-37c4-41f6-8329-fad802f09c5e</GUID>
      <Name>All-pairs shortest-paths for large graphs </Name>
<ShortDescription>The all-pairs shortest-path problem is an intricate part in numerous practical applications. We describe a shared memory cache efficient GPU implementation to solve transitive closure and the all-pairs shortest-path problem on directed graphs for large datasets. The proposed algorithmic design utilizes the resources available on the NVIDIA G80 GPU architecture using the CUDA API. Our solution generalizes to handle graph sizes that are inherently larger than the DRAM memory available on the GPU. Experiments demonstrate that our method is able to significantly increase processing large graphs making our method applicable for bioinformatics, internet node traffic, social networking, and routing problems.</ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/459_2_new_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/459_2_new_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>University of Pennsylvania and Lockheed Martin </OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="kiderj@seas.upenn.edu">Gary J. Katz</Author>
         <Author email="">Joseph T. Kider, Jr </Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://portal.acm.org/citation.cfm?id=1413957.1413966&amp;coll=ACM&amp;dl=ACM&amp;CFID=46384459&amp;CFTOKEN=29518040">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Numerics</ApplicationType>
	<ApplicationType>Science</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Gary J. Katz, Joseph T. Kider, Jr </Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>d8ab1ded-aa91-4d3f-be16-5b13f6a6a1e2</GUID>
      <Name>Program optimization space pruning for a multithreaded GPU</Name>
<ShortDescription>Program optimization for highly-parallel systems has historically been considered an art, with experts doing much of the performance tuning by hand. With the introduction of inexpensive, single-chip, massively parallel platforms, more developers will be creating highly-parallel applications for these platforms, who lack the substantial experience and knowledge needed to maximize their performance. This creates a need for more structured optimization methods with means to estimate their performance effects. Furthermore these methods need to be understandable by most programmers. This paper shows the complexity involved in optimizing applications for one such system and one relatively simple methodology for reducing the workload involved in the optimization process. This work is based on one such highly-parallel system, the GeForce 8800 GTX using CUDA. Its flexible allocation of resources to threads allows it to extract performance from a range of applications with varying resource requirements, but places new demands on developers who seek to maximize an application's performance. We show how optimizations interact with the architecture in complex ways, initially prompting an inspection of the entire configuration space to find the optimal configuration. Even for a seemingly simple application such as matrix multiplication, the optimal configuration can be unexpected. We then present metrics derived from static code that capture the first-order factors of performance. We demonstrate how these metrics can be used to prune many optimization configurations, down to those that lie on a Pareto-optimal curve. This reduces the optimization space by as much as 98% and still finds the optimal configuration for each of the studied applications. </ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/458_deferredshadow_small.png</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/458_deferredshadow_large.png</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName></OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="sryoo@crhc.uiuc.edu">Shane Ryoo</Author>
         <Author email="">Christopher I. Rodrigues </Author>         <Author email="">Sam S. Stone </Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://portal.acm.org/citation.cfm?id=1356058.1356084&amp;coll=ACM&amp;dl=ACM&amp;CFID=46384459&amp;CFTOKEN=29518040">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Numerics</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Shane Ryoo,Christopher I. Rodrigues,Sam S. Stone </Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>018b5db4-b3cb-444f-be6c-778b8517c99b</GUID>
      <Name>Aspects of GPU for general purpose high performance computing</Name>
<ShortDescription>We discuss hardware and software aspects of GPGPU, specifically focusing on NVIDIA cards and CUDA, from the viewpoints of parallel computing. The major weak points of GPU against newest supercomputers are identified and summarized as only four points: large SIMD vector length, small memory, absence of fast L2 cache, and high register spill penalty. As software concerns, we derive an optimal scheduling algorithm for latency hiding of host-device data transfer, and discuss SPMD parallelism on GPUs.</ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/457_GeForce_GTX_280_3qtr_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/457_GeForce_GTX_280_3qtr_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>The University of Tokyo </OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="reiji@is.s.u-tokyo.ac.jp">Reiji Suda </Author>
         <Author email="">Takayuki Aoki </Author>         <Author email="">Shoichi Hirasawa</Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://portal.acm.org/citation.cfm?id=1509633.1509696&amp;coll=ACM&amp;dl=ACM&amp;CFID=46384459&amp;CFTOKEN=29518040">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Numerics</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Reiji Suda,Takayuki Aoki,Shoichi Hirasawa </Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>21306bd4-d5ae-4455-b202-c8bae8a17348</GUID>
      <Name>Software Pipelined Execution of Stream Programs</Name>
<ShortDescription>The StreamIt programming model has been proposed to exploit parallelism in streaming applications on general purpose multi-core architectures. This model allows programmers to specify the structure of a program as a set of filters that act upon data, and a set of communication channels between them. The StreamIt graphs describe task, data and pipeline parallelism which can be exploited on modern Graphics Processing Units (GPUs), as they support abundant parallelism in hardware. In this paper, we describe the challenges in mapping StreamIt to GPUs and propose an efficient technique to software pipeline the execution of stream programs on GPUs. We formulate this problem --- both scheduling and assignment of filters to processors --- as an efficient Integer Linear Program (ILP), which is then solved using ILP solvers. We also describe a novel buffer layout technique for GPUs which facilitates exploiting the high memory bandwidth available in GPUs. The proposed scheduling utilizes both the scalar units in GPU, to exploit data parallelism, and multiprocessors, to exploit task and pipeline parallelism. Further it takes into consideration the synchronization and bandwidth limitations of GPUs, and yields speedups between 1.87X and 36.83X over a single threaded CPU. </ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/456_pipe_small.png</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/456_pipe_large.png</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Supercomputer Education and Research Centre, Indian Institute of Science</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp>37</SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="mjt@csa.iisc.ernet.in">Abhishek Udupa </Author>
         <Author email="">R. Govindarajan</Author>         <Author email="">Matthew J. Thazhuthaveetil </Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://portal.acm.org/citation.cfm?id=1545006.1545070&amp;coll=ACM&amp;dl=ACM&amp;CFID=46384459&amp;CFTOKEN=29518040">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Numerics</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Abhishek Udupa, R. Govindarajan,Matthew J. Thazhuthaveetil,mjt@csa.iisc.ernet.in </Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>ea8c2995-d603-42a2-93f8-633811e8b9c2</GUID>
      <Name>Pervasive massively multithreaded GPU processors</Name>
<ShortDescription>This talk presents an overview of NVIDIA's SIMT architecture and some brief insights on how some CUDA programming paradigms map onto it. A brief history of SIMT is provided to explain how NVIDIA ended up implementing a unified SIMT processor core in its GPUs including how graphics shaders are mapped onto SIMT threads. In addition, a conceptual view of how a SIMT microarchitecture executes threads in parallel is provided. The talk wraps up by describing some pitfalls related to thread synchronization, memory access, and cache management and describes some key problem areas in SIMT programming that NVIDIA would like to address in the future.</ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/455_nvidia_gpu_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/455_nvidia_gpu_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Commercial</OrganizationType>
      <OrganizationName>NVIDIA Corporation, Santa Clara, CA, USA </OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="mshebanow@nvidia.com">Michael C. Shebanow </Author>
      </Authors>
      <ContentTypes>
              <ContentType url="http://portal.acm.org/citation.cfm?id=1531743.1531745&amp;coll=ACM&amp;dl=ACM&amp;CFID=46384459&amp;CFTOKEN=29518040">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Science</ApplicationType>
	<ApplicationType>Numerics</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Michael C. Shebanow , mshebanow@nvidia.com </Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>d1cab6b3-cfb3-4feb-9d02-4517910a5cf0</GUID>
      <Name>A compiler and runtime system for enabling data mining applications</Name>
<ShortDescription>With increasing need for accelerating data mining and scientific data analysis on large data sets, and less chance to improve processor performance by simply increasing clock frequencies, multi-core architectures and accelerators like FPGAs and GPUs have become popular. A recent development in using GPU for general computing has been the release of CUDA (Compute Unified Device Architecture) by NVIDIA. CUDA allows GPU programming with C language-like features, thus easing the development of non-graphics applications on a GPU. However, several challenges still remain in programming the GPUs with CUDA, because CUDA involves explicit parallel programming and management of its complex memory hierarchy, as well as allocating device memory, moving data between CPU and device memory, and specification of thread grid configurations. In this paper, we offer a solution for the programmers to generate CUDA code by specifying the sequential reduction loop(s) with some information about the parameters. With program analysis and code generation, the applications are mapped to a GPU. Several additional optimizations are also performed by the middleware. We have evaluated our system using three popular data mining applications, k-means clustering, EM clustering, and Principal Component Analysis (PCA). The speedup that each of these applications achieve over a sequential CPU version ranges between 20 and 50. </ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/454_data-mining_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/454_data-mining_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>The Ohio State University, Columbus, OH, USA </OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp>50</SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="acmw@cse.ohio-state.edu">Wenjing Ma </Author>
         <Author email="">Gagan Agrawal</Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://portal.acm.org/citation.cfm?id=1504176.1504218&amp;coll=ACM&amp;dl=ACM&amp;CFID=46384459&amp;CFTOKEN=29518040">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Numerics</ApplicationType>
	<ApplicationType>Science</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Wenjing Ma, Gagan Agrawal </Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>9bc5625b-e5f6-4072-a83c-32e59a956b1d</GUID>
      <Name>A control-structure splitting optimization for GPGPU</Name>
<ShortDescription>Control statements in a GPU program such as loops and branches pose serious challenges for the efficient usage of GPU resources because those control statements will lead to the serialization of threads and consequently ruin the occupancy of GPU, that is, the number of threads running concurrently. Unlike traditional vector processing units that are inside a general purpose processor, the GPU cannot leave the control statements to the CPU because fine-grain statement scheduling between GPU and CPU is impossible. We need an effective method to handle the control statements "just in place" on the GPUs. In this paper, we propose novel techniques to transform control statements so that they can be executed efficiently on GPUs. Our techniques smartly increase code redundancy, which might be deemed as "de-optimization" for CPU, to improve the occupancy of a program on GPU and therefore improve performance. We focus our attention on how common programming structures such as loops and branches decrease the occupancy of single kernels and how to counter that. We demonstrate our optimizations on a synthetic benchmark and a complex parallel algorithm, the Lattice Boltzmann Method (LBM). Our results show that these techniques are very efficient and can lead to an increase in occupancy and a drastic improvement in performance compared to non-split version of the programs. </ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/453_fracorg_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/453_fracorg_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>University of Delaware, Newark, USA</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="xli@ece.udel.edu">Snaider Carrillo</Author>
         <Author email="">Jakob Siegal</Author>         <Author email="">Xiaoming Li</Author>      </Authors>
      <ContentTypes>
            </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Numerics</ApplicationType>
	<ApplicationType>Science</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Snaider Carrillo,Jakob Siegal,Xiaoming Li</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>b223f4bf-c0f0-49bc-876f-1b1d7058d9e7</GUID>
      <Name>Massive parallel LDPC decoding </Name>
<ShortDescription>Low-Density Parity-Check (LDPC) codes are powerful error correcting codes (ECC). They have recently been adopted by several data communication standards such as DVB-S2 and WiMax. LDPCs are represented by bipartite graphs, also called Tanner graphs, and their decoding demands very intensive computation. For that reason, VLSI dedicated architectures have been investigated and developed over the last few years. This paper proposes a new approach for LDPC decoding on graphics processing units (GPUs). Efficient data structures and a new algorithm are proposed to represent the Tanner graph and to perform LDPC decoding according to the stream-based computing model. GPUs were programmed to efficiently implement the proposed algorithms by applying data-parallel intensive computing. Experimental results show that GPUs perform LDPC decoding nearly three orders of magnitude faster than modern CPUs. Moreover, they lead to the conclusion that GPUs with their tremendous processing power can be considered as a consistent alternative to state-of-the-art hardware LDPC decoders. </ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/452_ldpc_generation_graph_small.png</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/452_ldpc_generation_graph_large.png</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Instituto de Telecomunicacoes/FCTUC, University of Coimbra, Coimbra, Portugal </OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="gff@deec.uc.pt">Gabriel Falcao</Author>
         <Author email="">Leonel Sousa </Author>         <Author email="">Vitor Silva</Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://portal.acm.org/citation.cfm?id=1345206.1345221&amp;coll=ACM&amp;dl=ACM&amp;CFID=46384459&amp;CFTOKEN=29518040">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Numerics</ApplicationType>
	<ApplicationType>Science</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Gabriel Falcao,Leonel Sousa,Vitor Silva</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>3510667b-e35f-4140-a7dc-c4ff7e95ee68</GUID>
      <Name>Efficient computation of sum-products on GPUs through software-managed cache</Name>
<ShortDescription>We present a technique for designing memory-bound algorithms with high data reuse on Graphics Processing Units (GPUs) equipped with close-to-ALU software-managed memory. The approach is based on the efficient use of this memory through the implementation of a software-managed cache. We also present an analytical model for performance analysis of such algorithms. We apply this technique to the implementation of the GPU-based solver of the sum-product or marginalize a product of functions (MPF) problem, which arises in a wide variety of real-life applications in artificial intelligence, statistics, image processing, and digital communications. Our motivation to accelerate MPF originated in the context of the analysis of genetic diseases, which in some cases requires years to complete on modern CPUs. Computing MPF is similar to computing the chain matrix product of multi-dimensional matrices, but is more difficult due to a complex data-dependent access pattern, high data reuse, and a low compute-to-memory access ratio. Our GPU-based MPF solver achieves up to 2700-fold speedup on random data and 270-fold on real-life genetic analysis datasets on GeForce 8800GTX GPU from NVIDIA over the optimized CPU version on an Intel 2.4GHz Core 2 with a 4MB L2 cache. </ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/451_6763420-0-large_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/451_6763420-0-large_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Technion - Israel Institute of Technology, Haifa, Israel</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp>270</SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="">Mark Silberstein</Author>
         <Author email="">Assaf Schuster</Author>         <Author email="">Dan Geiger</Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://portal.acm.org/citation.cfm?id=1375527.1375572&amp;coll=ACM&amp;dl=ACM&amp;CFID=46384459&amp;CFTOKEN=29518040">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Numerics</ApplicationType>
	<ApplicationType>Life Sciences</ApplicationType>
	<ApplicationType>Science</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Mark Silberstein, Assaf Schuster,Dan Geiger </Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>66160e9c-33cb-4208-9562-05b21eceb571</GUID>
      <Name>Accelerating total variation regularization for matrix-valued images on GPUs</Name>
<ShortDescription>The advent of new matrix-valued magnetic resonance imaging modalities such as Diffusion Tensor Imaging (DTI) requires extensive computational acceleration. Computational acceleration on graphics processing units (GPUs) can make the regularization (denoising) of DTI images attractive in clinical settings, hence improving the quality of DTI images in a broad range of applications. Construction of DTI images consists of direction-specific Magnetic Resonance (MR) measurements. Compared with conventional MR, direction-sensitive acquisition has a lower signal-to-noise ratio (SNR). Therefore, high noise levels often limit DTI imaging. Advanced post-processing of imaging data can improve the quality of estimated tensors. However, the post-processing problem is only made more computationally difficult when considering matrix-valued imaging data. This paper describes the acceleration of a Total Variation regularization method for matrix-valued images, in particular, for DTI images on NVIDIA Quadro FX 5600. The TV regularization of a 3-D image with 128^3 voxels ultimately achieves 266X speedup and requires 1 minute and 30 seconds on the Quadro, while this algorithm on a dual-core CPU completes in more than 3 hours. In this application study we aim to analyze the effect of excessive synchronization, which provides an insight into generally adapting Variational methods to the GPU architecture for other image processing algorithms designed for matrix-valued images. </ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/450_matrix_rose_leaf_3_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/450_matrix_rose_leaf_3_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>University of California, Los Angeles, CA</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp>266</SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="mmoazeni@cs.ucla.edu">Maryam Moazeni</Author>
         <Author email="">Alex Bui</Author>         
         <Author email="">Majid Sarrafzadeh </Author>      
      </Authors>
      <ContentTypes>
              <ContentType url="http://portal.acm.org/citation.cfm?id=1531743.1531765&amp;coll=ACM&amp;dl=ACM&amp;CFID=46384459&amp;CFTOKEN=29518040">Paper</ContentType>      
      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Imaging</ApplicationType>
	<ApplicationType>Science</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Maryam Moazeni,Alex Bui,Majid Sarrafzadeh </Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>ca59d3fd-811a-44aa-bffa-097765bd6b20</GUID>
      <Name>Performance analysis of accelerated image registration using GPGPU</Name>
<ShortDescription>This paper presents a performance analysis of an accelerated 2-D rigid image registration implementation that employs the Compute Unified Device Architecture (CUDA) programming environment to take advantage of the parallel processing capabilities of NVIDIA's Tesla C870 GPU. We explain the underlying structure of the GPU implementation and compare its performance and accuracy against a fast CPU-based implementation. Our experimental results demonstrate that our GPU version is capable of up to 90x speedup with bilinear interpolation and 30x speedup with bicubic interpolation while maintaining a high level of accuracy. This compares favorably to recent image registration studies, but it also indicates that our implementation only reaches about 70% of theoretical peak performance. To analyze our results, we utilize profiling data to identify some of the underlying limitations of CUDA that prohibit peak performance. At the end, we emphasize the need to manage memory resources carefully to fully utilize the GPU and obtain maximum speedup. </ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/449_attention_based_image_registration_saliency_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/449_attention_based_image_registration_saliency_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>University of Notre Dame </OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>01</ReleaseDay>
      <ReleaseDateDisplay>12/01/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp>90</SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="pbui@nd.edu">Peter Bui</Author>
         <Author email="">Jay Brockman</Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://portal.acm.org/citation.cfm?id=1513895.1513900&amp;coll=ACM&amp;dl=ACM&amp;CFID=46384459&amp;CFTOKEN=29518040">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Imaging</ApplicationType>
	<ApplicationType>Science</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Peter Bui,Jay Brockman </Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>b33ef247-efa4-414a-bb2a-ee8f016f096f</GUID>
      <Name>Accelerating advanced MRI reconstructions</Name>
<ShortDescription>Computational acceleration on graphics processing units (GPUs) can make advanced magnetic resonance imaging (MRI) reconstruction algorithms attractive in clinical settings, thereby improving the quality of MR images across a broad spectrum of applications. At present, MR imaging is often limited by high noise levels, significant imaging artifacts, and/or long data acquisition (scan) times. Advanced image reconstruction algorithms can mitigate these limitations and improve image quality by simultaneously operating on scan data acquired with arbitrary trajectories and incorporating additional information such as anatomical constraints. However, the improvements in image quality come at the expense of a considerable increase in computation. This paper describes the acceleration of an advanced reconstruction algorithm on NVIDIA's Quadro FX 5600. Optimizations such as register allocating the voxel data, tiling the scan data, and storing the scan data in the Quadro's constant memory dramatically reduce the reconstruction's required bandwidth to on-chip memory. The Quadro's special functional units provide substantial acceleration of the trigonometric computations in the algorithm's inner loops, and experimentally-tuned code transformations increase the reconstruction's performance by an additional 20%. The reconstruction of a 3D image with 128^3 voxels ultimately achieves 150 GFLOPS and requires less than two minutes on the Quadro, while reconstruction on a quad-core CPU is thirteen times slower. Furthermore, relative to the true image, the error exhibited by the advanced reconstruction is only 12%, while conventional reconstruction techniques incur error of 42%. In short, the acceleration afforded by the GPU greatly increases the appeal of the advanced reconstruction for clinical MRI applications. </ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/448_Img00250_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/448_Img00250_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>University of Illinois at Urbana-Champaign, Urbana, IL, USA</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp>13</SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="ssstone2@crhc.uiuc.edu">Samuel S. Stone</Author>
         <Author email="">Justin P. Haldar </Author>         <Author email="">Stephanie C. Tsao</Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://portal.acm.org/citation.cfm?id=1366230.1366276&amp;coll=ACM&amp;dl=ACM&amp;CFID=46384459&amp;CFTOKEN=29518040">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Imaging</ApplicationType>
	<ApplicationType>MedicalImaging</ApplicationType>
	<ApplicationType>Life Sciences</ApplicationType>
	<ApplicationType>Science</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Samuel S. Stone, Justin P. Haldar, Stephanie C. Tsao,ssstone2@crhc.uiuc.edu</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>51d66e62-d1a3-42b7-9f4b-29ce84e42a20</GUID>
      <Name>GPU acceleration of cutoff pair potentials for molecular modeling applications</Name>
<ShortDescription>The advent of systems biology requires the simulation of ever-larger biomolecular systems, demanding a commensurate growth in computational power. This paper examines the use of the NVIDIA Tesla C870 graphics card programmed through the CUDA toolkit to accelerate the calculation of cutoff pair potentials, one of the most prevalent computations required by many different molecular modeling applications. We present algorithms to calculate electrostatic potential maps for cutoff pair potentials. Whereas a straightforward approach for decomposing atom data leads to low compute efficiency, a newer strategy enables fine-grained spatial decomposition of atom data that maps efficiently to the C870's memory system while increasing work-efficiency of atom data traversal by a factor of 5. The memory addressing flexibility exposed through CUDA's SPMD programming model is crucial in enabling this new strategy. An implementation of the new algorithm provides a greater than threefold performance improvement over our previously published implementation and runs 12 to 20 times faster than optimized CPU-only code. The lessons learned are generally applicable to algorithms accelerated by uniform grid spatial decomposition. </ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/447_imprint_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/447_imprint_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>University of Illinois at Urbana-Champaign, Urbana, IL, USA</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp>20</SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="cirodrig@illinois.edu">Christopher I. Rodrigues</Author>
         <Author email="">David J. Hardy</Author>         <Author email="">John E. Stone</Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://portal.acm.org/citation.cfm?id=1366230.1366277&amp;coll=ACM&amp;dl=ACM&amp;CFID=46384459&amp;CFTOKEN=29518040">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Numerics</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Christopher I. Rodrigues,David J. Hardy,John E. Stone, graphics processors, molecular dynamics </Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>2155f6f6-18cc-479b-85a0-3e96576dff51</GUID>
      <Name>An analytical model for a GPU architecture with memory-level and thread-level parallelism awareness</Name>
<ShortDescription>GPU architectures are increasingly important in the multi-core era due to their high number of parallel processors. Programming thousands of massively parallel threads is a big challenge for software engineers, but understanding the performance bottlenecks of those parallel programs on GPU architectures to improve application performance is even more difficult. Current approaches rely on programmers to tune their applications by exploiting the design space exhaustively without fully understanding the performance characteristics of their applications. To provide insights into the performance bottlenecks of parallel applications on GPU architectures, we propose a simple analytical model that estimates the execution time of massively parallel programs. The key component of our model is estimating the number of parallel memory requests (we call this the memory warp parallelism) by considering the number of running threads and memory bandwidth. Based on the degree of memory warp parallelism, the model estimates the cost of memory requests, thereby estimating the overall execution time of a program. Comparisons between the outcome of the model and the actual execution time in several GPUs show that the geometric mean of absolute error of our model on micro-benchmarks is 5.4% and on GPU computing applications is 13.3%. All the applications are written in the CUDA programming language. </ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/446_figure09_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/446_figure09_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Georgia Institute of Technology, Atlanta, GA</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>01</ReleaseDay>
      <ReleaseDateDisplay>12/01/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="hyesoon@cc.gatech.edu">Sunpyo Hong</Author>
         <Author email="">Hyesoon Kim </Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://portal.acm.org/citation.cfm?id=1555754.1555775&amp;coll=ACM&amp;dl=ACM&amp;CFID=46384459&amp;CFTOKEN=29518040">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Numerics</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Sunpyo Hong, Hyesoon Kim, hyesoon@cc.gatech.edu </Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>d9006f32-0255-4aab-a31d-a8f339088809</GUID>
      <Name>A translation system for enabling data mining applications on GPUs</Name>
<ShortDescription>Modern GPUs offer much computing power at a very modest cost. Even though CUDA and other related recent developments are accelerating the use of GPUs for general purpose applications, several challenges still remain in programming the GPUs. Thus, it is clearly desirable to be able to program GPUs using a higher-level interface. In this paper, we offer a solution that targets a specific class of applications, which are the data mining and scientific data analysis applications. Our work is driven by the observation that a common processing structure, that of generalized reductions, fits a large number of popular data mining algorithms. In our solution, the programmers simply need to specify the sequential reduction loop(s) with some additional information about the parameters. We use program analysis and code generation to map the applications to a GPU. Several additional optimizations are also performed by the system. We have evaluated our system using three popular data mining applications, k-means clustering, EM clustering, and Principal Component Analysis (PCA). The main observations from our experiments are as follows. The speedup that each of these applications achieve over a sequential CPU version ranges between 20 and 50. The automatically generated version did not have any noticeable overheads compared to hand written codes. Finally, the optimizations performed in the system resulted in significant performance improvements. </ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/445_data-mining_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/445_data-mining_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>The Ohio State University, Columbus, OH, USA</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp>50</SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="acmw@cse.ohio-state.edu">Wenjing Ma</Author>
         <Author email="">Gagan Agrawal</Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://portal.acm.org/citation.cfm?id=1542275.1542331&amp;coll=ACM&amp;dl=ACM&amp;CFID=46384459&amp;CFTOKEN=29518040">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Numerics</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Wenjing Ma,Gagan Agrawal</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>96154328-556a-4904-a557-ae73986ce7bc</GUID>
      <Name>Hughes Trainable Text Skimmer: description of the TTS system as used for MUC-3</Name>
<ShortDescription>The objective of the Hughes Trainable Text Skimmer (TTS) Project is to create text skimming software that: (1) can be easily re-configured for new applications, (2) improves its performance with use, and (3) is fast enough to process megabytes of text per day. The TTS-MUC3 system is our first full scale prototype. </ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/444_text-deactivation_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/444_text-deactivation_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Hughes Research Laboratories, Malibu, CA</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="HRLcontracts@hrl.com">Charles P. Dolan</Author>
         <Author email="">Thomas V. Cuda</Author>         <Author email="">Seth R. Goldman</Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://portal.acm.org/citation.cfm?id=1071958.1071985&amp;coll=ACM&amp;dl=ACM&amp;CFID=46384459&amp;CFTOKEN=29518040">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Numerics</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Charles P. Dolan, Thomas V. Cuda,Seth R. Goldman, HRLcontracts@hrl.com</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>68f9f27b-3df1-46c8-aebf-1c79fc6f3a47</GUID>
      <Name>Accelerating linpack with CUDA on heterogenous clusters</Name>
<ShortDescription>This paper describes the use of CUDA to accelerate the Linpack benchmark on heterogenous clusters, where both CPUs and GPUs are used in synergy with minor or no modifications to the original source code. A host library intercepts the calls to DGEMM and DTRSM and executes them simultaneously on both GPUs and CPU cores. An 8U cluster is able to sustain more than a Teraflop using a CUDA accelerated version of HPL. </ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/443_1476-072X-7-57-2-l_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/443_1476-072X-7-57-2-l_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType></OrganizationType>
      <OrganizationName>NVIDIA</OrganizationName>
      <OrganizationURL>http://www.nvidia.com/cuda</OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="">Massimiliano Fatica </Author>
      </Authors>
      <ContentTypes>
              <ContentType url="http://portal.acm.org/citation.cfm?id=1513895.1513901&amp;coll=ACM&amp;dl=ACM&amp;CFID=46384459&amp;CFTOKEN=29518040">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Numerics</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Massimiliano Fatica, mfatica@nvidia.com, </Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>53bada17-5c89-4cf8-8e56-5edee7ba8578</GUID>
      <Name>High-performance CUDA kernel execution on FPGAs</Name>
<ShortDescription>In this work, we propose a new FPGA design flow that combines the CUDA programming model from Nvidia with the state of the art high-level synthesis tool AutoPilot from AutoESL, to efficiently map the exposed parallelism in CUDA kernels onto reconfigurable devices. The use of the CUDA programming model offers the advantage of a common programming interface for exploiting parallelism on two very different types of accelerators -- FPGAs and GPUs. Moreover, by leveraging the advanced synthesis capabilities of AutoPilot we enable efficient exploitation of the FPGA configurability for application specific acceleration. Our flow is based on a compilation process that transforms the SPMD CUDA thread blocks into high-concurrency AutoPilot-C code. We provide an overview of our CUDA-to-FPGA flow and demonstrate the highly competitive performance of the generated multi-core accelerators. </ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/442_fpga_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/442_fpga_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>University of Illinois, Urbana - Champaign, IL, USA</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="apapako2@illinois.edu">Alexandros Papakonstantinou</Author>
         <Author email="">Karthik Gururaj </Author>         
         <Author email="">John A. Stratton</Author>      
      </Authors>
      <ContentTypes>
              <ContentType url="http://portal.acm.org/citation.cfm?id=1542275.1542357&amp;coll=ACM&amp;dl=ACM&amp;CFID=46384459&amp;CFTOKEN=29518040">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Electronic Design Automation</ApplicationType>
	<ApplicationType>Imaging</ApplicationType>
	<ApplicationType>Science</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Alexandros Papakonstantinou,Karthik Gururaj,John A. Stratton</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>56573ff8-9c5d-49b3-a543-759fd0b3dfb8</GUID>
      <Name>A Cross-Input Adaptive Framework for GPU Program Optimizations</Name>
<ShortDescription>This work presents a CUDA program optimizer, named G-ADAPT. It is a tool for helping programmers determine the suitable values of a set of optimization parameters for a CUDA application. It is unique in being adaptive to the influence of program inputs on the application's executions. </ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/441_tjetb_iso_shaded_small.png</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/441_tjetb_iso_shaded_large.png</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>The College of William and Mary</OrganizationName>
      <OrganizationURL>http://www.cs.wm.edu/caps/</OrganizationURL>
      <ReleaseYear>2009</ReleaseYear>
      <ReleaseMonth>05</ReleaseMonth>
      <ReleaseDay>25</ReleaseDay>
      <ReleaseDateDisplay>05/25/2009</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="xshen@cs.wm.edu">Xipeng Shen</Author>
      </Authors>
      <ContentTypes>
              <ContentType url="http://www.cs.wm.edu/~xshen/Publications/ipdps09.pdf">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Programming Tools</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Program Optimizations, empirical search, Cross-input Adaptation, Xipeng Shen</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>b6d01d14-c22a-43ac-bc34-9e0ee006c583</GUID>
      <Name>Optimization principles and application performance evaluation of a multithreaded GPU </Name>
<ShortDescription>GPUs have recently attracted the attention of many application developers as commodity data-parallel coprocessors. The newest generations of GPU architecture provide easier programmability and increased generality while maintaining the tremendous memory bandwidth and computational power of traditional GPUs. This opportunity should redirect efforts in GPGPU research from ad hoc porting of applications to establishing principles and strategies that allow efficient mapping of computation to graphics hardware. In this work we discuss the GeForce 8800 GTX processor's organization, features, and generalized optimization strategies. Key to performance on this platform is using massive multithreading to utilize the large number of cores and hide global memory latency. To achieve this, developers face the challenge of striking the right balance between each thread's resource usage and the number of simultaneously active threads. The resources to manage include the number of registers and the amount of on-chip memory used per thread, number of threads per multiprocessor, and global memory bandwidth. We also obtain increased performance by reordering accesses to off-chip memory to combine requests to the same or contiguous memory locations and apply classical optimizations to reduce the number of executed operations. We apply these strategies across a variety of applications and domains and achieve between a 10.5X to 457X speedup in kernel codes and between 1.16X to 431X total application speedup.</ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/440_comet-connections_small.png</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/440_comet-connections_large.png</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>University of Illinois at Urbana-Champaign, Urbana, IL, USA</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp>431</SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="sryoo@crhc.uiuc.edu">Shane Ryoo</Author>
         <Author email="">Christopher I. Rodrigues</Author>         <Author email="">Sara S. Baghsorkhi</Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://portal.acm.org/citation.cfm?id=1345206.1345220&amp;coll=ACM&amp;dl=ACM&amp;CFID=46384459&amp;CFTOKEN=29518040">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Parallel Algorithms</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Shane Ryoo, Christopher I. Rodrigues, Sara S. Baghsorkhi, GPU computing, parallel computing </Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>222b928f-43be-4e02-a007-a005d2655181</GUID>
      <Name>Bandwidth intensive 3-D FFT kernel</Name>
<ShortDescription>Most GPU performance "hypes" have focused around tightly-coupled applications with small memory bandwidth requirements e.g., N-body, but GPUs are also commodity vector machines sporting substantial memory bandwidth; however, effective programming methodologies thereof have been poorly studied. Our new 3-D FFT kernel, written in NVIDIA CUDA, achieves nearly 80 GFLOPS on a top-end GPU, being more than three times faster than any existing FFT implementations on GPUs including CUFFT. Careful programming techniques are employed to fully exploit modern GPU hardware characteristics while overcoming their limitations, including on-chip shared memory utilization, optimizing the number of threads and registers through appropriate localization, and avoiding low-speed stride memory accesses. Our kernel applied to real applications achieves orders of magnitude boost in power&amp;cost vs. performance metrics. The off-card bandwidth limitation is still an issue, which could be alleviated somewhat with application kernels confinement within the card, while ideal solution being facilitation of faster GPU interfaces. </ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/439_ERGOpage04_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/439_ERGOpage04_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Tokyo Institute of Technology, Tokyo, Japan and Japan Science and Technology Agency, Kawaguchi, Saitama, Japan </OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp>3</SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="nukada@matsulab.is.titech.ac.jp">Akira Nukada</Author>
         <Author email="">Yasuhiko Ogata</Author>         <Author email="">Toshio Endo</Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://portal.acm.org/citation.cfm?id=1413370.1413376&amp;coll=ACM&amp;dl=ACM&amp;CFID=46384459&amp;CFTOKEN=29518040">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Numerics</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Akira Nukada, Yasuhiko Ogata, Toshio Endo, Algorithms, Design, Experimentation, Measurement, Performance </Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>ccaf5379-24f8-488e-a424-c6c223458be2</GUID>
      <Name>A High Performance Agent Based Modelling Framework</Name>
<ShortDescription>We present an efficient implementation of a high performance parallel framework for Agent Based Modelling (ABM), exploiting the parallel architecture of the Graphics Processing Unit (GPU). It provides a mapping between formal agent specifications, with C based scripting, and optimised NVIDIA Compute Unified Device Architecture (CUDA) code. The mapping of agent data structures and agent communication is described, and our work is evaluated through a number of simple interacting agent examples. In contrast with an alternative, single machine CPU implementation, a speedup of up to 250 times is reported. </ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/438_11219696_small.JPG</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/438_11219696_large.JPG</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>University of Sheffield, UK</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp>250</SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="P.Richmond@Sheffield.ac.uk">Paul Richmond </Author>
         <Author email="">Simon Coakley</Author>         
         <Author email="">Daniela M. Romano</Author>      
      </Authors>
      <ContentTypes>
              <ContentType url="http://portal.acm.org/citation.cfm?id=1558109.1558172&amp;coll=ACM&amp;dl=ACM&amp;CFID=46384459&amp;CFTOKEN=29518040">Paper</ContentType>      
      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Parallel Algorithms</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Paul Richmond, Simon Coakley, Daniela M. Romano</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>108e4261-9842-4b48-9266-5445cda7c5df</GUID>
      <Name>Accelerating phase unwrapping and affine transformations for optical quadrature microscopy</Name>
<ShortDescription>Optical Quadrature Microscopy (OQM) is a process which uses phase data to capture information about the sample being studied. OQM is part of an imaging framework developed by the Optical Science Laboratory at Northeastern University. In one particular application of interest, the framework is used to extract phase information from the image of an embryo to determine embryo viability. Phase Unwrapping is the process of reconstructing the real phase shift (propagation delay) of a sample from the measured "wrapped" representation which is between -π and +π. Unwrapping can be done using the Minimum LP Norm Phase Unwrap algorithm. Images are first preprocessed using an Affine Transform before they are unwrapped. Both of these steps are time consuming and would benefit greatly from parallelization and acceleration. Faster processing would lower many research barriers (in terms of throughput and performance) present when using OQM. In this paper we report on accelerating Phase Unwrapping and Affine Transformations using NVIDIA's CUDA programming model. We also run elementary noise removal on the GPU using NVIDIA's CUBLAS (CUDA Basic Linear Algebra Subprograms) library. We integrate GPU execution into a Matlab environment to seamlessly interface to the pre-existing image acquisition system. By mapping the unwrap and noise removal to a GPU, and by also reducing the amount of I/O overhead, we are able to accelerate the end-to-end process by more than 7.3x. This enables our imaging framework to perform high speed image acquisition and visualization at near real-time rates. </ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/437_20060621-QuenchedSi-AFM_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/437_20060621-QuenchedSi-AFM_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Northeastern University, Boston, MA</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp>8</SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="kaeli@ece.neu.edu">Miriam Leeser</Author>
         <Author email="">Sherman Braganza</Author>         
         <Author email="">David Kaeli</Author>
         <Author email="pmistry@ece.neu.edu">Perhaad Mistry</Author>      
      </Authors>
      <ContentTypes>
              <ContentType url="http://portal.acm.org/citation.cfm?id=1513895.1513899&amp;coll=ACM&amp;dl=ACM&amp;CFID=46384459&amp;CFTOKEN=29518040">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>MedicalImaging</ApplicationType>
	<ApplicationType>Life Sciences</ApplicationType>
	<ApplicationType>Science</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword>Perhaad Mistry , Sherman Braganza , David Kaeli, pmistry@ece.neu.edu</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>16bab639-6bea-43bf-bb15-a160a8fb5924</GUID>
      <Name>hiCUDA: a high-level directive-based language</Name>
<ShortDescription>The Compute Unified Device Architecture (CUDA) has become a de facto standard for programming NVIDIA GPUs. However, CUDA places on the programmer the burden of packaging GPU code in separate functions, of explicitly managing data transfer between the host memory and various components of the GPU memory, and of manually optimizing the utilization of the GPU memory. Practical experience shows that the programmer needs to make significant code changes, which are often tedious and error-prone, before getting an optimized program. We have designed hiCUDA, a high-level directive-based language for CUDA programming. It allows programmers to perform these tedious tasks in a simpler manner, and directly to the sequential code. Nonetheless, it supports the same programming paradigm already familiar to CUDA programmers. We have prototyped a source-to-source compiler that translates a hiCUDA program to a CUDA program. Experiments using five standard CUDA benchmarks show that the simplicity and flexibility hiCUDA provides come at no expense to performance. </ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/436_sombrero_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/436_sombrero_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>University of Toronto, Toronto, Ontario, Canada</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>12</ReleaseMonth>
      <ReleaseDay>31</ReleaseDay>
      <ReleaseDateDisplay>12/31/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
         <Author email="director@cms.math.ca">Tianyi David Han</Author>
         <Author email="">Tarek S. Abdelrahman </Author>      </Authors>
      <ContentTypes>
              <ContentType url="http://portal.acm.org/citation.cfm?id=1513895.1513902&amp;coll=ACM&amp;dl=ACM&amp;CFID=46384459&amp;CFTOKEN=29518040">Paper</ContentType>      </ContentTypes>
      <ApplicationTypes>
	<ApplicationType>Parallel Algorithms</ApplicationType>
      </ApplicationTypes>
      <Keywords>
	<Keyword></Keyword>
      </Keywords>
    </Application>

    <Application>
      <!-- Paper entry (see ContentType): molecular dynamics simulations on a GPU with CUDA. -->
      <GUID>b4a32d12-7514-402c-81f6-0f3a1131a030</GUID>
      <Name>Harvesting graphics power for MD simulations</Name>
      <ShortDescription>We discuss an implementation of molecular dynamics (MD) simulations on a graphic processing unit (GPU) in the NVIDIA CUDA language. We tested our code on a modern GPU, the NVIDIA GeForce 8800 GTX. Results for two MD algorithms suitable for short-ranged and long-ranged interactions, and a congruential shift random number generator are presented. The performance of the GPU's is compared to their main processor counterpart. We achieve speedups of up to 80, 40 and 150 fold, respectively. With newest generation of GPU's one can run standard MD simulations at 10^7 flops. </ShortDescription>
      <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/435_math_snap-480_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/435_math_snap-480_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>FOM Institute for Atomic and Molecular Physics, Kruislaan</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2007</ReleaseYear>
      <ReleaseMonth>09</ReleaseMonth>
      <ReleaseDay>01</ReleaseDay>
      <ReleaseDateDisplay>09/01/2007</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp>150</SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
        <!-- The email value carries no trailing whitespace: attribute whitespace is data and would end up in any link built from it. -->
        <Author email="K.v.Meel@amolf.nl">J.A. van Meel</Author>
        <Author email="">A. Arnold</Author>
        <Author email="">D. Frenkel</Author>
      </Authors>
      <ContentTypes>
        <ContentType url="http://arxiv.org/abs/0709.3225">Paper</ContentType>
      </ContentTypes>
      <!-- NOTE(review): Digital Content Creation / Graphics / Imaging look unrelated to the molecular dynamics description above; confirm the categorization. -->
      <ApplicationTypes>
        <ApplicationType>Digital Content Creation</ApplicationType>
        <ApplicationType>Graphics</ApplicationType>
        <ApplicationType>Imaging</ApplicationType>
        <ApplicationType>Science</ApplicationType>
      </ApplicationTypes>
      <Keywords>
        <Keyword>simulations</Keyword>
      </Keywords>
    </Application>

    <Application>
      <!-- Paper entry (see ContentType): CUDA and OpenGL approaches to simulating 2-d spin systems. -->
      <GUID>7bb139aa-5808-4bf2-a89b-2c4666abc8cc</GUID>
      <Name>GPU computing for 2-d spin systems: CUDA vs OpenGL</Name>
      <ShortDescription>In recent years the more and more powerful GPU's available on the PC market have attracted attention as a cost effective solution for parallel (SIMD) computing. CUDA is a solid evidence of the attention that the major companies are devoting to the field. CUDA is a hardware and software architecture developed by Nvidia for computing on the GPU. It qualifies as a friendly alternative to the approach to GPU computing that has been pioneered in the OpenGL environment. We discuss the application of both the CUDA and the OpenGL approach to the simulation of 2-d spin systems (XY model). </ShortDescription>
      <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/434_opengl_small.png</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/434_opengl_large.png</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>University of Parma</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2008</ReleaseYear>
      <ReleaseMonth>11</ReleaseMonth>
      <ReleaseDay>13</ReleaseDay>
      <ReleaseDateDisplay>11/13/2008</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
        <Author email="viola.anselmi@studenti.unipr.it">Viola Anselmi</Author>
        <Author email="">Giovanni Conti</Author>
        <Author email="">Francesco Di Renzo</Author>
      </Authors>
      <ContentTypes>
        <ContentType url="http://arxiv.org/abs/0811.2111">Paper</ContentType>
      </ContentTypes>
      <ApplicationTypes>
        <ApplicationType>Numerics</ApplicationType>
        <ApplicationType>Science</ApplicationType>
      </ApplicationTypes>
      <Keywords>
        <Keyword></Keyword>
      </Keywords>
    </Application>

    <Application>
      <!-- Paper entry (see ContentType): numerical integration of stochastic differential equations with CUDA. -->
      <GUID>45b89c24-5196-4659-aa01-b47994748c78</GUID>
      <Name>Accelerating numerical solution of Stochastic Differential Equations with CUDA</Name>
      <ShortDescription>Numerical integration of stochastic differential equations is commonly used in many branches of science. In this paper we present how to accelerate this kind of numerical calculations with popular NVIDIA Graphics Processing Units using the CUDA programming environment. We address general aspects of numerical programming on stream processors and illustrate them by two examples: the noisy phase dynamics in a Josephson junction and the noisy Kuramoto model. In presented cases the measured speedup can be as high as 675x compared to a standard CPU, which corresponds to several billion integration steps per second. This means that calculations which took weeks can now be completed in less than one hour. This brings stochastic simulation to a completely new level, opening for research a whole new range of problems which can now be solved interactively. </ShortDescription>
      <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/433_numerical_small.png</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/433_numerical_large.png</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Institute of Physics, University of Silesia</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear>2009</ReleaseYear>
      <ReleaseMonth>03</ReleaseMonth>
      <ReleaseDay>23</ReleaseDay>
      <ReleaseDateDisplay>03/23/2009</ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp>675</SpeedUp>
      <SoftwareLicenseType></SoftwareLicenseType>
      <Authors>
        <Author email="mjanusz@us.edu.pl">M. Januszewski</Author>
        <Author email="">M. Kostur</Author>
      </Authors>
      <ContentTypes>
        <ContentType url="http://arxiv.org/abs/0903.3852">Paper</ContentType>
      </ContentTypes>
      <ApplicationTypes>
        <ApplicationType>Numerics</ApplicationType>
        <ApplicationType>Science</ApplicationType>
      </ApplicationTypes>
      <Keywords>
        <!-- Ligatures lost in extraction ("diusion", "stochasticdierential") restored to the intended words. -->
        <Keyword>Josephson junction, Kuramoto, graphics processing unit, advanced computer architecture, numerical integration, diffusion, stochastic differential equation, CUDA, Tesla, NVIDIA</Keyword>
      </Keywords>
    </Application>

    <Application>
      <GUID>17d19b5f-5d93-4db7-87a8-1d58ee75a60b</GUID>
      <Name>An exploration of CUDA and CBEA for a gravitational wave data-analysis application (Einstein@Home)</Name>
<ShortDescription>We present a detailed approach for making use of two new computer hardware architectures -- CBEA and CUDA -- for accelerating a scientific data-analysis application (Einstein@Home). Our results suggest that both the architectures suit the application quite well and the achievable performance in the same software developmental time-frame, is nearly identical.</ShortDescription> 
 <URL></URL>
      <BoxArtImageURLLow>/content/cudazone/CUDABrowser/assets/images/applications/432_96714main_DiskPreBurst_lg_web-1_small.jpg</BoxArtImageURLLow>
      <BoxArtImageURLMed>/content/cudazone/CUDABrowser/assets/images/applications/432_96714main_DiskPreBurst_lg_web-1_large.jpg</BoxArtImageURLMed>
      <BoxArtImageURLHigh></BoxArtImageURLHigh>
      <OrganizationType>Academia</OrganizationType>
      <OrganizationName>Research Group Programming Languages, Methodologies Universitat Kassel</OrganizationName>
      <OrganizationURL></OrganizationURL>
      <ReleaseYear></ReleaseYear>
      <ReleaseMonth></ReleaseMonth>
      <ReleaseDay></ReleaseDay>
      <ReleaseDateDisplay></ReleaseDateDisplay>
      <CompatibleGPU></CompatibleGPU>
      <SpeedUp></SpeedUp>
  