Updated on September 10, 2022.
No responsibility is taken for the correctness or completeness of the information presented in this article.
The fact that the Java language (unlike C, C++, and C#) does not support stack-allocated objects is often a source of concern for programmers who want to write efficient programs. A use case that is frequently discussed in this context is operations on data structures that produce intermediate results, which are naturally represented in Java as new objects.
Below is a concrete example extracted from the source code of a (very) simple ray tracer. It consists of a minimal implementation of a 3D vector:
public final class Vector3 {
    public double x, y, z;

    public Vector3(double x, double y, double z) {
        this.x = x; this.y = y; this.z = z;
    }

    public Vector3 subtract(Vector3 v) {
        return new Vector3(x - v.x, y - v.y, z - v.z);
    }

    public double lengthSquared() {
        return x*x + y*y + z*z;
    }

    public double dotprod(Vector3 v) {
        return x*v.x + y*v.y + z*v.z;
    }
}
Objects representing light rays:
public class Ray {
    public final Vector3 origin;
    public final Vector3 dir;

    // Constructor added here so the final fields compile;
    // the original excerpt omits it.
    public Ray(Vector3 origin, Vector3 dir) {
        this.origin = origin;
        this.dir = dir;
    }
}
And finally a sphere object:
public class Sphere extends RealObject {
    private final Vector3 position;
    private final double radius2; // = radius^2

    // Constructor added here so the final fields compile;
    // the original excerpt omits it.
    public Sphere(Vector3 position, double radius) {
        this.position = position;
        this.radius2 = radius * radius;
    }

    // Returns the distance to the collision between ray and sphere.
    // The result is negative if there is no collision in front of us.
    public double calculateCollisionDistance(Ray ray) {
        final Vector3 offset = position.subtract(ray.origin);
        final double b = offset.dotprod(ray.dir);
        double d = radius2 - offset.lengthSquared() + b*b;
        if (d < 0.0) {
            return d; // sphere missed
        }
        // find closest intersection in front of us
        d = Math.sqrt(d);
        double dist = b - d;
        if (dist < 0.0) {
            dist = b + d;
        }
        return dist;
    }
}
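To make the collision math concrete, here is a minimal, self-contained sketch of these classes in use. The `RealObject` base class is omitted, the constructors are assumptions (the article only shows the collision method), and the demo passes origin and direction directly instead of a `Ray` object to keep the sketch short:

```java
// Self-contained sketch of the collision test above.
// RealObject base class omitted; constructors are assumed.
public class CollisionDemo {
    static final class Vector3 {
        final double x, y, z;
        Vector3(double x, double y, double z) { this.x = x; this.y = y; this.z = z; }
        Vector3 subtract(Vector3 v) { return new Vector3(x - v.x, y - v.y, z - v.z); }
        double lengthSquared() { return x*x + y*y + z*z; }
        double dotprod(Vector3 v) { return x*v.x + y*v.y + z*v.z; }
    }

    static final class Sphere {
        final Vector3 position;
        final double radius2; // = radius^2
        Sphere(Vector3 position, double radius) {
            this.position = position;
            this.radius2 = radius * radius;
        }

        double calculateCollisionDistance(Vector3 origin, Vector3 dir) {
            final Vector3 offset = position.subtract(origin); // allocates a temporary Vector3
            final double b = offset.dotprod(dir);
            double d = radius2 - offset.lengthSquared() + b*b;
            if (d < 0.0) return d; // sphere missed
            d = Math.sqrt(d);
            double dist = b - d;
            if (dist < 0.0) dist = b + d;
            return dist;
        }
    }

    public static void main(String[] args) {
        // Sphere of radius 2 centered 10 units ahead; ray looks straight down +z.
        Sphere sphere = new Sphere(new Vector3(0, 0, 10), 2);
        double dist = sphere.calculateCollisionDistance(
                new Vector3(0, 0, 0), new Vector3(0, 0, 1));
        System.out.println(dist); // hits the front of the sphere at 10 - 2 = 8.0
    }
}
```

A renderer would call `calculateCollisionDistance` once per ray and per sphere, so the temporary created by `subtract` is exactly the kind of short-lived object the rest of this article is about.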
People familiar with ray tracing know that a method like calculateCollisionDistance can be called millions of times during the rendering of an image, which also means millions of new Vector3 objects allocated by the subtract method of the Vector3 class. Actually, this is not entirely true…
OpenJDK’s HotSpot JIT compiler uses an approach called tiered compilation. Without going into the details, the result of this is that a “hot” method like calculateCollisionDistance will be compiled, after several intermediate steps, to optimized machine code at runtime. Here is the machine code (in Intel syntax) produced by the HotSpot C2 compiler on the x64 architecture for calculateCollisionDistance (comments by me):
# Parameters:
# this: rsi
# ray: rdx
# Returns result in xmm0
mov DWORD PTR [rsp-0x14000],eax
push rbp
sub rsp,0x10
mov r11d,DWORD PTR [rdx+0xc] ; r11 = ray.origin
mov r10d,DWORD PTR [rsi+0xc] ; r10 = this.position
nop DWORD PTR [rax+rax*1+0x0]
data16 data16 xchg ax,ax ; (just for alignment)
vmovsd xmm0,QWORD PTR [r10+0x10] ; get position.x
test r11d,r11d
je label1
mov r8d,DWORD PTR [rdx+0x10] ; r8 = ray.dir
test r8d,r8d
je label2
vsubsd xmm0,xmm0,QWORD PTR [r11+0x10]
vmulsd xmm1,xmm0,QWORD PTR [r8+0x10]
vmovsd xmm6,QWORD PTR [rsi+0x20]
vmulsd xmm0,xmm0,xmm0
vmovsd xmm2,QWORD PTR [r10+0x18]
vsubsd xmm2,xmm2,QWORD PTR [r11+0x18]
vmulsd xmm3,xmm2,QWORD PTR [r8+0x18]
vmovsd xmm4,QWORD PTR [r10+0x20]
vsubsd xmm4,xmm4,QWORD PTR [r11+0x20] ; offset =
; position.subtract(ray.origin)
vmulsd xmm5,xmm4,QWORD PTR [r8+0x20]
vaddsd xmm1,xmm1,xmm3
vmulsd xmm3,xmm4,xmm4
vaddsd xmm1,xmm1,xmm5 ; b = offset.dotprod(ray.dir)
vmulsd xmm2,xmm2,xmm2
vmulsd xmm4,xmm1,xmm1
vaddsd xmm0,xmm0,xmm2
vaddsd xmm0,xmm0,xmm3
vsubsd xmm0,xmm6,xmm0
vaddsd xmm0,xmm0,xmm4 ; d = radius2 - offset.lengthSquared() + b*b
vxorpd xmm2,xmm2,xmm2
data16 xchg ax,ax ; (just for alignment)
vucomisd xmm2,xmm0 ; if(d<0.0)
jbe dIsGreaterOrEqual0
returnResult:
add rsp,0x10
pop rbp
cmp rsp,QWORD PTR [r15+0x340]
ja label3
ret
dIsGreaterOrEqual0:
vsqrtsd xmm0,xmm0,xmm0 ; d = sqrt(d)
vsubsd xmm3,xmm1,xmm0 ; dist = b - d
vucomisd xmm2,xmm3 ; if(dist<0.0)
ja distIsLessThan0
returnDist:
vmovapd xmm0,xmm3
jmp returnResult
distIsLessThan0:
vaddsd xmm3,xmm1,xmm0 ; dist = b + d
jmp returnDist
# Not shown here:
# Code for JIT and GC (label1, label2, label3,...)
As can be seen, the compiler makes intensive use of the XMM registers to store intermediate results. In fact, no Vector3 object is created to hold the result of the subtraction operation!
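This optimization is known as escape analysis with scalar replacement: C2 proves that the temporary Vector3 never escapes the method and replaces the allocation with its individual fields held in registers. You can experiment with it yourself using HotSpot flags like the ones below (the class name RayTracer is a placeholder; printing the generated assembly additionally requires the hsdis disassembler plugin to be installed):

```shell
# Dump the JIT-generated machine code (needs the hsdis plugin):
java -XX:+UnlockDiagnosticVMOptions -XX:+PrintAssembly RayTracer

# Turn escape analysis off; the temporaries are now really allocated,
# which shows up as extra GC activity:
java -XX:-DoEscapeAnalysis RayTracer
```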
Let’s see what machine code a C compiler produces. Here is the C source with manually inlined subtract and dot-product operations:
#include <math.h> /* for sqrt */

struct Vector3 {
    double x, y, z;
};

struct Ray {
    struct Vector3 origin, dir;
};

struct Sphere {
    struct Vector3 pos;
    double radius2;
};

double getCollisionDistance(struct Sphere *sphere, struct Ray *ray) {
    double ox = sphere->pos.x - ray->origin.x;
    double oy = sphere->pos.y - ray->origin.y;
    double oz = sphere->pos.z - ray->origin.z;
    double b = ox*ray->dir.x + oy*ray->dir.y + oz*ray->dir.z;
    double d = sphere->radius2 - (ox*ox + oy*oy + oz*oz) + b*b;
    if (d < 0.0) {
        return d;
    }
    d = sqrt(d);
    double dist = b - d;
    if (dist < 0.0) {
        dist = b + d;
    }
    return dist;
}
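To reproduce the experiment, the C source can be compiled straight to Intel-syntax assembly; a command along these lines should work (the file name sphere.c is an assumption):

```shell
# Emit Intel-syntax assembly into sphere.s instead of an object file:
gcc -O3 -msse2 -mavx -ffast-math -S -masm=intel -o sphere.s sphere.c
```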
When compiled with gcc version 12.2 and the options -O3 -msse2 -mavx -ffast-math, we get the following machine code for the getCollisionDistance function (comments by me):
vmovsd xmm1, QWORD PTR [rdi]
vmovsd xmm3, QWORD PTR [rdi+8]
vsubsd xmm1, xmm1, QWORD PTR [rsi]
vsubsd xmm3, xmm3, QWORD PTR [rsi+8]
vmulsd xmm2, xmm1, QWORD PTR [rsi+24]
vmovsd xmm0, QWORD PTR [rdi+16]
vmulsd xmm4, xmm3, QWORD PTR [rsi+32]
vsubsd xmm0, xmm0, QWORD PTR [rsi+16]
vmulsd xmm1, xmm1, xmm1
vmulsd xmm3, xmm3, xmm3
vaddsd xmm2, xmm2, xmm4
vmulsd xmm4, xmm0, QWORD PTR [rsi+40]
vaddsd xmm1, xmm1, xmm3
vaddsd xmm2, xmm2, xmm4
vmulsd xmm4, xmm0, xmm0
vmovsd xmm0, QWORD PTR [rdi+24]
vsubsd xmm0, xmm0, xmm4
vsubsd xmm0, xmm0, xmm1
vmulsd xmm1, xmm2, xmm2
vaddsd xmm0, xmm0, xmm1
vxorpd xmm1, xmm1, xmm1
vcomisd xmm1, xmm0
ja .L1
vsqrtsd xmm3, xmm0, xmm0
vsubsd xmm0, xmm2, xmm3
vaddsd xmm2, xmm2, xmm3
vcmpltsd xmm1, xmm0, xmm1
vblendvpd xmm0, xmm0, xmm2, xmm1 # avoids the jump for the second if
.L1:
ret
This looks quite similar to the code generated by the Java JIT compiler. An important difference, maybe even the most important one, between the Java and C versions is that C and C++ allow embedding the Vector3 objects directly into the Ray and Sphere objects, which saves a memory access per vector and improves locality. This can have a significant impact on the speed and memory consumption of an application.
Nevertheless, it’s interesting to see how far the JIT compiler’s optimizations go. Of course, this doesn’t mean that Java is the perfect language for high-performance computing. In addition to the differences mentioned above, C and C++ offer more opportunities to tune your code, for example by using SIMD intrinsics (but see also Java’s JEP 417, the Vector API).