c c Marginal/conditional bivariate density estimator based on gridding and c estimating conditional densities (Simonoff, 1995). Current implementation c has a limit on sample size of 2500, which can be changed by changing the c dimension statement in the calling program, and the value of nmax in the c data statements of the calling program and the subroutine dens1d. Current c limit of the output grid is 500 x 500, which can also be changed by changing c dimension statement in the calling program, and the value of koutmax in c the data statement of the calling program. c c REFERENCE c --------- c c Simonoff, J.S. (1995), "A simple, automatic and adaptive bivariate c density estimator based on conditional densities," c Statistics and Computing, 5, 245--252. c ------------------------ c c c c Calling program c dimension data(2500,2),fx1x2(500,500),outgrd1(500),outgrd2(500), x fx1(500),fx2(500) character*80 infile,outfile,perspfil logical persp,fixout data nmax/2500/,koutmax/500/ c c Input various descriptive parameters for data. c write(*,*) 'Input file for the data?' read(*,5) infile 5 format(a80) write(*,*) 'Output file for the results?' read(*,5) outfile write(*,*) 'Sample size of data set?' read(*,*) n c c The estimate uses two grids. One (K x K) is used to estimate the c required conditional probabilities, while the second (M x M) c determines the values where the final estimate is evaluated. c write(*,*) 'Grid size to use for estimating density?' write(*,*) '(K, the number of rows and columns)' read(*,*) k write(*,*) 'Grid size to use for output?' write(*,*) '(M, where the output grid is M x M)' read(*,*) kout c c The output grid can be input to the program, or it can be chosen c automatically in a data-dependent way. c write(*,*) 'Input output grid (T) or let it be data-dependent ', x '(F)?' read(*,*) fixout if (fixout) then xkout=float(kout) write(*,*) 'Min, max values of output grid for x1?' read(*,*) xmin1,xmax1 write(*,*) 'Min, max values of output grid for x2?' read(*,*) xmin2,xmax2 ox1h=(xmax1-xmin1)/(xkout-2.) ox1h2=ox1h/2. do 10 i=1,kout outgrd1(i)=xmin1-ox1h2+float(i-1)*ox1h 10 continue ox2h=(xmax2-xmin2)/(xkout-2.) ox2h2=ox2h/2. do 15 i=1,kout outgrd2(i)=xmin2-ox2h2+float(i-1)*ox2h 15 continue end if c c By default, the final estimate is output to a file in "scatter plot" c form, so that the numbers are in the form that would be presented on c a scatter plot. In addition, the values can be output to another file c in a form that can be used (through the "interp" function) as input c to the "persp" and "contour" functions in S and S-PLUS. c write(*,*) 'Output density estimate to a file for perspective ', x 'or contour plotting? (enter T for yes, F for no)' read(*,*) persp if (persp) then write(*,*) 'Name of output file for perspective output?' write(*,*) '(Output will be in [x1, x2, f(x1, x2)] form)' read(*,5) perspfil open(30,file=perspfil,status='unknown') end if c c Data is read in, and the density estimation routine is called. c open(19,file=infile,status='old') open(20,file=outfile,status='unknown') write(20,20) infile 20 format('Input data file is ',a80) write(20,*) ' ' do 30 i=1,n read(19,*) (data(i,j),j=1,2) 30 continue call bivar(data,n,nmax,k,kout,fx1,fx2,fx1x2,koutmax,fixout, x outgrd1,outgrd2) write(20,*) 'Marginal density estimates for x1,x2' do 100 i=1,kout write(20,*) outgrd1(i),fx1(i),outgrd2(i),fx2(i) 100 continue write(20,*) write(20,*) write(20,*) 'Marginal/conditional bivariate density estimate ', x 'in scatter plot form' do 120 i=1,kout write(20,110) (fx1x2(j,kout-i+1),j=1,kout) 110 format(502(g10.5,1x)) 120 continue if (persp) then do 130 i=1,kout do 130 j=1,kout write(30,110) outgrd1(i),outgrd2(j),fx1x2(i,j) 130 continue end if stop end subroutine bivar(data,n,nmax,k,kout,fx1,fx2,fx1x2,koutmax,fixout, x outgrd1,outgrd2) c c Subroutine to calculate bivariate density estimate based on conditional c densities. The grid used to form the conditional densities has equal c sized bin widths. Output matrix fx1x2(i,j) is the estimated value for the c i-th x1 grid point paired with the j-th x2 grid point. The code restricts c the grid that is used to form the conditional probability esitmates to be c no more than 100 x 100, which is much larger than should ever be needed. c parameter (lnmax=2500,lkoutmax=500,lkmax=100) dimension data(nmax,2),x11(lnmax),x12(lnmax),x21(lnmax), x x22(lnmax),gridx1(lkmax),gridx2(lkmax),fx1(kout),fx2(kout), x fx1gx2(lkmax,lkoutmax),fx2gx1(lkoutmax,lkmax),outgrd1(kout), x outgrd2(kout),fx1x2(koutmax,koutmax),tx(lnmax),tfx(lnmax) logical fixout,zero1(lkoutmax),zero2(lkoutmax) xk=float(k) xkout=float(kout) xn=float(n) eps=.01 c c The needed grids are formed. c do 10 i=1,n x11(i)=data(i,1) x12(i)=x11(i) x21(i)=data(i,2) x22(i)=x21(i) 10 continue call sort2(n,x11,x21) x1h=(x11(n)-x11(1)+eps)/xk do 20 i=1,k gridx1(i)=x11(1)+float(i)*x1h 20 continue call sort2(n,x22,x12) x2h=(x22(n)-x22(1)+eps)/xk do 30 i=1,k gridx2(i)=x22(1)+float(i)*x2h 30 continue if (fixout) go to 55 ox1h=(x11(n)-x11(1))/(xkout-2.) ox1h2=ox1h/2. do 40 i=1,kout outgrd1(i)=x11(1)-ox1h2+float(i-1)*ox1h 40 continue ox2h=(x22(n)-x22(1))/(xkout-2.) ox2h2=ox2h/2. do 50 i=1,kout outgrd2(i)=x22(1)-ox2h2+float(i-1)*ox2h 50 continue c c Marginal densities are estimated. c 55 call dens1d(x11,n,outgrd1,fx1,kout) call dens1d(x22,n,outgrd2,fx2,kout) c c k=1 corresponds to the independence density estimate. c if (k.eq.1) then do 57 i=1,kout do 57 j=1,kout 57 fx1x2(j,i)=fx1(j)*fx2(i) return end if c c Conditional density estimates are determined. c i=1 j=0 do 100 l=1,k 60 if (x11(i).gt.gridx1(l)) go to 70 j=j+1 tx(j)=x21(i) i=i+1 if (i.gt.n) then go to 70 else go to 60 end if 70 if (j.lt.4) then zero1(l)=.true. else zero1(l)=.false. call sort(tx,j) call dens1d(tx,j,outgrd2,tfx,kout) do 80 m=1,kout fx2gx1(m,l)=tfx(m) 80 continue j=0 end if 100 continue i=1 j=0 do 200 l=1,k 160 if (x22(i).gt.gridx2(l)) go to 170 j=j+1 tx(j)=x12(i) i=i+1 if (i.gt.n) then go to 170 else go to 160 end if 170 if (j.lt.4) then zero2(l)=.true. else zero2(l)=.false. call sort(tx,j) call dens1d(tx,j,outgrd1,tfx,kout) do 180 m=1,kout fx1gx2(l,m)=tfx(m) 180 continue j=0 end if 200 continue c c If a row (column) is empty, the conditional density estimate is the c average of the estimates for the neighboring rows (or columns). c do 230 l=2,k-1 if (zero1(l)) then do 210 m=1,kout fx2gx1(m,l)=(fx2gx1(m,l-1)+fx2gx1(m,l+1))/2. 210 continue end if if (zero2(l)) then do 220 m=1,kout fx1gx2(l,m)=(fx1gx2(l-1,m)+fx1gx2(l+1,m))/2. 220 continue end if 230 continue if (zero1(1)) then do 240 m=1,kout fx2gx1(m,1)=fx2gx1(m,2)/2. 240 continue end if if (zero1(k)) then do 250 m=1,kout fx2gx1(m,k)=fx2gx1(m,k-1)/2. 250 continue end if if (zero2(1)) then do 260 m=1,kout fx1gx2(1,m)=fx2gx1(2,m)/2. 260 continue end if if (zero2(k)) then do 270 m=1,kout fx1gx2(k,m)=fx2gx1(k-1,m)/2. 270 continue end if c c Final density estimate is formed. c igr2=1 do 320 i=1,kout 280 if (outgrd2(i).gt.gridx2(igr2)) then igr2=igr2+1 if (igr2.gt.k) then igr2=k go to 290 end if go to 280 end if 290 igr1=1 do 320 j=1,kout 300 if (outgrd1(j).gt.gridx1(igr1)) then igr1=igr1+1 if (igr1.gt.k) then igr1=k go to 310 end if go to 300 end if 310 fx1x2(j,i)=sqrt(fx1gx2(igr2,j)*fx2gx1(i,igr1)*fx1(j)*fx2(i)) 320 continue return end subroutine sort(ra,n) c c Subroutine to sort entries 1 through n in ascending order from a c vector ra, using heapsort c dimension ra(n) l=n/2+1 ir=n 10 if (l.gt.1) then l=l-1 rra=ra(l) else rra=ra(ir) ra(ir)=ra(1) ir=ir-1 if (ir.eq.1) then ra(1)=rra return end if end if i=l j=l+l 20 if (j.le.ir) then if (j.lt.ir) then if (ra(j).lt.ra(j+1)) j=j+1 end if if (rra.lt.ra(j)) then ra(i)=ra(j) i=j j=j+j else j=ir+1 end if go to 20 end if ra(i)=rra go to 10 end subroutine sjeqd(n,x,lambda,hsjd) C C This subroutine calculates the Sheather-Jones C "solve-the-equation" bandwidth. C Double precision is used throughout. C IMPORTANT INPUT VARIABLES C n = sample size C x = vector containing the data C lambda = interquartile range (iqr) C IMPORTANT OUTPUT VARIABLES C hsjd = value of Sheather-Jones bandwidth C OTHER IMPORTANT VARIABLES C hnml = bandwidth based on iqr assuming data is normal C (used as a starting value in this subroutine) C chat = estimate of constant c in bandwidth a C implicit double precision (a-h,o-z) double precision x(n),lambda,l2,k1,k2,k3 parameter(tola=1.0d-5,tolg=1.0d-4,loop=15) xn = dble(n) k3 = (xn - 1.d0)/dsqrt(2.d0) threen = 3.d0*xn pi = 3.141592654d0 rt2pi = dsqrt(2.d0*pi) hnml = 0.79d0*lambda/(xn**0.2) l2=lambda*lambda C *** ESTIMATE VALUE OF c *** ac = (0.920d0*lambda)/(xn**(1.d0/7.d0)) bc = (0.912d0*lambda)/(xn**(1.d0/9.d0)) sa = 0.d0 sb = 0.d0 do 2 i = 1, n-1 do 3 j = i+1, n da = ((x(i) - x(j))/ac)**2 db = ((x(i) - x(j))/bc)**2 ea = dexp(-0.5d0*da) eb = dexp(-0.5d0*db) sa = sa + (da**2 - 6.d0*da + 3.d0)*ea sb = sb + (db**3 - 15.d0*db**2 + 45.d0*db - 15.d0)*eb 3 continue 2 continue rhat2 = (2.d0*sa)/(xn*(xn-1.d0)*(ac**5)*rt2pi) rhat2 = rhat2 + 3.d0/(rt2pi*(xn-1.d0)*(ac**5)) rhat3 = (-2.d0*sb)/(xn*(xn-1.d0)*(bc**7)*rt2pi) rhat3 = rhat3 + 15.d0/(rt2pi*(xn-1.d0)*(bc**7)) chat = 1.357d0*((dabs(rhat2/rhat3))**(1.d0/7.d0)) chat7 = chat**7.d0 C *** USE NEWTON-RAPHSON METHOD TO CALCULATE h *** C INITIAL VALUES FOR h AND a firsth = hnml ainit = chat*(firsth**(5.d0/7.d0)) 99 a = 1.5d0*ainit do 4 m=1,loop s=0.d0 t=0.d0 do 5 i=1,n-1 do 5 j=i+1,n y = (x(i)-x(j))/a y2 = y*y e = dexp(-0.5d0*y2) s = s + (y2*y2 - 6.d0*y2 + 3.d0)*e t = t + y2*(y2*y2-10.d0*y2+15.d0)*e 5 continue t = 2.0d0 * t s = 2.0d0 * s + threen k1 = a*a/chat7 k1 = dsign(dabs(k1)**0.2d0,k1) k2 = k3/s k2 = dsign(dabs(k2)**0.2d0,k2) gprime = -0.2d0*(k2*(t/s-5.0d0) + 7.0d0*k1) g = a*(k2-k1) anew = a - g/gprime C CALCULATE h FROM a k1 = anew*anew/chat7 k1 = dsign(dabs(k1)**0.2d0,k1) h=k1*anew if(anew.le.0.0d0)goto 98 if(dabs(anew-a).lt.tola)goto100 if(dabs(g).lt.tolg)goto100 a=anew 4 continue 98 write(*,910) ainit = 1.5d0*ainit goto 99 100 continue hsjd = h 910 format(' NEWTON`S METHOD UNABLE TO FIND SOLUTION TO sjeqd') return end function xker(x) xker=exp(-x*x/2.)/2.506628275 return end subroutine dens1d(xx,n,grid,est,k) c c Determination of univariate density estimate. The code is written c assuming that the estimate is calculated using the IMSL routine deskn. c Any alternative code to calculate a univariate kernel estimate can be c used instead. It should take the form c c subroutine deskn(xker,n,x,h,t,k,grid,est,it) c c where the kernel function to be used is the function "xker" (which was given c previously as the Gaussian kernel), the sample size is n, the vector of c data values is x, the smoothing parameter is h, and the estimate is c evaluated at the k values given in the vector "grid", and output to the c vector "est". The values t and it are not used. c parameter(nmax=2500) dimension xx(n),grid(k),est(k) double precision x(nmax),xiqr,hsjd external xker xiqr=dble(xx(3*n/4)-xx(n/4)) call sjeqd(n,x,xiqr,hsjd) c c The smoothing parameter actually used is 2.5 times the Sheather-Jones c value. c hsj=hsjd*2.5 call deskn(xker,n,xx,hsj,-1.,k,grid,est,nmiss) return end SUBROUTINE SORT2(N,RA,RB) c c Subroutine to sort entries 1 through n in ascending order from a c vector ra, carrying along the entries in a vector rb c DIMENSION RA(N),RB(N) L=N/2+1 IR=N 10 CONTINUE IF(L.GT.1)THEN L=L-1 RRA=RA(L) RRB=RB(L) ELSE RRA=RA(IR) RRB=RB(IR) RA(IR)=RA(1) RB(IR)=RB(1) IR=IR-1 IF(IR.EQ.1)THEN RA(1)=RRA RB(1)=RRB RETURN ENDIF ENDIF I=L J=L+L 20 IF(J.LE.IR)THEN IF(J.LT.IR)THEN IF(RA(J).LT.RA(J+1))J=J+1 ENDIF IF(RRA.LT.RA(J))THEN RA(I)=RA(J) RB(I)=RB(J) I=J J=J+J ELSE J=IR+1 ENDIF GO TO 20 ENDIF RA(I)=RRA RB(I)=RRB GO TO 10 END