From: Francis Russell Date: Fri, 15 Feb 2013 17:51:54 +0000 (+0000) Subject: Add generated code from the Scala OFC implementation. X-Git-Url: https://git.unchartedbackwaters.co.uk/w/?a=commitdiff_plain;h=b96395bc4dfe38337e91d5c1bcce3e237939d1e7;p=francis%2Fpsl_presentation_20130225.git Add generated code from the Scala OFC implementation. --- diff --git a/code/integrals_kinetic_part_1.F90 b/code/integrals_kinetic_part_1.F90 new file mode 100644 index 0000000..b1c1a97 --- /dev/null +++ b/code/integrals_kinetic_part_1.F90 @@ -0,0 +1,88 @@ +subroutine integrals_kinetic(kinet_1, bras_on_grid_1, bra_basis_1, kets_on_grid_1, ket_basis_1) + use basis, only: basis_copy_function_to_box, basis_ket_start_wrt_fftbox + use comms, only: pub_my_node_id + use fourier, only: fourier_apply_box_pair + use function_basis, only: FUNC_BASIS + use parallel_strategy, only: pub_first_atom_on_node + use simulation_cell, only: pub_fftbox, pub_cell + use sparse, only: sparse_atom_of_elem, sparse_index_length, SPAM3, sparse_put_element, sparse_generate_index, sparse_fi& + &rst_elem_on_node + + implicit none + complex(kind=DP), dimension(:,:,:), allocatable :: reciprocal_box_1 + Complex(kind=DP), dimension(:,:,:), allocatable :: transformed_1 + integer :: col_1 + integer :: col_atom_1 + integer :: fftbox_offset1_1 + integer :: fftbox_offset1_2 + integer :: fftbox_offset2_1 + integer :: fftbox_offset2_2 + integer :: fftbox_offset3_1 + integer :: fftbox_offset3_2 + integer :: freq_1_1 + integer :: freq_2_1 + integer :: freq_3_1 + integer :: i1_1 + integer :: i1_2 + integer :: i1_3 + integer :: i2_1 + integer :: i2_2 + integer :: i2_3 + integer :: i3_1 + integer :: i3_2 + integer :: i3_3 + integer :: local_col_atom_1 + integer :: row_1 + integer :: row_atom_1 + integer :: row_idx_1 + integer :: ternary_1 + integer :: ternary_2 + integer :: ternary_3 + integer :: tightbox_origin1_1 + integer :: tightbox_origin1_2 + integer :: tightbox_origin2_1 + integer :: tightbox_origin2_2 + integer :: tightbox_origin3_1 + integer :: tightbox_origin3_2 + integer, dimension(:), allocatable :: sparse_idx_1 + real(kind=DP) :: inner_product_result_1 + real(kind=DP) :: reciprocal_vector1_1 + real(kind=DP) :: reciprocal_vector2_1 + real(kind=DP) :: reciprocal_vector3_1 + real(kind=DP), dimension(:) :: bras_on_grid_1 + real(kind=DP), dimension(:) :: kets_on_grid_1 + real(kind=DP), dimension(:,:,:), allocatable :: dummybox_1 + real(kind=DP), dimension(:,:,:), allocatable :: fftbox_1 + real(kind=DP), dimension(:,:,:), allocatable :: fftbox_2 + real(kind=DP), dimension(:,:,:), allocatable :: fftbox_3 + real(kind=DP), dimension(:,:,:), allocatable :: scaled_1 + type(FUNC_BASIS) :: bra_basis_1 + type(FUNC_BASIS) :: ket_basis_1 + type(SPAM3) :: kinet_1 + + allocate(sparse_idx_1(sparse_index_length(kinet_1))) + call sparse_generate_index(sparse_idx_1, kinet_1) + do row_1 = sparse_first_elem_on_node(pub_my_node_id, kinet_1, 'R'), sparse_first_elem_on_node(pub_my_node_id + 1, kinet& + &_1, 'R') - 1 + row_atom_1 = sparse_atom_of_elem(row_1, kinet_1, 'R') + do col_1 = sparse_first_elem_on_node(pub_my_node_id, kinet_1, 'C'), sparse_first_elem_on_node(pub_my_node_id + 1, kin& + &et_1, 'C') - 1 + col_atom_1 = sparse_atom_of_elem(col_1, kinet_1, 'C') + local_col_atom_1 = col_atom_1 - pub_first_atom_on_node(pub_my_node_id) + 1 + do row_idx_1 = sparse_idx_1(local_col_atom_1), sparse_idx_1(local_col_atom_1 + 1) - 1 + if (sparse_idx_1(row_idx_1) .eq. row_atom_1) then + allocate(fftbox_3(pub_fftbox%total_pt1,pub_fftbox%total_pt2,pub_fftbox%total_pt3)) + call basis_ket_start_wrt_fftbox(fftbox_offset1_2, fftbox_offset2_2, fftbox_offset3_2, pub_fftbox%total_pt1, pub& + &_fftbox%total_pt2, pub_fftbox%total_pt3) + call basis_copy_function_to_box(fftbox_3, pub_fftbox%total_pt1, pub_fftbox%total_pt2, pub_fftbox%total_pt3, fft& + &box_offset1_2, fftbox_offset2_2, fftbox_offset3_2, bra_basis_1%tight_boxes(row_1), bras_on_grid_1, bra_basis_1& + &%spheres(row_1)) + tightbox_origin1_1 = (bra_basis_1%tight_boxes(row_1)%start_ppds1 - 1) * pub_cell%n_pt1 + bra_basis_1%tight_boxe& + &s(row_1)%start_pts1 + tightbox_origin2_2 = (bra_basis_1%tight_boxes(row_1)%start_ppds2 - 1) * pub_cell%n_pt2 + bra_basis_1%tight_boxe& + &s(row_1)%start_pts2 + tightbox_origin3_1 = (bra_basis_1%tight_boxes(row_1)%start_ppds3 - 1) * pub_cell%n_pt3 + bra_basis_1%tight_boxe& + &s(row_1)%start_pts3 + allocate(fftbox_1(pub_fftbox%total_pt1,pub_fftbox%total_pt2,pub_fftbox%total_pt3)) + call basis_ket_start_wrt_fftbox(fftbox_offset1_1, fftbox_offset2_1, fftbox_offset3_1, pub_fftbox%total_pt1, pub& + &_fftbox%total_pt2, pub_fftbox%total_pt3) diff --git a/code/integrals_kinetic_part_2.F90 b/code/integrals_kinetic_part_2.F90 new file mode 100644 index 0000000..ec23681 --- /dev/null +++ b/code/integrals_kinetic_part_2.F90 @@ -0,0 +1,87 @@ + call basis_copy_function_to_box(fftbox_1, pub_fftbox%total_pt1, pub_fftbox%total_pt2, pub_fftbox%total_pt3, fft& + &box_offset1_1, fftbox_offset2_1, fftbox_offset3_1, ket_basis_1%tight_boxes(col_1), kets_on_grid_1, ket_basis_1& + &%spheres(col_1)) + tightbox_origin1_2 = (ket_basis_1%tight_boxes(col_1)%start_ppds1 - 1) * pub_cell%n_pt1 + ket_basis_1%tight_boxe& + &s(col_1)%start_pts1 + tightbox_origin2_1 = (ket_basis_1%tight_boxes(col_1)%start_ppds2 - 1) * pub_cell%n_pt2 + ket_basis_1%tight_boxe& + &s(col_1)%start_pts2 + tightbox_origin3_2 = (ket_basis_1%tight_boxes(col_1)%start_ppds3 - 1) * pub_cell%n_pt3 + ket_basis_1%tight_boxe& + &s(col_1)%start_pts3 + allocate(reciprocal_box_1(pub_fftbox%total_pt1,pub_fftbox%total_pt2,pub_fftbox%total_pt3)) + call fourier_apply_box_pair('C', 'F', fftbox_1, fftbox_1, reciprocal_box_1) + deallocate(fftbox_1) + allocate(transformed_1(pub_fftbox%total_pt1,pub_fftbox%total_pt2,pub_fftbox%total_pt3)) + do i3_1 = 1, pub_fftbox%total_pt3 + if (i3_1 .gt. pub_fftbox%total_pt3 / 2 + 1) then + ternary_1 = i3_1 - pub_fftbox%total_pt3 - 1 + else + ternary_1 = i3_1 - 1 + endif + freq_3_1 = ternary_1 + do i2_3 = 1, pub_fftbox%total_pt2 + if (i2_3 .gt. pub_fftbox%total_pt2 / 2 + 1) then + ternary_2 = i2_3 - pub_fftbox%total_pt2 - 1 + else + ternary_2 = i2_3 - 1 + endif + freq_2_1 = ternary_2 + do i1_3 = 1, pub_fftbox%total_pt1 + if (i1_3 .gt. pub_fftbox%total_pt1 / 2 + 1) then + ternary_3 = i1_3 - pub_fftbox%total_pt1 - 1 + else + ternary_3 = i1_3 - 1 + endif + freq_1_1 = ternary_3 + reciprocal_vector1_1 = 0.0 + pub_fftbox%b1%X * real(freq_1_1) + pub_fftbox%b2%X * real(freq_2_1) + pub_ff& + &tbox%b3%X * real(freq_3_1) + reciprocal_vector2_1 = 0.0 + pub_fftbox%b1%Y * real(freq_1_1) + pub_fftbox%b2%Y * real(freq_2_1) + pub_ff& + &tbox%b3%Y * real(freq_3_1) + reciprocal_vector3_1 = 0.0 + pub_fftbox%b1%Z * real(freq_1_1) + pub_fftbox%b2%Z * real(freq_2_1) + pub_ff& + &tbox%b3%Z * real(freq_3_1) + transformed_1(i1_3, i2_3, i3_1) = reciprocal_box_1(i1_3, i2_3, i3_1) * cmplx((0.0 + reciprocal_vector1_1 & + &* reciprocal_vector1_1 + reciprocal_vector2_1 * reciprocal_vector2_1 + reciprocal_vector3_1 * reciprocal& + &_vector3_1) * (-1.0)) + end do + end do + end do + deallocate(reciprocal_box_1) + allocate(fftbox_2(pub_fftbox%total_pt1,pub_fftbox%total_pt2,pub_fftbox%total_pt3)) + allocate(dummybox_1(pub_fftbox%total_pt1,pub_fftbox%total_pt2,pub_fftbox%total_pt3)) + call fourier_apply_box_pair('C', 'B', fftbox_2, dummybox_1, transformed_1) + deallocate(transformed_1) + allocate(scaled_1(pub_fftbox%total_pt1,pub_fftbox%total_pt2,pub_fftbox%total_pt3)) + do i3_3 = 1, pub_fftbox%total_pt3 + do i2_2 = 1, pub_fftbox%total_pt2 + do i1_2 = 1, pub_fftbox%total_pt1 + scaled_1(i1_2, i2_2, i3_3) = fftbox_2(i1_2, i2_2, i3_3) * (-0.5) + end do + end do + end do + deallocate(fftbox_2) + deallocate(dummybox_1) + inner_product_result_1 = 0.0 + do i3_2 = max(tightbox_origin3_1 - fftbox_offset3_2, tightbox_origin3_2 - fftbox_offset3_1), min(tightbox_origi& + &n3_1 - fftbox_offset3_2 + pub_fftbox%total_pt3, tightbox_origin3_2 - fftbox_offset3_1 + pub_fftbox%total_pt3) & + &- 1 + do i2_1 = max(tightbox_origin2_2 - fftbox_offset2_2, tightbox_origin2_1 - fftbox_offset2_1), min(tightbox_ori& + &gin2_2 - fftbox_offset2_2 + pub_fftbox%total_pt2, tightbox_origin2_1 - fftbox_offset2_1 + pub_fftbox%total_p& + &t2) - 1 + do i1_1 = max(tightbox_origin1_1 - fftbox_offset1_2, tightbox_origin1_2 - fftbox_offset1_1), min(tightbox_o& + &rigin1_1 - fftbox_offset1_2 + pub_fftbox%total_pt1, tightbox_origin1_2 - fftbox_offset1_1 + pub_fftbox%tot& + &al_pt1) - 1 + inner_product_result_1 = inner_product_result_1 + fftbox_3(i1_1 - (tightbox_origin1_1 - fftbox_offset1_2)& + & + 1, i2_1 - (tightbox_origin2_2 - fftbox_offset2_2) + 1, i3_2 - (tightbox_origin3_1 - fftbox_offset3_2)& + & + 1) * scaled_1(i1_1 - (tightbox_origin1_2 - fftbox_offset1_1) + 1, i2_1 - (tightbox_origin2_1 - fftbox& + &_offset2_1) + 1, i3_2 - (tightbox_origin3_2 - fftbox_offset3_1) + 1) * pub_cell%weight + end do + end do + end do + deallocate(fftbox_3) + deallocate(scaled_1) + call sparse_put_element(inner_product_result_1, kinet_1, row_1, col_1) + endif + end do + end do + end do + deallocate(sparse_idx_1) +end subroutine diff --git a/presentation.tex b/presentation.tex index 17d3492..ddd1481 100644 --- a/presentation.tex +++ b/presentation.tex @@ -257,7 +257,7 @@ Permutation & $T_{perm} = (I \succ I') \cup (I' \succ I)$ \\ \frametitle{Our initial implementation} We developed an code generator in Scala that attempted to reason -abstractly about the indices involved in the computation. +about the indices involved in the computation. \vspace{0.5em} \centering @@ -269,6 +269,54 @@ abstractly about the indices involved in the computation. } +\frame{ + +\frametitle{Our initial implementation} + +The system was designed to reason about the relationships between indices, and +generate code subject to the various constraints. + +\begin{itemize} + +\item Nodes that could be applied point-wise did not affect any of their indices. + +\item Nodes that constructed entirely new fields (e.g. FFTs) removed indices of + the operands they used and exposed new indices. + +\item Nodes could be transparent to indices they did not act on, so an FFT node + acting on a set of functions would represent the FFT of all of them. + +\item Nodes could require random access to certain indices, forcing the system + to introduce temporary storage. + +\item The system attempted to synthesise a loop nesting that respected all + dependencies whilst minimizing the amout of intermediate storage required to + pass data between each node in the graph. + +\end{itemize} + +} + +\frame{ + +\frametitle{Generated code for ONETEP's kinetic energy integral} + +\resizebox{0.48\textwidth}{!}{ +\begin{minipage}{2.5\textwidth} +\begin{alltt} +\input{code/integrals_kinetic_part_1.F90} +\end{alltt} +\end{minipage} +} +\resizebox{0.48\textwidth}{!}{ +\begin{minipage}{2.5\textwidth} +\begin{alltt} +\input{code/integrals_kinetic_part_2.F90} +\end{alltt} +\end{minipage} +} + +} \end{document}