diff --git a/.github/workflows/run_test.yml b/.github/workflows/run_test.yml index bf8e9169..92878f19 100644 --- a/.github/workflows/run_test.yml +++ b/.github/workflows/run_test.yml @@ -89,6 +89,35 @@ jobs: use-cross: true args: --release --target aarch64-unknown-linux-gnu + check_arm64_neon: + name: Check and test Linux arm 64bit with neon + runs-on: ubuntu-latest + steps: + - name: Checkout sources + uses: actions/checkout@v2 + + - name: Install nightly toolchain + uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: nightly + target: aarch64-unknown-linux-gnu + override: true + + - name: Run cargo check + uses: actions-rs/cargo@v1 + with: + command: check + use-cross: true + args: --features neon-nightly --target aarch64-unknown-linux-gnu + + - name: Run cargo test for arm + uses: actions-rs/cargo@v1 + with: + command: test + use-cross: true + args: --release --features neon-nightly --target aarch64-unknown-linux-gnu + check_x86: name: Check and test Linux x86 32bit runs-on: ubuntu-latest diff --git a/Cargo.toml b/Cargo.toml index 9e1cd012..77650620 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,8 +21,11 @@ default = ["avx", "sse"] # If both are enabled, RustFFT will use AVX if the CPU supports it. If not, it will check for SSE4.1. # If neither instruction set is available, it will fall back to the scalar code. # On every other platform, these features do nothing, and RustFFT will behave like they are not set. +# +# On AArch64, the "neon-nightly" feature enables compilation of Neon-accelerated code. It requires a nightly compiler, and is disabled by default. avx = [] sse = [] +neon-nightly = [] [dependencies] diff --git a/README.md b/README.md index 310e4a49..43659e73 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,8 @@ Disabling them reduces compile time and binary size. On other platform than x86_64, these features do nothing and RustFFT will behave like they are not set. +On AArch64, the `neon-nightly` feature enables compilation of Neon-accelerated code. It requires a nightly compiler, and is disabled by default. Be warned that new nightly versions may break RustFFT's Neon support. + ## Stability/Future Breaking Changes Version 5.0 contains several breaking API changes. In the interest of stability, we're committing to making no more breaking changes for 3 years, aka until 2024. 
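The `neon-nightly` feature described above is purely opt-in: a downstream crate has to enable it in its own dependency declaration and build with a nightly toolchain. The sketch below is not part of this diff; it is one hypothetical way such a crate might prefer the Neon planner and fall back to the scalar planner, relying on the stub `FftPlannerNeon` (added further down in this diff) so the same code still compiles on non-AArch64 targets.

```rust
// Illustrative sketch (not part of this diff): a downstream crate with the
// `neon-nightly` feature enabled prefers FftPlannerNeon and falls back to the
// scalar planner. The stub FftPlannerNeon returns Err(()) when Neon support is
// not compiled in, so this builds on every platform.
use std::sync::Arc;
use rustfft::{num_complex::Complex, Fft, FftPlannerNeon, FftPlannerScalar};

fn plan_forward_f32(len: usize) -> Arc<dyn Fft<f32>> {
    match FftPlannerNeon::new() {
        // Neon-accelerated plan on AArch64 with the feature enabled
        Ok(mut planner) => planner.plan_fft_forward(len),
        // Scalar fallback everywhere else
        Err(()) => {
            let mut scalar_planner = FftPlannerScalar::new();
            scalar_planner.plan_fft_forward(len)
        }
    }
}

fn main() {
    let fft = plan_forward_f32(1234);
    let mut buffer = vec![Complex { re: 0.0f32, im: 0.0f32 }; 1234];
    fft.process(&mut buffer);
}
```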
diff --git a/benches/bench_check_neon_2to1024.rs b/benches/bench_check_neon_2to1024.rs new file mode 100644 index 00000000..fe51d7a1 --- /dev/null +++ b/benches/bench_check_neon_2to1024.rs @@ -0,0 +1,168 @@ +#![feature(test)] +extern crate rustfft; +extern crate test; + +use paste::paste; +use rustfft::num_complex::Complex; +use rustfft::num_traits::Zero; +use rustfft::Fft; +use std::sync::Arc; +use test::Bencher; + +// Make fft using planner +fn bench_planned_32(b: &mut Bencher, len: usize) { + let mut planner = rustfft::FftPlannerNeon::new().unwrap(); + let fft: Arc> = planner.plan_fft_forward(len); + + let mut buffer: Vec> = vec![Complex::zero(); len]; + let mut scratch: Vec> = vec![Complex::zero(); fft.get_inplace_scratch_len()]; + b.iter(|| { + fft.process_with_scratch(&mut buffer, &mut scratch); + }); +} + +// Make fft using planner +fn bench_planned_64(b: &mut Bencher, len: usize) { + let mut planner = rustfft::FftPlannerNeon::new().unwrap(); + let fft: Arc> = planner.plan_fft_forward(len); + + let mut buffer: Vec> = vec![Complex::zero(); len]; + let mut scratch: Vec> = vec![Complex::zero(); fft.get_inplace_scratch_len()]; + b.iter(|| { + fft.process_with_scratch(&mut buffer, &mut scratch); + }); +} + +// Create benches using functions taking one argument +macro_rules! make_benches { + ($name:ident, $fname:ident, { $($len:literal),* }) => { + paste! { + $( + #[bench] + fn [](b: &mut Bencher) { + [](b, $len); + } + + #[bench] + fn [](b: &mut Bencher) { + [](b, $len); + } + )* + } + } +} + +make_benches!(from2to1024, planned, {2, 3, 4, 5, 6, 7, 8, 9 }); +make_benches!(from2to1024, planned, {10, 11, 12, 13, 14, 15, 16, 17, 18, 19 }); +make_benches!(from2to1024, planned, {20, 21, 22, 23, 24, 25, 26, 27, 28, 29 }); +make_benches!(from2to1024, planned, {30, 31, 32, 33, 34, 35, 36, 37, 38, 39 }); +make_benches!(from2to1024, planned, {40, 41, 42, 43, 44, 45, 46, 47, 48, 49 }); +make_benches!(from2to1024, planned, {50, 51, 52, 53, 54, 55, 56, 57, 58, 59 }); +make_benches!(from2to1024, planned, {60, 61, 62, 63, 64, 65, 66, 67, 68, 69 }); +make_benches!(from2to1024, planned, {70, 71, 72, 73, 74, 75, 76, 77, 78, 79 }); +make_benches!(from2to1024, planned, {80, 81, 82, 83, 84, 85, 86, 87, 88, 89 }); +make_benches!(from2to1024, planned, {90, 91, 92, 93, 94, 95, 96, 97, 98, 99 }); + +make_benches!(from2to1024, planned, {100, 101, 102, 103, 104, 105, 106, 107, 108, 109 }); +make_benches!(from2to1024, planned, {110, 111, 112, 113, 114, 115, 116, 117, 118, 119 }); +make_benches!(from2to1024, planned, {120, 121, 122, 123, 124, 125, 126, 127, 128, 129 }); +make_benches!(from2to1024, planned, {130, 131, 132, 133, 134, 135, 136, 137, 138, 139 }); +make_benches!(from2to1024, planned, {140, 141, 142, 143, 144, 145, 146, 147, 148, 149 }); +make_benches!(from2to1024, planned, {150, 151, 152, 153, 154, 155, 156, 157, 158, 159 }); +make_benches!(from2to1024, planned, {160, 161, 162, 163, 164, 165, 166, 167, 168, 169 }); +make_benches!(from2to1024, planned, {170, 171, 172, 173, 174, 175, 176, 177, 178, 179 }); +make_benches!(from2to1024, planned, {180, 181, 182, 183, 184, 185, 186, 187, 188, 189 }); +make_benches!(from2to1024, planned, {190, 191, 192, 193, 194, 195, 196, 197, 198, 199 }); +/* +make_benches!(from2to1024, planned, {200, 201, 202, 203, 204, 205, 206, 207, 208, 209 }); +make_benches!(from2to1024, planned, {210, 211, 212, 213, 214, 215, 216, 217, 218, 219 }); +make_benches!(from2to1024, planned, {220, 221, 222, 223, 224, 225, 226, 227, 228, 229 }); +make_benches!(from2to1024, planned, {230, 231, 
232, 233, 234, 235, 236, 237, 238, 239 }); +make_benches!(from2to1024, planned, {240, 241, 242, 243, 244, 245, 246, 247, 248, 249 }); +make_benches!(from2to1024, planned, {250, 251, 252, 253, 254, 255, 256, 257, 258, 259 }); +make_benches!(from2to1024, planned, {260, 261, 262, 263, 264, 265, 266, 267, 268, 269 }); +make_benches!(from2to1024, planned, {270, 271, 272, 273, 274, 275, 276, 277, 278, 279 }); +make_benches!(from2to1024, planned, {280, 281, 282, 283, 284, 285, 286, 287, 288, 289 }); +make_benches!(from2to1024, planned, {290, 291, 292, 293, 294, 295, 296, 297, 298, 299 }); + +make_benches!(from2to1024, planned, {300, 301, 302, 303, 304, 305, 306, 307, 308, 309 }); +make_benches!(from2to1024, planned, {310, 311, 312, 313, 314, 315, 316, 317, 318, 319 }); +make_benches!(from2to1024, planned, {320, 321, 322, 323, 324, 325, 326, 327, 328, 329 }); +make_benches!(from2to1024, planned, {330, 331, 332, 333, 334, 335, 336, 337, 338, 339 }); +make_benches!(from2to1024, planned, {340, 341, 342, 343, 344, 345, 346, 347, 348, 349 }); +make_benches!(from2to1024, planned, {350, 351, 352, 353, 354, 355, 356, 357, 358, 359 }); +make_benches!(from2to1024, planned, {360, 361, 362, 363, 364, 365, 366, 367, 368, 369 }); +make_benches!(from2to1024, planned, {370, 371, 372, 373, 374, 375, 376, 377, 378, 379 }); +make_benches!(from2to1024, planned, {380, 381, 382, 383, 384, 385, 386, 387, 388, 389 }); +make_benches!(from2to1024, planned, {390, 391, 392, 393, 394, 395, 396, 397, 398, 399 }); + +make_benches!(from2to1024, planned, {400, 401, 402, 403, 404, 405, 406, 407, 408, 409 }); +make_benches!(from2to1024, planned, {410, 411, 412, 413, 414, 415, 416, 417, 418, 419 }); +make_benches!(from2to1024, planned, {420, 421, 422, 423, 424, 425, 426, 427, 428, 429 }); +make_benches!(from2to1024, planned, {430, 431, 432, 433, 434, 435, 436, 437, 438, 439 }); +make_benches!(from2to1024, planned, {440, 441, 442, 443, 444, 445, 446, 447, 448, 449 }); +make_benches!(from2to1024, planned, {450, 451, 452, 453, 454, 455, 456, 457, 458, 459 }); +make_benches!(from2to1024, planned, {460, 461, 462, 463, 464, 465, 466, 467, 468, 469 }); +make_benches!(from2to1024, planned, {470, 471, 472, 473, 474, 475, 476, 477, 478, 479 }); +make_benches!(from2to1024, planned, {480, 481, 482, 483, 484, 485, 486, 487, 488, 489 }); +make_benches!(from2to1024, planned, {490, 491, 492, 493, 494, 495, 496, 497, 498, 499 }); + +make_benches!(from2to1024, planned, {500, 501, 502, 503, 504, 505, 506, 507, 508, 509 }); +make_benches!(from2to1024, planned, {510, 511, 512, 513, 514, 515, 516, 517, 518, 519 }); +make_benches!(from2to1024, planned, {520, 521, 522, 523, 524, 525, 526, 527, 528, 529 }); +make_benches!(from2to1024, planned, {530, 531, 532, 533, 534, 535, 536, 537, 538, 539 }); +make_benches!(from2to1024, planned, {540, 541, 542, 543, 544, 545, 546, 547, 548, 549 }); +make_benches!(from2to1024, planned, {550, 551, 552, 553, 554, 555, 556, 557, 558, 559 }); +make_benches!(from2to1024, planned, {560, 561, 562, 563, 564, 565, 566, 567, 568, 569 }); +make_benches!(from2to1024, planned, {570, 571, 572, 573, 574, 575, 576, 577, 578, 579 }); +make_benches!(from2to1024, planned, {580, 581, 582, 583, 584, 585, 586, 587, 588, 589 }); +make_benches!(from2to1024, planned, {590, 591, 592, 593, 594, 595, 596, 597, 598, 599 }); + +make_benches!(from2to1024, planned, {600, 601, 602, 603, 604, 605, 606, 607, 608, 609 }); +make_benches!(from2to1024, planned, {610, 611, 612, 613, 614, 615, 616, 617, 618, 619 }); +make_benches!(from2to1024, planned, {620, 
621, 622, 623, 624, 625, 626, 627, 628, 629 }); +make_benches!(from2to1024, planned, {630, 631, 632, 633, 634, 635, 636, 637, 638, 639 }); +make_benches!(from2to1024, planned, {640, 641, 642, 643, 644, 645, 646, 647, 648, 649 }); +make_benches!(from2to1024, planned, {650, 651, 652, 653, 654, 655, 656, 657, 658, 659 }); +make_benches!(from2to1024, planned, {660, 661, 662, 663, 664, 665, 666, 667, 668, 669 }); +make_benches!(from2to1024, planned, {670, 671, 672, 673, 674, 675, 676, 677, 678, 679 }); +make_benches!(from2to1024, planned, {680, 681, 682, 683, 684, 685, 686, 687, 688, 689 }); +make_benches!(from2to1024, planned, {690, 691, 692, 693, 694, 695, 696, 697, 698, 699 }); + +make_benches!(from2to1024, planned, {700, 701, 702, 703, 704, 705, 706, 707, 708, 709 }); +make_benches!(from2to1024, planned, {710, 711, 712, 713, 714, 715, 716, 717, 718, 719 }); +make_benches!(from2to1024, planned, {720, 721, 722, 723, 724, 725, 726, 727, 728, 729 }); +make_benches!(from2to1024, planned, {730, 731, 732, 733, 734, 735, 736, 737, 738, 739 }); +make_benches!(from2to1024, planned, {740, 741, 742, 743, 744, 745, 746, 747, 748, 749 }); +make_benches!(from2to1024, planned, {750, 751, 752, 753, 754, 755, 756, 757, 758, 759 }); +make_benches!(from2to1024, planned, {760, 761, 762, 763, 764, 765, 766, 767, 768, 769 }); +make_benches!(from2to1024, planned, {770, 771, 772, 773, 774, 775, 776, 777, 778, 779 }); +make_benches!(from2to1024, planned, {780, 781, 782, 783, 784, 785, 786, 787, 788, 789 }); +make_benches!(from2to1024, planned, {790, 791, 792, 793, 794, 795, 796, 797, 798, 799 }); + +make_benches!(from2to1024, planned, {800, 801, 802, 803, 804, 805, 806, 807, 808, 809 }); +make_benches!(from2to1024, planned, {810, 811, 812, 813, 814, 815, 816, 817, 818, 819 }); +make_benches!(from2to1024, planned, {820, 821, 822, 823, 824, 825, 826, 827, 828, 829 }); +make_benches!(from2to1024, planned, {830, 831, 832, 833, 834, 835, 836, 837, 838, 839 }); +make_benches!(from2to1024, planned, {840, 841, 842, 843, 844, 845, 846, 847, 848, 849 }); +make_benches!(from2to1024, planned, {850, 851, 852, 853, 854, 855, 856, 857, 858, 859 }); +make_benches!(from2to1024, planned, {860, 861, 862, 863, 864, 865, 866, 867, 868, 869 }); +make_benches!(from2to1024, planned, {870, 871, 872, 873, 874, 875, 876, 877, 878, 879 }); +make_benches!(from2to1024, planned, {880, 881, 882, 883, 884, 885, 886, 887, 888, 889 }); +make_benches!(from2to1024, planned, {890, 891, 892, 893, 894, 895, 896, 897, 898, 899 }); + +make_benches!(from2to1024, planned, {900, 901, 902, 903, 904, 905, 906, 907, 908, 909 }); +make_benches!(from2to1024, planned, {910, 911, 912, 913, 914, 915, 916, 917, 918, 919 }); +make_benches!(from2to1024, planned, {920, 921, 922, 923, 924, 925, 926, 927, 928, 929 }); +make_benches!(from2to1024, planned, {930, 931, 932, 933, 934, 935, 936, 937, 938, 939 }); +make_benches!(from2to1024, planned, {940, 941, 942, 943, 944, 945, 946, 947, 948, 949 }); +make_benches!(from2to1024, planned, {950, 951, 952, 953, 954, 955, 956, 957, 958, 959 }); +make_benches!(from2to1024, planned, {960, 961, 962, 963, 964, 965, 966, 967, 968, 969 }); +make_benches!(from2to1024, planned, {970, 971, 972, 973, 974, 975, 976, 977, 978, 979 }); +make_benches!(from2to1024, planned, {980, 981, 982, 983, 984, 985, 986, 987, 988, 989 }); +make_benches!(from2to1024, planned, {990, 991, 992, 993, 994, 995, 996, 997, 998, 999 }); + +make_benches!(from2to1024, planned, {1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009 }); +make_benches!(from2to1024, 
planned, {1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019 });
+make_benches!(from2to1024, planned, {1020, 1021, 1022, 1023, 1024 });
+*/
diff --git a/benches/bench_compare_scalar_neon.rs b/benches/bench_compare_scalar_neon.rs
new file mode 100644
index 00000000..0ce3b2c8
--- /dev/null
+++ b/benches/bench_compare_scalar_neon.rs
@@ -0,0 +1,91 @@
+#![feature(test)]
+extern crate rustfft;
+extern crate test;
+
+use paste::paste;
+use rustfft::num_complex::Complex;
+use rustfft::num_traits::Zero;
+use rustfft::Fft;
+use std::sync::Arc;
+use test::Bencher;
+
+// Make fft using scalar planner
+fn bench_scalar_32(b: &mut Bencher, len: usize) {
+    let mut planner = rustfft::FftPlannerScalar::new();
+    let fft: Arc<dyn Fft<f32>> = planner.plan_fft_forward(len);
+
+    let mut buffer: Vec<Complex<f32>> = vec![Complex::zero(); len];
+    let mut scratch: Vec<Complex<f32>> = vec![Complex::zero(); fft.get_inplace_scratch_len()];
+    b.iter(|| {
+        fft.process_with_scratch(&mut buffer, &mut scratch);
+    });
+}
+
+// Make fft using scalar planner
+fn bench_scalar_64(b: &mut Bencher, len: usize) {
+    let mut planner = rustfft::FftPlannerScalar::new();
+    let fft: Arc<dyn Fft<f64>> = planner.plan_fft_forward(len);
+
+    let mut buffer: Vec<Complex<f64>> = vec![Complex::zero(); len];
+    let mut scratch: Vec<Complex<f64>> = vec![Complex::zero(); fft.get_inplace_scratch_len()];
+    b.iter(|| {
+        fft.process_with_scratch(&mut buffer, &mut scratch);
+    });
+}
+
+// Make fft using neon planner
+fn bench_neon_32(b: &mut Bencher, len: usize) {
+    let mut planner = rustfft::FftPlannerNeon::new().unwrap();
+    let fft: Arc<dyn Fft<f32>> = planner.plan_fft_forward(len);
+
+    let mut buffer: Vec<Complex<f32>> = vec![Complex::zero(); len];
+    let mut scratch: Vec<Complex<f32>> = vec![Complex::zero(); fft.get_inplace_scratch_len()];
+    b.iter(|| {
+        fft.process_with_scratch(&mut buffer, &mut scratch);
+    });
+}
+
+// Make fft using neon planner
+fn bench_neon_64(b: &mut Bencher, len: usize) {
+    let mut planner = rustfft::FftPlannerNeon::new().unwrap();
+    let fft: Arc<dyn Fft<f64>> = planner.plan_fft_forward(len);
+
+    let mut buffer: Vec<Complex<f64>> = vec![Complex::zero(); len];
+    let mut scratch: Vec<Complex<f64>> = vec![Complex::zero(); fft.get_inplace_scratch_len()];
+    b.iter(|| {
+        fft.process_with_scratch(&mut buffer, &mut scratch);
+    });
+}
+
+
+// Create benches using functions taking one argument
+macro_rules! make_benches {
+    ($name:ident, { $($len:literal),* }) => {
+        paste!
{ + $( + #[bench] + fn [<$name _ $len _f32_scalar>](b: &mut Bencher) { + [](b, $len); + } + + #[bench] + fn [<$name _ $len _f64_scalar>](b: &mut Bencher) { + [](b, $len); + } + + #[bench] + fn [<$name _ $len _f32_neon>](b: &mut Bencher) { + [](b, $len); + } + + #[bench] + fn [<$name _ $len _f64_neon>](b: &mut Bencher) { + [](b, $len); + } + )* + } + } +} + +make_benches!(neoncomparison, {4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072}); +make_benches!(neoncomparison, { 262144, 524288, 1048576, 2097152, 4194304 }); diff --git a/benches/bench_rustfft_neon.rs b/benches/bench_rustfft_neon.rs new file mode 100644 index 00000000..337ab1a8 --- /dev/null +++ b/benches/bench_rustfft_neon.rs @@ -0,0 +1,172 @@ +#![feature(test)] +extern crate rustfft; +extern crate test; + +use rustfft::num_complex::Complex; +use rustfft::num_traits::Zero; +use rustfft::Fft; +use std::sync::Arc; +use test::Bencher; + + +/// Times just the FFT execution (not allocation and pre-calculation) +/// for a given length +fn bench_planned_f32(b: &mut Bencher, len: usize) { + let mut planner = rustfft::FftPlannerNeon::new().unwrap(); + let fft: Arc> = planner.plan_fft_forward(len); + assert_eq!(fft.len(), len); + + let mut buffer = vec![Complex::zero(); len]; + let mut scratch = vec![Complex::zero(); fft.get_inplace_scratch_len()]; + b.iter(|| { + fft.process_with_scratch(&mut buffer, &mut scratch); + }); +} +fn bench_planned_f64(b: &mut Bencher, len: usize) { + let mut planner = rustfft::FftPlannerNeon::new().unwrap(); + let fft: Arc> = planner.plan_fft_forward(len); + assert_eq!(fft.len(), len); + + let mut buffer = vec![Complex::zero(); len]; + let mut scratch = vec![Complex::zero(); fft.get_inplace_scratch_len()]; + b.iter(|| { + fft.process_with_scratch(&mut buffer, &mut scratch); + }); +} + +/// Times just the FFT execution (not allocation and pre-calculation) +/// for a given length. +/// Run the fft on a 10*len vector, similar to how the butterflies are often used. 
+fn bench_planned_multi_f32(b: &mut Bencher, len: usize) { + let mut planner = rustfft::FftPlannerNeon::new().unwrap(); + let fft: Arc> = planner.plan_fft_forward(len); + + let mut buffer = vec![Complex::zero(); len * 10]; + let mut scratch = vec![Complex::zero(); fft.get_inplace_scratch_len()]; + b.iter(|| { + fft.process_with_scratch(&mut buffer, &mut scratch); + }); +} +fn bench_planned_multi_f64(b: &mut Bencher, len: usize) { + let mut planner = rustfft::FftPlannerNeon::new().unwrap(); + let fft: Arc> = planner.plan_fft_forward(len); + + let mut buffer = vec![Complex::zero(); len * 10]; + let mut scratch = vec![Complex::zero(); fft.get_inplace_scratch_len()]; + b.iter(|| { + fft.process_with_scratch(&mut buffer, &mut scratch); + }); +} + +// All butterflies +#[bench] fn neon_butterfly32_02(b: &mut Bencher) { bench_planned_multi_f32(b, 2);} +#[bench] fn neon_butterfly32_03(b: &mut Bencher) { bench_planned_multi_f32(b, 3);} +#[bench] fn neon_butterfly32_04(b: &mut Bencher) { bench_planned_multi_f32(b, 4);} +#[bench] fn neon_butterfly32_05(b: &mut Bencher) { bench_planned_multi_f32(b, 5);} +#[bench] fn neon_butterfly32_06(b: &mut Bencher) { bench_planned_multi_f32(b, 6);} +#[bench] fn neon_butterfly32_07(b: &mut Bencher) { bench_planned_multi_f32(b, 7);} +#[bench] fn neon_butterfly32_08(b: &mut Bencher) { bench_planned_multi_f32(b, 8);} +#[bench] fn neon_butterfly32_09(b: &mut Bencher) { bench_planned_multi_f32(b, 9);} +#[bench] fn neon_butterfly32_10(b: &mut Bencher) { bench_planned_multi_f32(b, 10);} +#[bench] fn neon_butterfly32_11(b: &mut Bencher) { bench_planned_multi_f32(b, 11);} +#[bench] fn neon_butterfly32_12(b: &mut Bencher) { bench_planned_multi_f32(b, 12);} +#[bench] fn neon_butterfly32_13(b: &mut Bencher) { bench_planned_multi_f32(b, 13);} +#[bench] fn neon_butterfly32_15(b: &mut Bencher) { bench_planned_multi_f32(b, 15);} +#[bench] fn neon_butterfly32_16(b: &mut Bencher) { bench_planned_multi_f32(b, 16);} +#[bench] fn neon_butterfly32_17(b: &mut Bencher) { bench_planned_multi_f32(b, 17);} +#[bench] fn neon_butterfly32_19(b: &mut Bencher) { bench_planned_multi_f32(b, 19);} +#[bench] fn neon_butterfly32_23(b: &mut Bencher) { bench_planned_multi_f32(b, 23);} +#[bench] fn neon_butterfly32_29(b: &mut Bencher) { bench_planned_multi_f32(b, 29);} +#[bench] fn neon_butterfly32_31(b: &mut Bencher) { bench_planned_multi_f32(b, 31);} +#[bench] fn neon_butterfly32_32(b: &mut Bencher) { bench_planned_multi_f32(b, 32);} + +#[bench] fn neon_butterfly64_02(b: &mut Bencher) { bench_planned_multi_f64(b, 2);} +#[bench] fn neon_butterfly64_03(b: &mut Bencher) { bench_planned_multi_f64(b, 3);} +#[bench] fn neon_butterfly64_04(b: &mut Bencher) { bench_planned_multi_f64(b, 4);} +#[bench] fn neon_butterfly64_05(b: &mut Bencher) { bench_planned_multi_f64(b, 5);} +#[bench] fn neon_butterfly64_06(b: &mut Bencher) { bench_planned_multi_f64(b, 6);} +#[bench] fn neon_butterfly64_07(b: &mut Bencher) { bench_planned_multi_f64(b, 7);} +#[bench] fn neon_butterfly64_08(b: &mut Bencher) { bench_planned_multi_f64(b, 8);} +#[bench] fn neon_butterfly64_09(b: &mut Bencher) { bench_planned_multi_f64(b, 9);} +#[bench] fn neon_butterfly64_10(b: &mut Bencher) { bench_planned_multi_f64(b, 10);} +#[bench] fn neon_butterfly64_11(b: &mut Bencher) { bench_planned_multi_f64(b, 11);} +#[bench] fn neon_butterfly64_12(b: &mut Bencher) { bench_planned_multi_f64(b, 12);} +#[bench] fn neon_butterfly64_13(b: &mut Bencher) { bench_planned_multi_f64(b, 13);} +#[bench] fn neon_butterfly64_15(b: &mut Bencher) { 
bench_planned_multi_f64(b, 15);} +#[bench] fn neon_butterfly64_16(b: &mut Bencher) { bench_planned_multi_f64(b, 16);} +#[bench] fn neon_butterfly64_17(b: &mut Bencher) { bench_planned_multi_f64(b, 17);} +#[bench] fn neon_butterfly64_19(b: &mut Bencher) { bench_planned_multi_f64(b, 19);} +#[bench] fn neon_butterfly64_23(b: &mut Bencher) { bench_planned_multi_f64(b, 23);} +#[bench] fn neon_butterfly64_29(b: &mut Bencher) { bench_planned_multi_f64(b, 29);} +#[bench] fn neon_butterfly64_31(b: &mut Bencher) { bench_planned_multi_f64(b, 31);} +#[bench] fn neon_butterfly64_32(b: &mut Bencher) { bench_planned_multi_f64(b, 32);} + +// Powers of 2 +#[bench] fn neon_planned32_p2_00000064(b: &mut Bencher) { bench_planned_f32(b, 64); } +#[bench] fn neon_planned32_p2_00000128(b: &mut Bencher) { bench_planned_f32(b, 128); } +#[bench] fn neon_planned32_p2_00000256(b: &mut Bencher) { bench_planned_f32(b, 256); } +#[bench] fn neon_planned32_p2_00000512(b: &mut Bencher) { bench_planned_f32(b, 512); } +#[bench] fn neon_planned32_p2_00001024(b: &mut Bencher) { bench_planned_f32(b, 1024); } +#[bench] fn neon_planned32_p2_00002048(b: &mut Bencher) { bench_planned_f32(b, 2048); } +#[bench] fn neon_planned32_p2_00004096(b: &mut Bencher) { bench_planned_f32(b, 4096); } +#[bench] fn neon_planned32_p2_00016384(b: &mut Bencher) { bench_planned_f32(b, 16384); } +#[bench] fn neon_planned32_p2_00065536(b: &mut Bencher) { bench_planned_f32(b, 65536); } +#[bench] fn neon_planned32_p2_01048576(b: &mut Bencher) { bench_planned_f32(b, 1048576); } + +#[bench] fn neon_planned64_p2_00000064(b: &mut Bencher) { bench_planned_f64(b, 64); } +#[bench] fn neon_planned64_p2_00000128(b: &mut Bencher) { bench_planned_f64(b, 128); } +#[bench] fn neon_planned64_p2_00000256(b: &mut Bencher) { bench_planned_f64(b, 256); } +#[bench] fn neon_planned64_p2_00000512(b: &mut Bencher) { bench_planned_f64(b, 512); } +#[bench] fn neon_planned64_p2_00001024(b: &mut Bencher) { bench_planned_f64(b, 1024); } +#[bench] fn neon_planned64_p2_00002048(b: &mut Bencher) { bench_planned_f64(b, 2048); } +#[bench] fn neon_planned64_p2_00004096(b: &mut Bencher) { bench_planned_f64(b, 4096); } +#[bench] fn neon_planned64_p2_00016384(b: &mut Bencher) { bench_planned_f64(b, 16384); } +#[bench] fn neon_planned64_p2_00065536(b: &mut Bencher) { bench_planned_f64(b, 65536); } +#[bench] fn neon_planned64_p2_01048576(b: &mut Bencher) { bench_planned_f64(b, 1048576); } + + +// Powers of 7 +#[bench] fn neon_planned32_p7_00343(b: &mut Bencher) { bench_planned_f32(b, 343); } +#[bench] fn neon_planned32_p7_02401(b: &mut Bencher) { bench_planned_f32(b, 2401); } +#[bench] fn neon_planned32_p7_16807(b: &mut Bencher) { bench_planned_f32(b, 16807); } + +#[bench] fn neon_planned64_p7_00343(b: &mut Bencher) { bench_planned_f64(b, 343); } +#[bench] fn neon_planned64_p7_02401(b: &mut Bencher) { bench_planned_f64(b, 2401); } +#[bench] fn neon_planned64_p7_16807(b: &mut Bencher) { bench_planned_f64(b, 16807); } + +// Prime lengths +#[bench] fn neon_planned32_prime_0149(b: &mut Bencher) { bench_planned_f32(b, 149); } +#[bench] fn neon_planned32_prime_0151(b: &mut Bencher) { bench_planned_f32(b, 151); } +#[bench] fn neon_planned32_prime_0251(b: &mut Bencher) { bench_planned_f32(b, 251); } +#[bench] fn neon_planned32_prime_0257(b: &mut Bencher) { bench_planned_f32(b, 257); } +#[bench] fn neon_planned32_prime_2017(b: &mut Bencher) { bench_planned_f32(b, 2017); } +#[bench] fn neon_planned32_prime_2879(b: &mut Bencher) { bench_planned_f32(b, 2879); } +#[bench] fn neon_planned32_prime_65521(b: 
&mut Bencher) { bench_planned_f32(b, 65521); } +#[bench] fn neon_planned32_prime_746497(b: &mut Bencher) { bench_planned_f32(b,746497); } + +#[bench] fn neon_planned64_prime_0149(b: &mut Bencher) { bench_planned_f64(b, 149); } +#[bench] fn neon_planned64_prime_0151(b: &mut Bencher) { bench_planned_f64(b, 151); } +#[bench] fn neon_planned64_prime_0251(b: &mut Bencher) { bench_planned_f64(b, 251); } +#[bench] fn neon_planned64_prime_0257(b: &mut Bencher) { bench_planned_f64(b, 257); } +#[bench] fn neon_planned64_prime_2017(b: &mut Bencher) { bench_planned_f64(b, 2017); } +#[bench] fn neon_planned64_prime_2879(b: &mut Bencher) { bench_planned_f64(b, 2879); } +#[bench] fn neon_planned64_prime_65521(b: &mut Bencher) { bench_planned_f64(b, 65521); } +#[bench] fn neon_planned64_prime_746497(b: &mut Bencher) { bench_planned_f64(b,746497); } + +// small mixed composites +#[bench] fn neon_planned32_composite_000018(b: &mut Bencher) { bench_planned_f32(b, 00018); } +#[bench] fn neon_planned32_composite_000360(b: &mut Bencher) { bench_planned_f32(b, 00360); } +#[bench] fn neon_planned32_composite_001200(b: &mut Bencher) { bench_planned_f32(b, 01200); } +#[bench] fn neon_planned32_composite_044100(b: &mut Bencher) { bench_planned_f32(b, 44100); } +#[bench] fn neon_planned32_composite_048000(b: &mut Bencher) { bench_planned_f32(b, 48000); } +#[bench] fn neon_planned32_composite_046656(b: &mut Bencher) { bench_planned_f32(b, 46656); } + +#[bench] fn neon_planned64_composite_000018(b: &mut Bencher) { bench_planned_f64(b, 00018); } +#[bench] fn neon_planned64_composite_000360(b: &mut Bencher) { bench_planned_f64(b, 00360); } +#[bench] fn neon_planned64_composite_001200(b: &mut Bencher) { bench_planned_f64(b, 01200); } +#[bench] fn neon_planned64_composite_044100(b: &mut Bencher) { bench_planned_f64(b, 44100); } +#[bench] fn neon_planned64_composite_048000(b: &mut Bencher) { bench_planned_f64(b, 48000); } +#[bench] fn neon_planned64_composite_046656(b: &mut Bencher) { bench_planned_f64(b, 46656); } + + + diff --git a/benches/bench_rustfft_sse.rs b/benches/bench_rustfft_sse.rs index 680fe8ff..962e7e56 100644 --- a/benches/bench_rustfft_sse.rs +++ b/benches/bench_rustfft_sse.rs @@ -59,114 +59,114 @@ fn bench_planned_multi_f64(b: &mut Bencher, len: usize) { } // All butterflies -#[bench] fn butterfly32_02(b: &mut Bencher) { bench_planned_multi_f32(b, 2);} -#[bench] fn butterfly32_03(b: &mut Bencher) { bench_planned_multi_f32(b, 3);} -#[bench] fn butterfly32_04(b: &mut Bencher) { bench_planned_multi_f32(b, 4);} -#[bench] fn butterfly32_05(b: &mut Bencher) { bench_planned_multi_f32(b, 5);} -#[bench] fn butterfly32_06(b: &mut Bencher) { bench_planned_multi_f32(b, 6);} -#[bench] fn butterfly32_07(b: &mut Bencher) { bench_planned_multi_f32(b, 7);} -#[bench] fn butterfly32_08(b: &mut Bencher) { bench_planned_multi_f32(b, 8);} -#[bench] fn butterfly32_09(b: &mut Bencher) { bench_planned_multi_f32(b, 9);} -#[bench] fn butterfly32_10(b: &mut Bencher) { bench_planned_multi_f32(b, 10);} -#[bench] fn butterfly32_11(b: &mut Bencher) { bench_planned_multi_f32(b, 11);} -#[bench] fn butterfly32_12(b: &mut Bencher) { bench_planned_multi_f32(b, 12);} -#[bench] fn butterfly32_13(b: &mut Bencher) { bench_planned_multi_f32(b, 13);} -#[bench] fn butterfly32_15(b: &mut Bencher) { bench_planned_multi_f32(b, 15);} -#[bench] fn butterfly32_16(b: &mut Bencher) { bench_planned_multi_f32(b, 16);} -#[bench] fn butterfly32_17(b: &mut Bencher) { bench_planned_multi_f32(b, 17);} -#[bench] fn butterfly32_19(b: &mut Bencher) { 
bench_planned_multi_f32(b, 19);} -#[bench] fn butterfly32_23(b: &mut Bencher) { bench_planned_multi_f32(b, 23);} -#[bench] fn butterfly32_29(b: &mut Bencher) { bench_planned_multi_f32(b, 29);} -#[bench] fn butterfly32_31(b: &mut Bencher) { bench_planned_multi_f32(b, 31);} -#[bench] fn butterfly32_32(b: &mut Bencher) { bench_planned_multi_f32(b, 32);} - -#[bench] fn butterfly64_02(b: &mut Bencher) { bench_planned_multi_f64(b, 2);} -#[bench] fn butterfly64_03(b: &mut Bencher) { bench_planned_multi_f64(b, 3);} -#[bench] fn butterfly64_04(b: &mut Bencher) { bench_planned_multi_f64(b, 4);} -#[bench] fn butterfly64_05(b: &mut Bencher) { bench_planned_multi_f64(b, 5);} -#[bench] fn butterfly64_06(b: &mut Bencher) { bench_planned_multi_f64(b, 6);} -#[bench] fn butterfly64_07(b: &mut Bencher) { bench_planned_multi_f64(b, 7);} -#[bench] fn butterfly64_08(b: &mut Bencher) { bench_planned_multi_f64(b, 8);} -#[bench] fn butterfly64_09(b: &mut Bencher) { bench_planned_multi_f64(b, 9);} -#[bench] fn butterfly64_10(b: &mut Bencher) { bench_planned_multi_f64(b, 10);} -#[bench] fn butterfly64_11(b: &mut Bencher) { bench_planned_multi_f64(b, 11);} -#[bench] fn butterfly64_12(b: &mut Bencher) { bench_planned_multi_f64(b, 12);} -#[bench] fn butterfly64_13(b: &mut Bencher) { bench_planned_multi_f64(b, 13);} -#[bench] fn butterfly64_15(b: &mut Bencher) { bench_planned_multi_f64(b, 15);} -#[bench] fn butterfly64_16(b: &mut Bencher) { bench_planned_multi_f64(b, 16);} -#[bench] fn butterfly64_17(b: &mut Bencher) { bench_planned_multi_f64(b, 17);} -#[bench] fn butterfly64_19(b: &mut Bencher) { bench_planned_multi_f64(b, 19);} -#[bench] fn butterfly64_23(b: &mut Bencher) { bench_planned_multi_f64(b, 23);} -#[bench] fn butterfly64_29(b: &mut Bencher) { bench_planned_multi_f64(b, 29);} -#[bench] fn butterfly64_31(b: &mut Bencher) { bench_planned_multi_f64(b, 31);} -#[bench] fn butterfly64_32(b: &mut Bencher) { bench_planned_multi_f64(b, 32);} +#[bench] fn sse_butterfly32_02(b: &mut Bencher) { bench_planned_multi_f32(b, 2);} +#[bench] fn sse_butterfly32_03(b: &mut Bencher) { bench_planned_multi_f32(b, 3);} +#[bench] fn sse_butterfly32_04(b: &mut Bencher) { bench_planned_multi_f32(b, 4);} +#[bench] fn sse_butterfly32_05(b: &mut Bencher) { bench_planned_multi_f32(b, 5);} +#[bench] fn sse_butterfly32_06(b: &mut Bencher) { bench_planned_multi_f32(b, 6);} +#[bench] fn sse_butterfly32_07(b: &mut Bencher) { bench_planned_multi_f32(b, 7);} +#[bench] fn sse_butterfly32_08(b: &mut Bencher) { bench_planned_multi_f32(b, 8);} +#[bench] fn sse_butterfly32_09(b: &mut Bencher) { bench_planned_multi_f32(b, 9);} +#[bench] fn sse_butterfly32_10(b: &mut Bencher) { bench_planned_multi_f32(b, 10);} +#[bench] fn sse_butterfly32_11(b: &mut Bencher) { bench_planned_multi_f32(b, 11);} +#[bench] fn sse_butterfly32_12(b: &mut Bencher) { bench_planned_multi_f32(b, 12);} +#[bench] fn sse_butterfly32_13(b: &mut Bencher) { bench_planned_multi_f32(b, 13);} +#[bench] fn sse_butterfly32_15(b: &mut Bencher) { bench_planned_multi_f32(b, 15);} +#[bench] fn sse_butterfly32_16(b: &mut Bencher) { bench_planned_multi_f32(b, 16);} +#[bench] fn sse_butterfly32_17(b: &mut Bencher) { bench_planned_multi_f32(b, 17);} +#[bench] fn sse_butterfly32_19(b: &mut Bencher) { bench_planned_multi_f32(b, 19);} +#[bench] fn sse_butterfly32_23(b: &mut Bencher) { bench_planned_multi_f32(b, 23);} +#[bench] fn sse_butterfly32_29(b: &mut Bencher) { bench_planned_multi_f32(b, 29);} +#[bench] fn sse_butterfly32_31(b: &mut Bencher) { bench_planned_multi_f32(b, 31);} +#[bench] fn 
sse_butterfly32_32(b: &mut Bencher) { bench_planned_multi_f32(b, 32);} + +#[bench] fn sse_butterfly64_02(b: &mut Bencher) { bench_planned_multi_f64(b, 2);} +#[bench] fn sse_butterfly64_03(b: &mut Bencher) { bench_planned_multi_f64(b, 3);} +#[bench] fn sse_butterfly64_04(b: &mut Bencher) { bench_planned_multi_f64(b, 4);} +#[bench] fn sse_butterfly64_05(b: &mut Bencher) { bench_planned_multi_f64(b, 5);} +#[bench] fn sse_butterfly64_06(b: &mut Bencher) { bench_planned_multi_f64(b, 6);} +#[bench] fn sse_butterfly64_07(b: &mut Bencher) { bench_planned_multi_f64(b, 7);} +#[bench] fn sse_butterfly64_08(b: &mut Bencher) { bench_planned_multi_f64(b, 8);} +#[bench] fn sse_butterfly64_09(b: &mut Bencher) { bench_planned_multi_f64(b, 9);} +#[bench] fn sse_butterfly64_10(b: &mut Bencher) { bench_planned_multi_f64(b, 10);} +#[bench] fn sse_butterfly64_11(b: &mut Bencher) { bench_planned_multi_f64(b, 11);} +#[bench] fn sse_butterfly64_12(b: &mut Bencher) { bench_planned_multi_f64(b, 12);} +#[bench] fn sse_butterfly64_13(b: &mut Bencher) { bench_planned_multi_f64(b, 13);} +#[bench] fn sse_butterfly64_15(b: &mut Bencher) { bench_planned_multi_f64(b, 15);} +#[bench] fn sse_butterfly64_16(b: &mut Bencher) { bench_planned_multi_f64(b, 16);} +#[bench] fn sse_butterfly64_17(b: &mut Bencher) { bench_planned_multi_f64(b, 17);} +#[bench] fn sse_butterfly64_19(b: &mut Bencher) { bench_planned_multi_f64(b, 19);} +#[bench] fn sse_butterfly64_23(b: &mut Bencher) { bench_planned_multi_f64(b, 23);} +#[bench] fn sse_butterfly64_29(b: &mut Bencher) { bench_planned_multi_f64(b, 29);} +#[bench] fn sse_butterfly64_31(b: &mut Bencher) { bench_planned_multi_f64(b, 31);} +#[bench] fn sse_butterfly64_32(b: &mut Bencher) { bench_planned_multi_f64(b, 32);} // Powers of 2 -#[bench] fn planned32_p2_00000064(b: &mut Bencher) { bench_planned_f32(b, 64); } -#[bench] fn planned32_p2_00000128(b: &mut Bencher) { bench_planned_f32(b, 128); } -#[bench] fn planned32_p2_00000256(b: &mut Bencher) { bench_planned_f32(b, 256); } -#[bench] fn planned32_p2_00000512(b: &mut Bencher) { bench_planned_f32(b, 512); } -#[bench] fn planned32_p2_00001024(b: &mut Bencher) { bench_planned_f32(b, 1024); } -#[bench] fn planned32_p2_00002048(b: &mut Bencher) { bench_planned_f32(b, 2048); } -#[bench] fn planned32_p2_00004096(b: &mut Bencher) { bench_planned_f32(b, 4096); } -#[bench] fn planned32_p2_00016384(b: &mut Bencher) { bench_planned_f32(b, 16384); } -#[bench] fn planned32_p2_00065536(b: &mut Bencher) { bench_planned_f32(b, 65536); } -#[bench] fn planned32_p2_01048576(b: &mut Bencher) { bench_planned_f32(b, 1048576); } - -#[bench] fn planned64_p2_00000064(b: &mut Bencher) { bench_planned_f64(b, 64); } -#[bench] fn planned64_p2_00000128(b: &mut Bencher) { bench_planned_f64(b, 128); } -#[bench] fn planned64_p2_00000256(b: &mut Bencher) { bench_planned_f64(b, 256); } -#[bench] fn planned64_p2_00000512(b: &mut Bencher) { bench_planned_f64(b, 512); } -#[bench] fn planned64_p2_00001024(b: &mut Bencher) { bench_planned_f64(b, 1024); } -#[bench] fn planned64_p2_00002048(b: &mut Bencher) { bench_planned_f64(b, 2048); } -#[bench] fn planned64_p2_00004096(b: &mut Bencher) { bench_planned_f64(b, 4096); } -#[bench] fn planned64_p2_00016384(b: &mut Bencher) { bench_planned_f64(b, 16384); } -#[bench] fn planned64_p2_00065536(b: &mut Bencher) { bench_planned_f64(b, 65536); } -#[bench] fn planned64_p2_01048576(b: &mut Bencher) { bench_planned_f64(b, 1048576); } +#[bench] fn sse_planned32_p2_00000064(b: &mut Bencher) { bench_planned_f32(b, 64); } +#[bench] fn 
sse_planned32_p2_00000128(b: &mut Bencher) { bench_planned_f32(b, 128); } +#[bench] fn sse_planned32_p2_00000256(b: &mut Bencher) { bench_planned_f32(b, 256); } +#[bench] fn sse_planned32_p2_00000512(b: &mut Bencher) { bench_planned_f32(b, 512); } +#[bench] fn sse_planned32_p2_00001024(b: &mut Bencher) { bench_planned_f32(b, 1024); } +#[bench] fn sse_planned32_p2_00002048(b: &mut Bencher) { bench_planned_f32(b, 2048); } +#[bench] fn sse_planned32_p2_00004096(b: &mut Bencher) { bench_planned_f32(b, 4096); } +#[bench] fn sse_planned32_p2_00016384(b: &mut Bencher) { bench_planned_f32(b, 16384); } +#[bench] fn sse_planned32_p2_00065536(b: &mut Bencher) { bench_planned_f32(b, 65536); } +#[bench] fn sse_planned32_p2_01048576(b: &mut Bencher) { bench_planned_f32(b, 1048576); } + +#[bench] fn sse_planned64_p2_00000064(b: &mut Bencher) { bench_planned_f64(b, 64); } +#[bench] fn sse_planned64_p2_00000128(b: &mut Bencher) { bench_planned_f64(b, 128); } +#[bench] fn sse_planned64_p2_00000256(b: &mut Bencher) { bench_planned_f64(b, 256); } +#[bench] fn sse_planned64_p2_00000512(b: &mut Bencher) { bench_planned_f64(b, 512); } +#[bench] fn sse_planned64_p2_00001024(b: &mut Bencher) { bench_planned_f64(b, 1024); } +#[bench] fn sse_planned64_p2_00002048(b: &mut Bencher) { bench_planned_f64(b, 2048); } +#[bench] fn sse_planned64_p2_00004096(b: &mut Bencher) { bench_planned_f64(b, 4096); } +#[bench] fn sse_planned64_p2_00016384(b: &mut Bencher) { bench_planned_f64(b, 16384); } +#[bench] fn sse_planned64_p2_00065536(b: &mut Bencher) { bench_planned_f64(b, 65536); } +#[bench] fn sse_planned64_p2_01048576(b: &mut Bencher) { bench_planned_f64(b, 1048576); } // Powers of 7 -#[bench] fn planned32_p7_00343(b: &mut Bencher) { bench_planned_f32(b, 343); } -#[bench] fn planned32_p7_02401(b: &mut Bencher) { bench_planned_f32(b, 2401); } -#[bench] fn planned32_p7_16807(b: &mut Bencher) { bench_planned_f32(b, 16807); } +#[bench] fn sse_planned32_p7_00343(b: &mut Bencher) { bench_planned_f32(b, 343); } +#[bench] fn sse_planned32_p7_02401(b: &mut Bencher) { bench_planned_f32(b, 2401); } +#[bench] fn sse_planned32_p7_16807(b: &mut Bencher) { bench_planned_f32(b, 16807); } -#[bench] fn planned64_p7_00343(b: &mut Bencher) { bench_planned_f64(b, 343); } -#[bench] fn planned64_p7_02401(b: &mut Bencher) { bench_planned_f64(b, 2401); } -#[bench] fn planned64_p7_16807(b: &mut Bencher) { bench_planned_f64(b, 16807); } +#[bench] fn sse_planned64_p7_00343(b: &mut Bencher) { bench_planned_f64(b, 343); } +#[bench] fn sse_planned64_p7_02401(b: &mut Bencher) { bench_planned_f64(b, 2401); } +#[bench] fn sse_planned64_p7_16807(b: &mut Bencher) { bench_planned_f64(b, 16807); } // Prime lengths -#[bench] fn planned32_prime_0149(b: &mut Bencher) { bench_planned_f32(b, 149); } -#[bench] fn planned32_prime_0151(b: &mut Bencher) { bench_planned_f32(b, 151); } -#[bench] fn planned32_prime_0251(b: &mut Bencher) { bench_planned_f32(b, 251); } -#[bench] fn planned32_prime_0257(b: &mut Bencher) { bench_planned_f32(b, 257); } -#[bench] fn planned32_prime_2017(b: &mut Bencher) { bench_planned_f32(b, 2017); } -#[bench] fn planned32_prime_2879(b: &mut Bencher) { bench_planned_f32(b, 2879); } -#[bench] fn planned32_prime_65521(b: &mut Bencher) { bench_planned_f32(b, 65521); } -#[bench] fn planned32_prime_746497(b: &mut Bencher) { bench_planned_f32(b,746497); } - -#[bench] fn planned64_prime_0149(b: &mut Bencher) { bench_planned_f64(b, 149); } -#[bench] fn planned64_prime_0151(b: &mut Bencher) { bench_planned_f64(b, 151); } -#[bench] fn 
planned64_prime_0251(b: &mut Bencher) { bench_planned_f64(b, 251); } -#[bench] fn planned64_prime_0257(b: &mut Bencher) { bench_planned_f64(b, 257); } -#[bench] fn planned64_prime_2017(b: &mut Bencher) { bench_planned_f64(b, 2017); } -#[bench] fn planned64_prime_2879(b: &mut Bencher) { bench_planned_f64(b, 2879); } -#[bench] fn planned64_prime_65521(b: &mut Bencher) { bench_planned_f64(b, 65521); } -#[bench] fn planned64_prime_746497(b: &mut Bencher) { bench_planned_f64(b,746497); } +#[bench] fn sse_planned32_prime_0149(b: &mut Bencher) { bench_planned_f32(b, 149); } +#[bench] fn sse_planned32_prime_0151(b: &mut Bencher) { bench_planned_f32(b, 151); } +#[bench] fn sse_planned32_prime_0251(b: &mut Bencher) { bench_planned_f32(b, 251); } +#[bench] fn sse_planned32_prime_0257(b: &mut Bencher) { bench_planned_f32(b, 257); } +#[bench] fn sse_planned32_prime_2017(b: &mut Bencher) { bench_planned_f32(b, 2017); } +#[bench] fn sse_planned32_prime_2879(b: &mut Bencher) { bench_planned_f32(b, 2879); } +#[bench] fn sse_planned32_prime_65521(b: &mut Bencher) { bench_planned_f32(b, 65521); } +#[bench] fn sse_planned32_prime_746497(b: &mut Bencher) { bench_planned_f32(b,746497); } + +#[bench] fn sse_planned64_prime_0149(b: &mut Bencher) { bench_planned_f64(b, 149); } +#[bench] fn sse_planned64_prime_0151(b: &mut Bencher) { bench_planned_f64(b, 151); } +#[bench] fn sse_planned64_prime_0251(b: &mut Bencher) { bench_planned_f64(b, 251); } +#[bench] fn sse_planned64_prime_0257(b: &mut Bencher) { bench_planned_f64(b, 257); } +#[bench] fn sse_planned64_prime_2017(b: &mut Bencher) { bench_planned_f64(b, 2017); } +#[bench] fn sse_planned64_prime_2879(b: &mut Bencher) { bench_planned_f64(b, 2879); } +#[bench] fn sse_planned64_prime_65521(b: &mut Bencher) { bench_planned_f64(b, 65521); } +#[bench] fn sse_planned64_prime_746497(b: &mut Bencher) { bench_planned_f64(b,746497); } // small mixed composites -#[bench] fn planned32_composite_000018(b: &mut Bencher) { bench_planned_f32(b, 00018); } -#[bench] fn planned32_composite_000360(b: &mut Bencher) { bench_planned_f32(b, 00360); } -#[bench] fn planned32_composite_001200(b: &mut Bencher) { bench_planned_f32(b, 01200); } -#[bench] fn planned32_composite_044100(b: &mut Bencher) { bench_planned_f32(b, 44100); } -#[bench] fn planned32_composite_048000(b: &mut Bencher) { bench_planned_f32(b, 48000); } -#[bench] fn planned32_composite_046656(b: &mut Bencher) { bench_planned_f32(b, 46656); } - -#[bench] fn planned64_composite_000018(b: &mut Bencher) { bench_planned_f64(b, 00018); } -#[bench] fn planned64_composite_000360(b: &mut Bencher) { bench_planned_f64(b, 00360); } -#[bench] fn planned64_composite_001200(b: &mut Bencher) { bench_planned_f64(b, 01200); } -#[bench] fn planned64_composite_044100(b: &mut Bencher) { bench_planned_f64(b, 44100); } -#[bench] fn planned64_composite_048000(b: &mut Bencher) { bench_planned_f64(b, 48000); } -#[bench] fn planned64_composite_046656(b: &mut Bencher) { bench_planned_f64(b, 46656); } +#[bench] fn sse_planned32_composite_000018(b: &mut Bencher) { bench_planned_f32(b, 00018); } +#[bench] fn sse_planned32_composite_000360(b: &mut Bencher) { bench_planned_f32(b, 00360); } +#[bench] fn sse_planned32_composite_001200(b: &mut Bencher) { bench_planned_f32(b, 01200); } +#[bench] fn sse_planned32_composite_044100(b: &mut Bencher) { bench_planned_f32(b, 44100); } +#[bench] fn sse_planned32_composite_048000(b: &mut Bencher) { bench_planned_f32(b, 48000); } +#[bench] fn sse_planned32_composite_046656(b: &mut Bencher) { bench_planned_f32(b, 46656); 
}
+
+#[bench] fn sse_planned64_composite_000018(b: &mut Bencher) { bench_planned_f64(b, 00018); }
+#[bench] fn sse_planned64_composite_000360(b: &mut Bencher) { bench_planned_f64(b, 00360); }
+#[bench] fn sse_planned64_composite_001200(b: &mut Bencher) { bench_planned_f64(b, 01200); }
+#[bench] fn sse_planned64_composite_044100(b: &mut Bencher) { bench_planned_f64(b, 44100); }
+#[bench] fn sse_planned64_composite_048000(b: &mut Bencher) { bench_planned_f64(b, 48000); }
+#[bench] fn sse_planned64_composite_046656(b: &mut Bencher) { bench_planned_f64(b, 46656); }
diff --git a/examples/asmtest.rs b/examples/asmtest.rs
index 40e0efea..50124182 100644
--- a/examples/asmtest.rs
+++ b/examples/asmtest.rs
@@ -1,13 +1,29 @@
-//! Compile something that has a scalar butterfly 4
+//! This example is meant to be used for inspecting the generated assembly.
+//! This can be interesting when working with SIMD intrinsics.
+//!
+//! To use:
+//! - Mark the function that should be investigated with `#[inline(never)]`.
+//! - If needed, add any required feature to the function, for example `#[target_feature(enable = "sse4.1")]`
+//! - Change the code below to use the changed function.
+//!   Currently it is set up to look at the f32 version of the SSE 4-point butterfly.
+//!   It uses the FftPlannerSse to plan a length 4 FFT, which will use the modified butterfly.
+//! - Ask rustc to output assembly code:
+//!   `cargo rustc --release --features sse --example asmtest -- --emit=asm`
+//! - This will create a file at `target/release/examples/asmtest-0123456789abcdef.s` (with a random number in the filename).
+//! - Open this file and search for the function.
 
-//use rustfft::num_complex::Complex32;
-use rustfft::num_complex::Complex64;
-use rustfft::FftPlannerScalar;
+use rustfft::num_complex::Complex32;
+//use rustfft::num_complex::Complex64;
+//use rustfft::FftPlannerScalar;
+use rustfft::FftPlannerSse;
+//use rustfft::FftPlannerNeon;
 
 fn main() {
-    let mut planner = FftPlannerScalar::new();
+    //let mut planner = FftPlannerScalar::new();
+    let mut planner = FftPlannerSse::new().unwrap();
+    //let mut planner = FftPlannerNeon::new().unwrap();
     let fft = planner.plan_fft_forward(4);
-    let mut buffer = vec![Complex64::new(0.0, 0.0); 100];
+    let mut buffer = vec![Complex32::new(0.0, 0.0); 100];
     fft.process(&mut buffer);
 }
diff --git a/rustfmt.toml b/rustfmt.toml
index 064b0750..2ee121bd 100644
--- a/rustfmt.toml
+++ b/rustfmt.toml
@@ -1,3 +1,11 @@
 unstable_features = true # for "ignore", remove asap
-ignore = ["benches/bench_rustfft.rs", "benches/bench_rustfft_sse.rs", "src/sse/sse_prime_butterflies.rs"]
+ignore = [
+    "benches/bench_rustfft.rs",
+    "benches/bench_rustfft_sse.rs",
+    "benches/bench_rustfft_neon.rs",
+    "src/sse/sse_prime_butterflies.rs",
+    "src/neon/neon_prime_butterflies.rs",
+    "benches/bench_compare_scalar_neon.rs",
+    ]
+
diff --git a/src/lib.rs b/src/lib.rs
index 66a6d373..3f920ad3 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,4 +1,12 @@
 #![cfg_attr(all(feature = "bench", test), feature(test))]
+#![cfg_attr(
+    all(feature = "neon-nightly", target_arch = "aarch64"),
+    feature(aarch64_target_feature)
+)]
+#![cfg_attr(
+    all(feature = "neon-nightly", target_arch = "aarch64"),
+    feature(stdsimd)
+)]
 
 //! RustFFT is a high-performance FFT library written in pure Rust.
 //!
@@ -9,6 +17,8 @@
 //! For machines that do not have AVX, RustFFT also supports the SSE4.1 instruction set.
 //! As for AVX, this is enabled automatically when using the FftPlanner.
 //!
+//! Additionally, there is (opt-in, nightly-only) support for the Neon instruction set on AArch64.
+//!
 //! ### Usage
 //!
 //! The recommended way to use RustFFT is to create a [`FftPlanner`](crate::FftPlanner) instance and then call its
@@ -45,7 +55,7 @@
 //! advanced users may have better insight than the planner into which algorithms are best for a specific size. See the
 //! [`algorithm`](crate::algorithm) module for a complete list of scalar algorithms implemented by RustFFT.
 //!
-//! Users should beware, however, that bypassing the planner will disable all AVX and SSE optimizations.
+//! Users should beware, however, that bypassing the planner will disable all AVX, SSE and Neon optimizations.
 //!
 //! ### Feature Flags
 //!
@@ -62,6 +72,11 @@
 //! supported and its feature flag is enabled, RustFFT will use AVX instead of SSE4.1.
 //!
 //! On every platform besides x86_64, this feature does nothing, and RustFFT will behave like it's not set.
+//! * `neon-nightly` (Experimental, disabled by default)
+//!
+//! On AArch64 (64-bit ARM) the `neon-nightly` feature enables compilation of Neon-accelerated code. Enabling it improves
+//! performance, while disabling it reduces compile time and binary size.
+//! Note that Rust's Neon support is very new, so the `neon-nightly` feature requires a nightly compiler.
 //!
 //! ### Normalization
 //!
@@ -264,7 +279,7 @@ mod avx {
     /// let fft = planner.plan_fft_forward(1234);
     ///
     /// let mut buffer = vec![Complex{ re: 0.0f32, im: 0.0f32 }; 1234];
-    /// fft.process_inplace(&mut buffer);
+    /// fft.process(&mut buffer);
     ///
     /// // The FFT instance returned by the planner has the type `Arc<dyn Fft<T>>`,
     /// // where T is the numeric type, ie f32 or f64, so it's cheap to clone
     ///
@@ -395,5 +410,84 @@ mod sse {
 
 pub use self::sse::sse_planner::FftPlannerSse;
 
+// Algorithms implemented to use Neon instructions. Only compiled on AArch64, and only compiled if the "neon-nightly" feature flag is set.
+#[cfg(all(target_arch = "aarch64", feature = "neon-nightly"))]
+mod neon;
+
+// If we're not on AArch64, or if the "neon-nightly" feature was disabled, keep a stub implementation around that has the same API, but does nothing
+// That way, users can write code using the Neon planner and compile it on any platform
+#[cfg(not(all(target_arch = "aarch64", feature = "neon-nightly")))]
+mod neon {
+    pub mod neon_planner {
+        use crate::{Fft, FftDirection, FftNum};
+        use std::sync::Arc;
+
+        /// The Neon FFT planner creates new FFT algorithm instances using a mix of scalar and Neon accelerated algorithms.
+        /// It is supported when using the 64-bit AArch64 instruction set.
+        ///
+        /// RustFFT has several FFT algorithms available. For a given FFT size, the `FftPlannerNeon` decides which of the
+        /// available FFT algorithms to use and then initializes them.
+        ///
+        /// ~~~
+        /// // Perform a forward FFT of size 1234
+        /// use std::sync::Arc;
+        /// use rustfft::{FftPlannerNeon, num_complex::Complex};
+        ///
+        /// if let Ok(mut planner) = FftPlannerNeon::new() {
+        ///     let fft = planner.plan_fft_forward(1234);
+        ///
+        ///     let mut buffer = vec![Complex{ re: 0.0f32, im: 0.0f32 }; 1234];
+        ///     fft.process(&mut buffer);
+        ///
+        ///     // The FFT instance returned by the planner has the type `Arc<dyn Fft<T>>`,
+        ///     // where T is the numeric type, ie f32 or f64, so it's cheap to clone
+        ///     let fft_clone = Arc::clone(&fft);
+        /// }
+        /// ~~~
+        ///
+        /// If you plan on creating multiple FFT instances, it is recommended to reuse the same planner for all of them.
This + /// is because the planner re-uses internal data across FFT instances wherever possible, saving memory and reducing + /// setup time. (FFT instances created with one planner will never re-use data and buffers with FFT instances created + /// by a different planner) + /// + /// Each FFT instance owns [`Arc`s](std::sync::Arc) to its internal data, rather than borrowing it from the planner, so it's perfectly + /// safe to drop the planner after creating Fft instances. + pub struct FftPlannerNeon { + _phantom: std::marker::PhantomData, + } + impl FftPlannerNeon { + /// Creates a new `FftPlannerNeon` instance. + /// + /// Returns `Ok(planner_instance)` if this machine has the required instruction sets. + /// Returns `Err(())` if some instruction sets are missing. + pub fn new() -> Result { + Err(()) + } + /// Returns a `Fft` instance which uses Neon instructions to compute FFTs of size `len`. + /// + /// If the provided `direction` is `FftDirection::Forward`, the returned instance will compute forward FFTs. If it's `FftDirection::Inverse`, it will compute inverse FFTs. + /// + /// If this is called multiple times, the planner will attempt to re-use internal data between calls, reducing memory usage and FFT initialization time. + pub fn plan_fft(&mut self, _len: usize, _direction: FftDirection) -> Arc> { + unreachable!() + } + /// Returns a `Fft` instance which uses Neon instructions to compute forward FFTs of size `len`. + /// + /// If this is called multiple times, the planner will attempt to re-use internal data between calls, reducing memory usage and FFT initialization time. + pub fn plan_fft_forward(&mut self, _len: usize) -> Arc> { + unreachable!() + } + /// Returns a `Fft` instance which uses Neon instructions to compute inverse FFTs of size `len. + /// + /// If this is called multiple times, the planner will attempt to re-use internal data between calls, reducing memory usage and FFT initialization time. + pub fn plan_fft_inverse(&mut self, _len: usize) -> Arc> { + unreachable!() + } + } + } +} + +pub use self::neon::neon_planner::FftPlannerNeon; + #[cfg(test)] mod test_utils; diff --git a/src/neon/mod.rs b/src/neon/mod.rs new file mode 100644 index 00000000..2c90de43 --- /dev/null +++ b/src/neon/mod.rs @@ -0,0 +1,17 @@ +#[macro_use] +mod neon_common; +#[macro_use] +mod neon_vector; + +#[macro_use] +pub mod neon_butterflies; +pub mod neon_prime_butterflies; +pub mod neon_radix4; + +mod neon_utils; + +pub mod neon_planner; + +pub use self::neon_butterflies::*; +pub use self::neon_prime_butterflies::*; +pub use self::neon_radix4::*; diff --git a/src/neon/neon_butterflies.rs b/src/neon/neon_butterflies.rs new file mode 100644 index 00000000..d4fc18d6 --- /dev/null +++ b/src/neon/neon_butterflies.rs @@ -0,0 +1,3444 @@ +use core::arch::aarch64::*; +use num_complex::Complex; + +use crate::{common::FftNum, FftDirection}; + +use crate::array_utils; +use crate::array_utils::{RawSlice, RawSliceMut}; +use crate::common::{fft_error_inplace, fft_error_outofplace}; +use crate::twiddles; +use crate::{Direction, Fft, Length}; + +use super::neon_common::{assert_f32, assert_f64}; +use super::neon_utils::*; +use super::neon_vector::{NeonArray, NeonArrayMut}; + +#[allow(unused)] +macro_rules! 
boilerplate_fft_neon_f32_butterfly { + ($struct_name:ident, $len:expr, $direction_fn:expr) => { + impl $struct_name { + //#[target_feature(enable = "neon")] + //#[inline(always)] + pub(crate) unsafe fn perform_fft_butterfly(&self, buffer: &mut [Complex]) { + self.perform_fft_contiguous( + RawSlice::new_transmuted(buffer), + RawSliceMut::new_transmuted(buffer), + ); + } + + //#[target_feature(enable = "neon")] + //#[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_butterfly(&self, buffer: &mut [Complex]) { + self.perform_parallel_fft_contiguous( + RawSlice::new_transmuted(buffer), + RawSliceMut::new_transmuted(buffer), + ); + } + + // Do multiple ffts over a longer vector inplace, called from "process_with_scratch" of Fft trait + //#[target_feature(enable = "neon")] + pub(crate) unsafe fn perform_fft_butterfly_multi( + &self, + buffer: &mut [Complex], + ) -> Result<(), ()> { + let len = buffer.len(); + let alldone = array_utils::iter_chunks(buffer, 2 * self.len(), |chunk| { + self.perform_parallel_fft_butterfly(chunk) + }); + if alldone.is_err() && buffer.len() >= self.len() { + self.perform_fft_butterfly(&mut buffer[len - self.len()..]); + } + Ok(()) + } + + // Do multiple ffts over a longer vector outofplace, called from "process_outofplace_with_scratch" of Fft trait + //#[target_feature(enable = "neon")] + pub(crate) unsafe fn perform_oop_fft_butterfly_multi( + &self, + input: &mut [Complex], + output: &mut [Complex], + ) -> Result<(), ()> { + let len = input.len(); + let alldone = array_utils::iter_chunks_zipped( + input, + output, + 2 * self.len(), + |in_chunk, out_chunk| { + self.perform_parallel_fft_contiguous( + RawSlice::new_transmuted(in_chunk), + RawSliceMut::new_transmuted(out_chunk), + ) + }, + ); + if alldone.is_err() && input.len() >= self.len() { + self.perform_fft_contiguous( + RawSlice::new_transmuted(&input[len - self.len()..]), + RawSliceMut::new_transmuted(&mut output[len - self.len()..]), + ); + } + Ok(()) + } + } + }; +} + +macro_rules! boilerplate_fft_neon_f64_butterfly { + ($struct_name:ident, $len:expr, $direction_fn:expr) => { + impl $struct_name { + // Do a single fft + //#[target_feature(enable = "neon")] + pub(crate) unsafe fn perform_fft_butterfly(&self, buffer: &mut [Complex]) { + self.perform_fft_contiguous( + RawSlice::new_transmuted(buffer), + RawSliceMut::new_transmuted(buffer), + ); + } + + // Do multiple ffts over a longer vector inplace, called from "process_with_scratch" of Fft trait + //#[target_feature(enable = "neon")] + pub(crate) unsafe fn perform_fft_butterfly_multi( + &self, + buffer: &mut [Complex], + ) -> Result<(), ()> { + array_utils::iter_chunks(buffer, self.len(), |chunk| { + self.perform_fft_butterfly(chunk) + }) + } + + // Do multiple ffts over a longer vector outofplace, called from "process_outofplace_with_scratch" of Fft trait + //#[target_feature(enable = "neon")] + pub(crate) unsafe fn perform_oop_fft_butterfly_multi( + &self, + input: &mut [Complex], + output: &mut [Complex], + ) -> Result<(), ()> { + array_utils::iter_chunks_zipped(input, output, self.len(), |in_chunk, out_chunk| { + self.perform_fft_contiguous( + RawSlice::new_transmuted(in_chunk), + RawSliceMut::new_transmuted(out_chunk), + ) + }) + } + } + }; +} + +#[allow(unused)] +macro_rules! 
boilerplate_fft_neon_common_butterfly { + ($struct_name:ident, $len:expr, $direction_fn:expr) => { + impl Fft for $struct_name { + fn process_outofplace_with_scratch( + &self, + input: &mut [Complex], + output: &mut [Complex], + _scratch: &mut [Complex], + ) { + if input.len() < self.len() || output.len() != input.len() { + // We want to trigger a panic, but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0); + return; // Unreachable, because fft_error_outofplace asserts, but it helps codegen to put it here + } + let result = unsafe { self.perform_oop_fft_butterfly_multi(input, output) }; + + if result.is_err() { + // We want to trigger a panic, because the buffer sizes weren't cleanly divisible by the FFT size, + // but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0); + } + } + fn process_with_scratch(&self, buffer: &mut [Complex], _scratch: &mut [Complex]) { + if buffer.len() < self.len() { + // We want to trigger a panic, but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_inplace(self.len(), buffer.len(), 0, 0); + return; // Unreachable, because fft_error_inplace asserts, but it helps codegen to put it here + } + + let result = unsafe { self.perform_fft_butterfly_multi(buffer) }; + + if result.is_err() { + // We want to trigger a panic, because the buffer sizes weren't cleanly divisible by the FFT size, + // but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_inplace(self.len(), buffer.len(), 0, 0); + } + } + #[inline(always)] + fn get_inplace_scratch_len(&self) -> usize { + 0 + } + #[inline(always)] + fn get_outofplace_scratch_len(&self) -> usize { + 0 + } + } + impl Length for $struct_name { + #[inline(always)] + fn len(&self) -> usize { + $len + } + } + impl Direction for $struct_name { + #[inline(always)] + fn fft_direction(&self) -> FftDirection { + $direction_fn(self) + } + } + }; +} + +// _ _________ _ _ _ +// / | |___ /___ \| |__ (_) |_ +// | | _____ |_ \ __) | '_ \| | __| +// | | |_____| ___) / __/| |_) | | |_ +// |_| |____/_____|_.__/|_|\__| +// + +pub struct NeonF32Butterfly1 { + direction: FftDirection, + _phantom: std::marker::PhantomData, +} + +boilerplate_fft_neon_f32_butterfly!(NeonF32Butterfly1, 1, |this: &NeonF32Butterfly1<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF32Butterfly1, 1, |this: &NeonF32Butterfly1<_>| this + .direction); +impl NeonF32Butterfly1 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f32::(); + Self { + direction, + _phantom: std::marker::PhantomData, + } + } + #[inline(always)] + pub(crate) unsafe fn perform_fft_contiguous( + &self, + _input: RawSlice>, + _output: RawSliceMut>, + ) { + } + + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_contiguous( + &self, + _input: RawSlice>, + _output: RawSliceMut>, + ) { + } +} + +// _ __ _ _ _ _ _ +// / | / /_ | || | | |__ (_) |_ +// | | _____ | '_ \| || |_| '_ \| | __| +// | | |_____| | (_) |__ _| |_) | | |_ +// |_| \___/ |_| |_.__/|_|\__| +// + +pub struct NeonF64Butterfly1 { + direction: FftDirection, + _phantom: 
std::marker::PhantomData, +} + +boilerplate_fft_neon_f64_butterfly!(NeonF64Butterfly1, 1, |this: &NeonF64Butterfly1<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF64Butterfly1, 1, |this: &NeonF64Butterfly1<_>| this + .direction); +impl NeonF64Butterfly1 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f64::(); + Self { + direction, + _phantom: std::marker::PhantomData, + } + } + #[inline(always)] + pub(crate) unsafe fn perform_fft_contiguous( + &self, + _input: RawSlice>, + _output: RawSliceMut>, + ) { + } +} + +// ____ _________ _ _ _ +// |___ \ |___ /___ \| |__ (_) |_ +// __) | _____ |_ \ __) | '_ \| | __| +// / __/ |_____| ___) / __/| |_) | | |_ +// |_____| |____/_____|_.__/|_|\__| +// + +pub struct NeonF32Butterfly2 { + direction: FftDirection, + _phantom: std::marker::PhantomData, +} + +boilerplate_fft_neon_f32_butterfly!(NeonF32Butterfly2, 2, |this: &NeonF32Butterfly2<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF32Butterfly2, 2, |this: &NeonF32Butterfly2<_>| this + .direction); +impl NeonF32Butterfly2 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f32::(); + Self { + direction, + _phantom: std::marker::PhantomData, + } + } + #[inline(always)] + pub(crate) unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let values = input.load_complex(0); + + let temp = self.perform_fft_direct(values); + + output.store_complex(temp, 0); + } + + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let values_a = input.load_complex(0); + let values_b = input.load_complex(2); + + let out = self.perform_parallel_fft_direct(values_a, values_b); + + let [out02, out13] = transpose_complex_2x2_f32(out[0], out[1]); + + output.store_complex(out02, 0); + output.store_complex(out13, 2); + } + + // length 2 fft of x, given as [x0, x1] + // result is [X0, X1] + #[inline(always)] + pub(crate) unsafe fn perform_fft_direct(&self, values: float32x4_t) -> float32x4_t { + solo_fft2_f32(values) + } + + // dual length 2 fft of x and y, given as [x0, x1], [y0, y1] + // result is [X0, Y0], [X1, Y1] + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_direct( + &self, + values_x: float32x4_t, + values_y: float32x4_t, + ) -> [float32x4_t; 2] { + parallel_fft2_contiguous_f32(values_x, values_y) + } +} + +// double lenth 2 fft of a and b, given as [x0, y0], [x1, y1] +// result is [X0, Y0], [X1, Y1] +#[inline(always)] +pub(crate) unsafe fn parallel_fft2_interleaved_f32( + val02: float32x4_t, + val13: float32x4_t, +) -> [float32x4_t; 2] { + let temp0 = vaddq_f32(val02, val13); + let temp1 = vsubq_f32(val02, val13); + [temp0, temp1] +} + +// double lenth 2 fft of a and b, given as [x0, x1], [y0, y1] +// result is [X0, Y0], [X1, Y1] +#[inline(always)] +unsafe fn parallel_fft2_contiguous_f32(left: float32x4_t, right: float32x4_t) -> [float32x4_t; 2] { + let [temp02, temp13] = transpose_complex_2x2_f32(left, right); + parallel_fft2_interleaved_f32(temp02, temp13) +} + +// length 2 fft of x, given as [x0, x1] +// result is [X0, X1] +#[inline(always)] +unsafe fn solo_fft2_f32(values: float32x4_t) -> float32x4_t { + let high = vget_high_f32(values); + let low = vget_low_f32(values); + vcombine_f32(vadd_f32(low, high), vsub_f32(low, high)) +} + +// ____ __ _ _ _ _ _ +// |___ \ / /_ | || | | |__ (_) |_ +// __) | _____ | '_ \| || |_| '_ \| | __| +// / __/ |_____| | (_) |__ _| |_) | | |_ +// 
|_____| \___/ |_| |_.__/|_|\__| +// + +pub struct NeonF64Butterfly2 { + direction: FftDirection, + _phantom: std::marker::PhantomData, +} + +boilerplate_fft_neon_f64_butterfly!(NeonF64Butterfly2, 2, |this: &NeonF64Butterfly2<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF64Butterfly2, 2, |this: &NeonF64Butterfly2<_>| this + .direction); +impl NeonF64Butterfly2 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f64::(); + Self { + direction, + _phantom: std::marker::PhantomData, + } + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let value0 = input.load_complex(0); + let value1 = input.load_complex(1); + + let out = self.perform_fft_direct(value0, value1); + + output.store_complex(out[0], 0); + output.store_complex(out[1], 1); + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_direct( + &self, + value0: float64x2_t, + value1: float64x2_t, + ) -> [float64x2_t; 2] { + solo_fft2_f64(value0, value1) + } +} + +#[inline(always)] +pub(crate) unsafe fn solo_fft2_f64(left: float64x2_t, right: float64x2_t) -> [float64x2_t; 2] { + let temp0 = vaddq_f64(left, right); + let temp1 = vsubq_f64(left, right); + [temp0, temp1] +} + +// _____ _________ _ _ _ +// |___ / |___ /___ \| |__ (_) |_ +// |_ \ _____ |_ \ __) | '_ \| | __| +// ___) | |_____| ___) / __/| |_) | | |_ +// |____/ |____/_____|_.__/|_|\__| +// + +pub struct NeonF32Butterfly3 { + direction: FftDirection, + _phantom: std::marker::PhantomData, + rotate: Rotate90F32, + twiddle: float32x4_t, + twiddle1re: float32x4_t, + twiddle1im: float32x4_t, +} + +boilerplate_fft_neon_f32_butterfly!(NeonF32Butterfly3, 3, |this: &NeonF32Butterfly3<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF32Butterfly3, 3, |this: &NeonF32Butterfly3<_>| this + .direction); +impl NeonF32Butterfly3 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f32::(); + let rotate = Rotate90F32::new(true); + let tw1: Complex = twiddles::compute_twiddle(1, 3, direction); + let twiddle = unsafe { vld1q_f32([tw1.re, tw1.re, -tw1.im, -tw1.im].as_ptr()) }; + let twiddle1re = unsafe { vmovq_n_f32(tw1.re) }; + let twiddle1im = unsafe { vmovq_n_f32(tw1.im) }; + Self { + direction, + _phantom: std::marker::PhantomData, + rotate, + twiddle, + twiddle1re, + twiddle1im, + } + } + #[inline(always)] + pub(crate) unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let value0x = input.load_partial1_complex(0); + let value12 = input.load_complex(1); + + let out = self.perform_fft_direct(value0x, value12); + + output.store_partial_lo_complex(out[0], 0); + output.store_complex(out[1], 1); + } + + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let valuea0a1 = input.load_complex(0); + let valuea2b0 = input.load_complex(2); + let valueb1b2 = input.load_complex(4); + + let value0 = extract_lo_hi_f32(valuea0a1, valuea2b0); + let value1 = extract_hi_lo_f32(valuea0a1, valueb1b2); + let value2 = extract_lo_hi_f32(valuea2b0, valueb1b2); + + let out = self.perform_parallel_fft_direct(value0, value1, value2); + + let out0 = extract_lo_lo_f32(out[0], out[1]); + let out1 = extract_lo_hi_f32(out[2], out[0]); + let out2 = extract_hi_hi_f32(out[1], out[2]); + + output.store_complex(out0, 0); + output.store_complex(out1, 2); + output.store_complex(out2, 4); + } + + // length 3 fft of a, given as [x0, 
0.0], [x1, x2] + // result is [X0, Z], [X1, X2] + // The value Z should be discarded. + #[inline(always)] + pub(crate) unsafe fn perform_fft_direct( + &self, + value0x: float32x4_t, + value12: float32x4_t, + ) -> [float32x4_t; 2] { + // This is a Neon translation of the scalar 3-point butterfly + let rev12 = reverse_complex_and_negate_hi_f32(value12); + let temp12pn = self.rotate.rotate_hi(vaddq_f32(value12, rev12)); + let twiddled = vmulq_f32(temp12pn, self.twiddle); + let temp = vaddq_f32(value0x, twiddled); + + let out12 = solo_fft2_f32(temp); + let out0x = vaddq_f32(value0x, temp12pn); + [out0x, out12] + } + + // length 3 dual fft of a, given as (x0, y0), (x1, y1), (x2, y2). + // result is [(X0, Y0), (X1, Y1), (X2, Y2)] + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_direct( + &self, + value0: float32x4_t, + value1: float32x4_t, + value2: float32x4_t, + ) -> [float32x4_t; 3] { + // This is a Neon translation of the scalar 3-point butterfly + let x12p = vaddq_f32(value1, value2); + let x12n = vsubq_f32(value1, value2); + let sum = vaddq_f32(value0, x12p); + + let temp_a = vmulq_f32(self.twiddle1re, x12p); + let temp_a = vaddq_f32(temp_a, value0); + + let n_rot = self.rotate.rotate_both(x12n); + let temp_b = vmulq_f32(self.twiddle1im, n_rot); + + let x1 = vaddq_f32(temp_a, temp_b); + let x2 = vsubq_f32(temp_a, temp_b); + [sum, x1, x2] + } +} + +// _____ __ _ _ _ _ _ +// |___ / / /_ | || | | |__ (_) |_ +// |_ \ _____ | '_ \| || |_| '_ \| | __| +// ___) | |_____| | (_) |__ _| |_) | | |_ +// |____/ \___/ |_| |_.__/|_|\__| +// + +pub struct NeonF64Butterfly3 { + direction: FftDirection, + _phantom: std::marker::PhantomData, + rotate: Rotate90F64, + twiddle1re: float64x2_t, + twiddle1im: float64x2_t, +} + +boilerplate_fft_neon_f64_butterfly!(NeonF64Butterfly3, 3, |this: &NeonF64Butterfly3<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF64Butterfly3, 3, |this: &NeonF64Butterfly3<_>| this + .direction); +impl NeonF64Butterfly3 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f64::(); + let rotate = Rotate90F64::new(true); + let tw1: Complex = twiddles::compute_twiddle(1, 3, direction); + let twiddle1re = unsafe { vmovq_n_f64(tw1.re) }; + let twiddle1im = unsafe { vmovq_n_f64(tw1.im) }; + + Self { + direction, + _phantom: std::marker::PhantomData, + rotate, + twiddle1re, + twiddle1im, + } + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let value0 = input.load_complex(0); + let value1 = input.load_complex(1); + let value2 = input.load_complex(2); + + let out = self.perform_fft_direct(value0, value1, value2); + + output.store_complex(out[0], 0); + output.store_complex(out[1], 1); + output.store_complex(out[2], 2); + } + + // length 3 fft of x, given as x0, x1, x2. 
+ // result is [X0, X1, X2] + #[inline(always)] + pub(crate) unsafe fn perform_fft_direct( + &self, + value0: float64x2_t, + value1: float64x2_t, + value2: float64x2_t, + ) -> [float64x2_t; 3] { + // This is a Neon translation of the scalar 3-point butterfly + let x12p = vaddq_f64(value1, value2); + let x12n = vsubq_f64(value1, value2); + let sum = vaddq_f64(value0, x12p); + + let temp_a = vfmaq_f64(value0, self.twiddle1re, x12p); + + let n_rot = self.rotate.rotate(x12n); + let temp_b = vmulq_f64(self.twiddle1im, n_rot); + + let x1 = vaddq_f64(temp_a, temp_b); + let x2 = vsubq_f64(temp_a, temp_b); + [sum, x1, x2] + } +} + +// _ _ _________ _ _ _ +// | || | |___ /___ \| |__ (_) |_ +// | || |_ _____ |_ \ __) | '_ \| | __| +// |__ _| |_____| ___) / __/| |_) | | |_ +// |_| |____/_____|_.__/|_|\__| +// + +pub struct NeonF32Butterfly4 { + direction: FftDirection, + _phantom: std::marker::PhantomData, + rotate: Rotate90F32, +} + +boilerplate_fft_neon_f32_butterfly!(NeonF32Butterfly4, 4, |this: &NeonF32Butterfly4<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF32Butterfly4, 4, |this: &NeonF32Butterfly4<_>| this + .direction); +impl NeonF32Butterfly4 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f32::(); + let rotate = if direction == FftDirection::Inverse { + Rotate90F32::new(true) + } else { + Rotate90F32::new(false) + }; + Self { + direction, + _phantom: std::marker::PhantomData, + rotate, + } + } + #[inline(always)] + pub(crate) unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let value01 = input.load_complex(0); + let value23 = input.load_complex(2); + + let out = self.perform_fft_direct(value01, value23); + + output.store_complex(out[0], 0); + output.store_complex(out[1], 2); + } + + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let value01a = input.load_complex(0); + let value23a = input.load_complex(2); + let value01b = input.load_complex(4); + let value23b = input.load_complex(6); + + let [value0ab, value1ab] = transpose_complex_2x2_f32(value01a, value01b); + let [value2ab, value3ab] = transpose_complex_2x2_f32(value23a, value23b); + + let out = self.perform_parallel_fft_direct(value0ab, value1ab, value2ab, value3ab); + + let [out0, out1] = transpose_complex_2x2_f32(out[0], out[1]); + let [out2, out3] = transpose_complex_2x2_f32(out[2], out[3]); + + output.store_complex(out0, 0); + output.store_complex(out1, 4); + output.store_complex(out2, 2); + output.store_complex(out3, 6); + } + + // length 4 fft of a, given as [x0, x1], [x2, x3] + // result is [[X0, X1], [X2, X3]] + #[inline(always)] + pub(crate) unsafe fn perform_fft_direct( + &self, + value01: float32x4_t, + value23: float32x4_t, + ) -> [float32x4_t; 2] { + //we're going to hardcode a step of mixed radix + //aka we're going to do the six step algorithm + + // step 1: transpose + // and + // step 2: column FFTs + let mut temp = parallel_fft2_interleaved_f32(value01, value23); + + // step 3: apply twiddle factors (only one in this case, and it's either 0 + i or 0 - i) + temp[1] = self.rotate.rotate_hi(temp[1]); + + // step 4: transpose, which we're skipping because we're the previous FFTs were non-contiguous + + // step 5: row FFTs + // and + // step 6: transpose by swapping index 1 and 2 + parallel_fft2_contiguous_f32(temp[0], temp[1]) + } + + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_direct( + &self, + values0: float32x4_t, + 
values1: float32x4_t, + values2: float32x4_t, + values3: float32x4_t, + ) -> [float32x4_t; 4] { + //we're going to hardcode a step of mixed radix + //aka we're going to do the six step algorithm + + // step 1: transpose + // and + // step 2: column FFTs + let temp0 = parallel_fft2_interleaved_f32(values0, values2); + let mut temp1 = parallel_fft2_interleaved_f32(values1, values3); + + // step 3: apply twiddle factors (only one in this case, and it's either 0 + i or 0 - i) + temp1[1] = self.rotate.rotate_both(temp1[1]); + + // step 4: transpose, which we're skipping because we're the previous FFTs were non-contiguous + + // step 5: row FFTs + let out0 = parallel_fft2_interleaved_f32(temp0[0], temp1[0]); + let out2 = parallel_fft2_interleaved_f32(temp0[1], temp1[1]); + + // step 6: transpose by swapping index 1 and 2 + [out0[0], out2[0], out0[1], out2[1]] + } +} + +// _ _ __ _ _ _ _ _ +// | || | / /_ | || | | |__ (_) |_ +// | || |_ _____ | '_ \| || |_| '_ \| | __| +// |__ _| |_____| | (_) |__ _| |_) | | |_ +// |_| \___/ |_| |_.__/|_|\__| +// + +pub struct NeonF64Butterfly4 { + direction: FftDirection, + _phantom: std::marker::PhantomData, + rotate: Rotate90F64, +} + +boilerplate_fft_neon_f64_butterfly!(NeonF64Butterfly4, 4, |this: &NeonF64Butterfly4<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF64Butterfly4, 4, |this: &NeonF64Butterfly4<_>| this + .direction); +impl NeonF64Butterfly4 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f64::(); + let rotate = if direction == FftDirection::Inverse { + Rotate90F64::new(true) + } else { + Rotate90F64::new(false) + }; + + Self { + direction, + _phantom: std::marker::PhantomData, + rotate, + } + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let value0 = input.load_complex(0); + let value1 = input.load_complex(1); + let value2 = input.load_complex(2); + let value3 = input.load_complex(3); + + let out = self.perform_fft_direct(value0, value1, value2, value3); + + output.store_complex(out[0], 0); + output.store_complex(out[1], 1); + output.store_complex(out[2], 2); + output.store_complex(out[3], 3); + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_direct( + &self, + value0: float64x2_t, + value1: float64x2_t, + value2: float64x2_t, + value3: float64x2_t, + ) -> [float64x2_t; 4] { + //we're going to hardcode a step of mixed radix + //aka we're going to do the six step algorithm + + // step 1: transpose + // and + // step 2: column FFTs + let temp0 = solo_fft2_f64(value0, value2); + let mut temp1 = solo_fft2_f64(value1, value3); + + // step 3: apply twiddle factors (only one in this case, and it's either 0 + i or 0 - i) + temp1[1] = self.rotate.rotate(temp1[1]); + + // step 4: transpose, which we're skipping because we're the previous FFTs were non-contiguous + + // step 5: row FFTs + let out0 = solo_fft2_f64(temp0[0], temp1[0]); + let out2 = solo_fft2_f64(temp0[1], temp1[1]); + + // step 6: transpose by swapping index 1 and 2 + [out0[0], out2[0], out0[1], out2[1]] + } +} + +// ____ _________ _ _ _ +// | ___| |___ /___ \| |__ (_) |_ +// |___ \ _____ |_ \ __) | '_ \| | __| +// ___) | |_____| ___) / __/| |_) | | |_ +// |____/ |____/_____|_.__/|_|\__| +// + +pub struct NeonF32Butterfly5 { + direction: FftDirection, + _phantom: std::marker::PhantomData, + rotate: Rotate90F32, + twiddle12re: float32x4_t, + twiddle21re: float32x4_t, + twiddle12im: float32x4_t, + twiddle21im: float32x4_t, + twiddle1re: 
float32x4_t, + twiddle1im: float32x4_t, + twiddle2re: float32x4_t, + twiddle2im: float32x4_t, +} + +boilerplate_fft_neon_f32_butterfly!(NeonF32Butterfly5, 5, |this: &NeonF32Butterfly5<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF32Butterfly5, 5, |this: &NeonF32Butterfly5<_>| this + .direction); +impl NeonF32Butterfly5 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f32::(); + let rotate = Rotate90F32::new(true); + let tw1: Complex = twiddles::compute_twiddle(1, 5, direction); + let tw2: Complex = twiddles::compute_twiddle(2, 5, direction); + let twiddle12re = unsafe { vld1q_f32([tw1.re, tw1.re, tw2.re, tw2.re].as_ptr()) }; + let twiddle21re = unsafe { vld1q_f32([tw2.re, tw2.re, tw1.re, tw1.re].as_ptr()) }; + let twiddle12im = unsafe { vld1q_f32([tw1.im, tw1.im, tw2.im, tw2.im].as_ptr()) }; + let twiddle21im = unsafe { vld1q_f32([tw2.im, tw2.im, -tw1.im, -tw1.im].as_ptr()) }; + let twiddle1re = unsafe { vmovq_n_f32(tw1.re) }; + let twiddle1im = unsafe { vmovq_n_f32(tw1.im) }; + let twiddle2re = unsafe { vmovq_n_f32(tw2.re) }; + let twiddle2im = unsafe { vmovq_n_f32(tw2.im) }; + + Self { + direction, + _phantom: std::marker::PhantomData, + rotate, + twiddle12re, + twiddle21re, + twiddle12im, + twiddle21im, + twiddle1re, + twiddle1im, + twiddle2re, + twiddle2im, + } + } + #[inline(always)] + pub(crate) unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let value00 = input.load1_complex(0); + let value12 = input.load_complex(1); + let value34 = input.load_complex(3); + + let out = self.perform_fft_direct(value00, value12, value34); + + output.store_partial_lo_complex(out[0], 0); + output.store_complex(out[1], 1); + output.store_complex(out[2], 3); + } + + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let input_packed = read_complex_to_array!(input, {0, 2, 4 ,6, 8}); + + let value0 = extract_lo_hi_f32(input_packed[0], input_packed[2]); + let value1 = extract_hi_lo_f32(input_packed[0], input_packed[3]); + let value2 = extract_lo_hi_f32(input_packed[1], input_packed[3]); + let value3 = extract_hi_lo_f32(input_packed[1], input_packed[4]); + let value4 = extract_lo_hi_f32(input_packed[2], input_packed[4]); + + let out = self.perform_parallel_fft_direct(value0, value1, value2, value3, value4); + + let out_packed = [ + extract_lo_lo_f32(out[0], out[1]), + extract_lo_lo_f32(out[2], out[3]), + extract_lo_hi_f32(out[4], out[0]), + extract_hi_hi_f32(out[1], out[2]), + extract_hi_hi_f32(out[3], out[4]), + ]; + + write_complex_to_array_strided!(out_packed, output, 2, {0, 1, 2, 3, 4}); + } + + // length 5 fft of a, given as [x0, x0], [x1, x2], [x3, x4]. + // result is [[X0, Z], [X1, X2], [X3, X4]] + // Note that Z should not be used. 
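+    // For reference, the scalar flow of this packed 5-point butterfly (it mirrors the f64
+    // version in NeonF64Butterfly5 below; w1 = compute_twiddle(1, 5, direction),
+    // w2 = compute_twiddle(2, 5, direction), and "rotate" is the 90-degree rotation set up in new()):
+    //   x14p = x1 + x4;  x14n = x1 - x4;  x23p = x2 + x3;  x23n = x2 - x3
+    //   X0 = x0 + x14p + x23p
+    //   a1 = x0 + w1.re * x14p + w2.re * x23p;   b1 = w1.im * x14n + w2.im * x23n
+    //   a2 = x0 + w2.re * x14p + w1.re * x23p;   b2 = w2.im * x14n - w1.im * x23n
+    //   X1 = a1 + rotate(b1);  X4 = a1 - rotate(b1)
+    //   X2 = a2 + rotate(b2);  X3 = a2 - rotate(b2)
+    // The twiddle12*/twiddle21* vectors above hold these w1/w2 constants pre-packed (with the
+    // sign of the second b term folded into twiddle21im), so that (X1, X2) and (X3, X4) come
+    // out two lanes at a time.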
+ #[inline(always)] + pub(crate) unsafe fn perform_fft_direct( + &self, + value00: float32x4_t, + value12: float32x4_t, + value34: float32x4_t, + ) -> [float32x4_t; 3] { + // This is a Neon translation of the scalar 5-point butterfly + let temp43 = reverse_complex_elements_f32(value34); + let x1423p = vaddq_f32(value12, temp43); + let x1423n = vsubq_f32(value12, temp43); + + let x1414p = duplicate_lo_f32(x1423p); + let x2323p = duplicate_hi_f32(x1423p); + let x1414n = duplicate_lo_f32(x1423n); + let x2323n = duplicate_hi_f32(x1423n); + + let temp_a1 = vmulq_f32(self.twiddle12re, x1414p); + let temp_b1 = vmulq_f32(self.twiddle12im, x1414n); + + let temp_a = vfmaq_f32(temp_a1, self.twiddle21re, x2323p); + let temp_a = vaddq_f32(value00, temp_a); + let temp_b = vfmaq_f32(temp_b1, self.twiddle21im, x2323n); + + let b_rot = self.rotate.rotate_both(temp_b); + + let x00 = vaddq_f32(value00, vaddq_f32(x1414p, x2323p)); + + let x12 = vaddq_f32(temp_a, b_rot); + let x34 = reverse_complex_elements_f32(vsubq_f32(temp_a, b_rot)); + [x00, x12, x34] + } + + // length 5 dual fft of x and y, given as (x0, y0), (x1, y1) ... (x4, y4). + // result is [(X0, Y0), (X1, Y1) ... (X2, Y2)] + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_direct( + &self, + value0: float32x4_t, + value1: float32x4_t, + value2: float32x4_t, + value3: float32x4_t, + value4: float32x4_t, + ) -> [float32x4_t; 5] { + // This is a Neon translation of the scalar 3-point butterfly + let x14p = vaddq_f32(value1, value4); + let x14n = vsubq_f32(value1, value4); + let x23p = vaddq_f32(value2, value3); + let x23n = vsubq_f32(value2, value3); + + let temp_a1_1 = vmulq_f32(self.twiddle1re, x14p); + let temp_a1_2 = vmulq_f32(self.twiddle2re, x23p); + let temp_b1_1 = vmulq_f32(self.twiddle1im, x14n); + let temp_b1_2 = vmulq_f32(self.twiddle2im, x23n); + let temp_a2_1 = vmulq_f32(self.twiddle1re, x23p); + let temp_a2_2 = vmulq_f32(self.twiddle2re, x14p); + let temp_b2_1 = vmulq_f32(self.twiddle2im, x14n); + let temp_b2_2 = vmulq_f32(self.twiddle1im, x23n); + + let temp_a1 = vaddq_f32(value0, vaddq_f32(temp_a1_1, temp_a1_2)); + let temp_b1 = vaddq_f32(temp_b1_1, temp_b1_2); + let temp_a2 = vaddq_f32(value0, vaddq_f32(temp_a2_1, temp_a2_2)); + let temp_b2 = vsubq_f32(temp_b2_1, temp_b2_2); + + [ + vaddq_f32(value0, vaddq_f32(x14p, x23p)), + vaddq_f32(temp_a1, self.rotate.rotate_both(temp_b1)), + vaddq_f32(temp_a2, self.rotate.rotate_both(temp_b2)), + vsubq_f32(temp_a2, self.rotate.rotate_both(temp_b2)), + vsubq_f32(temp_a1, self.rotate.rotate_both(temp_b1)), + ] + } +} + +// ____ __ _ _ _ _ _ +// | ___| / /_ | || | | |__ (_) |_ +// |___ \ _____ | '_ \| || |_| '_ \| | __| +// ___) | |_____| | (_) |__ _| |_) | | |_ +// |____/ \___/ |_| |_.__/|_|\__| +// + +pub struct NeonF64Butterfly5 { + direction: FftDirection, + _phantom: std::marker::PhantomData, + rotate: Rotate90F64, + twiddle1re: float64x2_t, + twiddle1im: float64x2_t, + twiddle2re: float64x2_t, + twiddle2im: float64x2_t, +} + +boilerplate_fft_neon_f64_butterfly!(NeonF64Butterfly5, 5, |this: &NeonF64Butterfly5<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF64Butterfly5, 5, |this: &NeonF64Butterfly5<_>| this + .direction); +impl NeonF64Butterfly5 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f64::(); + let rotate = Rotate90F64::new(true); + let tw1: Complex = twiddles::compute_twiddle(1, 5, direction); + let tw2: Complex = twiddles::compute_twiddle(2, 5, direction); + let twiddle1re = unsafe { vmovq_n_f64(tw1.re) }; + let 
twiddle1im = unsafe { vmovq_n_f64(tw1.im) }; + let twiddle2re = unsafe { vmovq_n_f64(tw2.re) }; + let twiddle2im = unsafe { vmovq_n_f64(tw2.im) }; + + Self { + direction, + _phantom: std::marker::PhantomData, + rotate, + twiddle1re, + twiddle1im, + twiddle2re, + twiddle2im, + } + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let value0 = input.load_complex(0); + let value1 = input.load_complex(1); + let value2 = input.load_complex(2); + let value3 = input.load_complex(3); + let value4 = input.load_complex(4); + + let out = self.perform_fft_direct(value0, value1, value2, value3, value4); + + output.store_complex(out[0], 0); + output.store_complex(out[1], 1); + output.store_complex(out[2], 2); + output.store_complex(out[3], 3); + output.store_complex(out[4], 4); + } + + // length 5 fft of x, given as x0, x1, x2, x3, x4. + // result is [X0, X1, X2, X3, X4] + #[inline(always)] + pub(crate) unsafe fn perform_fft_direct( + &self, + value0: float64x2_t, + value1: float64x2_t, + value2: float64x2_t, + value3: float64x2_t, + value4: float64x2_t, + ) -> [float64x2_t; 5] { + // This is a Neon translation of the scalar 5-point butterfly + let x14p = vaddq_f64(value1, value4); + let x14n = vsubq_f64(value1, value4); + let x23p = vaddq_f64(value2, value3); + let x23n = vsubq_f64(value2, value3); + + let temp_a1_1 = vmulq_f64(self.twiddle1re, x14p); + let temp_a1_2 = vmulq_f64(self.twiddle2re, x23p); + let temp_a2_1 = vmulq_f64(self.twiddle2re, x14p); + let temp_a2_2 = vmulq_f64(self.twiddle1re, x23p); + + let temp_b1_1 = vmulq_f64(self.twiddle1im, x14n); + let temp_b1_2 = vmulq_f64(self.twiddle2im, x23n); + let temp_b2_1 = vmulq_f64(self.twiddle2im, x14n); + let temp_b2_2 = vmulq_f64(self.twiddle1im, x23n); + + let temp_a1 = vaddq_f64(value0, vaddq_f64(temp_a1_1, temp_a1_2)); + let temp_a2 = vaddq_f64(value0, vaddq_f64(temp_a2_1, temp_a2_2)); + + let temp_b1 = vaddq_f64(temp_b1_1, temp_b1_2); + let temp_b2 = vsubq_f64(temp_b2_1, temp_b2_2); + + let temp_b1_rot = self.rotate.rotate(temp_b1); + let temp_b2_rot = self.rotate.rotate(temp_b2); + [ + vaddq_f64(value0, vaddq_f64(x14p, x23p)), + vaddq_f64(temp_a1, temp_b1_rot), + vaddq_f64(temp_a2, temp_b2_rot), + vsubq_f64(temp_a2, temp_b2_rot), + vsubq_f64(temp_a1, temp_b1_rot), + ] + } +} + +// __ _________ _ _ _ +// / /_ |___ /___ \| |__ (_) |_ +// | '_ \ _____ |_ \ __) | '_ \| | __| +// | (_) | |_____| ___) / __/| |_) | | |_ +// \___/ |____/_____|_.__/|_|\__| +// + +pub struct NeonF32Butterfly6 { + direction: FftDirection, + _phantom: std::marker::PhantomData, + bf3: NeonF32Butterfly3, +} + +boilerplate_fft_neon_f32_butterfly!(NeonF32Butterfly6, 6, |this: &NeonF32Butterfly6<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF32Butterfly6, 6, |this: &NeonF32Butterfly6<_>| this + .direction); +impl NeonF32Butterfly6 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f32::(); + let bf3 = NeonF32Butterfly3::new(direction); + + Self { + direction, + _phantom: std::marker::PhantomData, + bf3, + } + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let value01 = input.load_complex(0); + let value23 = input.load_complex(2); + let value45 = input.load_complex(4); + + let out = self.perform_fft_direct(value01, value23, value45); + + output.store_complex(out[0], 0); + output.store_complex(out[1], 2); + output.store_complex(out[2], 4); + } + + 
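+    // The perform_fft_direct / perform_parallel_fft_direct functions below use a 3x2
+    // Good-Thomas (prime-factor) decomposition. Because gcd(3, 2) = 1, the CRT index maps
+    //   input:  n = (2 * n1 + 3 * n2) mod 6,  with n1 in 0..3 and n2 in 0..2
+    //   output: k is the unique index with k = k1 (mod 3) and k = k2 (mod 2)
+    // take the place of the cross-stage twiddle factors that a plain Cooley-Tukey mixed-radix
+    // split would need. Concretely, the two size-3 column FFTs run over [x0, x2, x4] and
+    // [x3, x5, x1], the size-2 row FFTs then yield the pairs (X0, X3), (X4, X1) and (X2, X5),
+    // and the final shuffle puts everything back in natural order.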
#[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let input_packed = read_complex_to_array!(input, {0, 2, 4, 6, 8, 10}); + + let values = interleave_complex_f32!(input_packed, 3, {0, 1, 2}); + + let out = self.perform_parallel_fft_direct( + values[0], values[1], values[2], values[3], values[4], values[5], + ); + + let out_sorted = separate_interleaved_complex_f32!(out, {0, 2, 4}); + write_complex_to_array_strided!(out_sorted, output, 2, {0, 1, 2, 3, 4, 5}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_direct( + &self, + value01: float32x4_t, + value23: float32x4_t, + value45: float32x4_t, + ) -> [float32x4_t; 3] { + // Algorithm: 3x2 good-thomas + + // Size-3 FFTs down the columns of our reordered array + let reord0 = extract_lo_hi_f32(value01, value23); + let reord1 = extract_lo_hi_f32(value23, value45); + let reord2 = extract_lo_hi_f32(value45, value01); + + let mid = self.bf3.perform_parallel_fft_direct(reord0, reord1, reord2); + + // We normally would put twiddle factors right here, but since this is good-thomas algorithm, we don't need twiddle factors + + // Transpose the data and do size-2 FFTs down the columns + let [output0, output1] = parallel_fft2_contiguous_f32(mid[0], mid[1]); + let output2 = solo_fft2_f32(mid[2]); + + // Reorder into output + [ + extract_lo_hi_f32(output0, output1), + extract_lo_lo_f32(output2, output1), + extract_hi_hi_f32(output0, output2), + ] + } + + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_direct( + &self, + value0: float32x4_t, + value1: float32x4_t, + value2: float32x4_t, + value3: float32x4_t, + value4: float32x4_t, + value5: float32x4_t, + ) -> [float32x4_t; 6] { + // Algorithm: 3x2 good-thomas + + // Size-3 FFTs down the columns of our reordered array + let mid0 = self.bf3.perform_parallel_fft_direct(value0, value2, value4); + let mid1 = self.bf3.perform_parallel_fft_direct(value3, value5, value1); + + // We normally would put twiddle factors right here, but since this is good-thomas algorithm, we don't need twiddle factors + + // Transpose the data and do size-2 FFTs down the columns + let [output0, output1] = parallel_fft2_interleaved_f32(mid0[0], mid1[0]); + let [output2, output3] = parallel_fft2_interleaved_f32(mid0[1], mid1[1]); + let [output4, output5] = parallel_fft2_interleaved_f32(mid0[2], mid1[2]); + + // Reorder into output + [output0, output3, output4, output1, output2, output5] + } +} + +// __ __ _ _ _ _ _ +// / /_ / /_ | || | | |__ (_) |_ +// | '_ \ _____ | '_ \| || |_| '_ \| | __| +// | (_) | |_____| | (_) |__ _| |_) | | |_ +// \___/ \___/ |_| |_.__/|_|\__| +// + +pub struct NeonF64Butterfly6 { + direction: FftDirection, + _phantom: std::marker::PhantomData, + bf3: NeonF64Butterfly3, +} + +boilerplate_fft_neon_f64_butterfly!(NeonF64Butterfly6, 6, |this: &NeonF64Butterfly6<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF64Butterfly6, 6, |this: &NeonF64Butterfly6<_>| this + .direction); +impl NeonF64Butterfly6 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f64::(); + let bf3 = NeonF64Butterfly3::new(direction); + + Self { + direction, + _phantom: std::marker::PhantomData, + bf3, + } + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let value0 = input.load_complex(0); + let value1 = input.load_complex(1); + let value2 = input.load_complex(2); + let value3 = input.load_complex(3); + 
let value4 = input.load_complex(4); + let value5 = input.load_complex(5); + + let out = self.perform_fft_direct(value0, value1, value2, value3, value4, value5); + + output.store_complex(out[0], 0); + output.store_complex(out[1], 1); + output.store_complex(out[2], 2); + output.store_complex(out[3], 3); + output.store_complex(out[4], 4); + output.store_complex(out[5], 5); + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_direct( + &self, + value0: float64x2_t, + value1: float64x2_t, + value2: float64x2_t, + value3: float64x2_t, + value4: float64x2_t, + value5: float64x2_t, + ) -> [float64x2_t; 6] { + // Algorithm: 3x2 good-thomas + + // Size-3 FFTs down the columns of our reordered array + let mid0 = self.bf3.perform_fft_direct(value0, value2, value4); + let mid1 = self.bf3.perform_fft_direct(value3, value5, value1); + + // We normally would put twiddle factors right here, but since this is good-thomas algorithm, we don't need twiddle factors + + // Transpose the data and do size-2 FFTs down the columns + let [output0, output1] = solo_fft2_f64(mid0[0], mid1[0]); + let [output2, output3] = solo_fft2_f64(mid0[1], mid1[1]); + let [output4, output5] = solo_fft2_f64(mid0[2], mid1[2]); + + // Reorder into output + [output0, output3, output4, output1, output2, output5] + } +} + +// ___ _________ _ _ _ +// ( _ ) |___ /___ \| |__ (_) |_ +// / _ \ _____ |_ \ __) | '_ \| | __| +// | (_) | |_____| ___) / __/| |_) | | |_ +// \___/ |____/_____|_.__/|_|\__| +// + +pub struct NeonF32Butterfly8 { + root2: float32x4_t, + root2_dual: float32x4_t, + direction: FftDirection, + bf4: NeonF32Butterfly4, + rotate90: Rotate90F32, +} + +boilerplate_fft_neon_f32_butterfly!(NeonF32Butterfly8, 8, |this: &NeonF32Butterfly8<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF32Butterfly8, 8, |this: &NeonF32Butterfly8<_>| this + .direction); +impl NeonF32Butterfly8 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f32::(); + let bf4 = NeonF32Butterfly4::new(direction); + let root2 = + unsafe { vld1q_f32([1.0, 1.0, 0.5f32.sqrt(), 0.5f32.sqrt(), 1.0, 1.0].as_ptr()) }; + let root2_dual = unsafe { vmovq_n_f32(0.5f32.sqrt()) }; + let rotate90 = if direction == FftDirection::Inverse { + Rotate90F32::new(true) + } else { + Rotate90F32::new(false) + }; + Self { + root2, + root2_dual, + direction, + bf4, + rotate90, + } + } + + #[inline(always)] + unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let input_packed = read_complex_to_array!(input, {0, 2, 4, 6}); + + let out = self.perform_fft_direct(input_packed); + + write_complex_to_array_strided!(out, output, 2, {0,1,2,3}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let input_packed = read_complex_to_array!(input, {0, 2, 4, 6, 8, 10, 12, 14}); + + let values = interleave_complex_f32!(input_packed, 4, {0, 1, 2, 3}); + + let out = self.perform_parallel_fft_direct(values); + + let out_sorted = separate_interleaved_complex_f32!(out, {0, 2, 4, 6}); + + write_complex_to_array_strided!(out_sorted, output, 2, {0,1,2,3,4,5,6,7}); + } + + #[inline(always)] + unsafe fn perform_fft_direct(&self, values: [float32x4_t; 4]) -> [float32x4_t; 4] { + // we're going to hardcode a step of mixed radix + // step 1: copy and reorder the input into the scratch + let [in02, in13] = transpose_complex_2x2_f32(values[0], values[1]); + let [in46, in57] = transpose_complex_2x2_f32(values[2], values[3]); + + // 
step 2: column FFTs + let val0 = self.bf4.perform_fft_direct(in02, in46); + let mut val2 = self.bf4.perform_fft_direct(in13, in57); + + // step 3: apply twiddle factors + let val2b = self.rotate90.rotate_hi(val2[0]); + let val2c = vaddq_f32(val2b, val2[0]); + let val2d = vmulq_f32(val2c, self.root2); + val2[0] = extract_lo_hi_f32(val2[0], val2d); + + let val3b = self.rotate90.rotate_both(val2[1]); + let val3c = vsubq_f32(val3b, val2[1]); + let val3d = vmulq_f32(val3c, self.root2); + val2[1] = extract_lo_hi_f32(val3b, val3d); + + // step 4: transpose -- skipped because we're going to do the next FFTs non-contiguously + + // step 5: row FFTs + let out0 = parallel_fft2_interleaved_f32(val0[0], val2[0]); + let out1 = parallel_fft2_interleaved_f32(val0[1], val2[1]); + + // step 6: rearrange and copy to buffer + [out0[0], out1[0], out0[1], out1[1]] + } + + #[inline(always)] + unsafe fn perform_parallel_fft_direct(&self, values: [float32x4_t; 8]) -> [float32x4_t; 8] { + // we're going to hardcode a step of mixed radix + // step 1: copy and reorder the input into the scratch + // and + // step 2: column FFTs + let val03 = self + .bf4 + .perform_parallel_fft_direct(values[0], values[2], values[4], values[6]); + let mut val47 = self + .bf4 + .perform_parallel_fft_direct(values[1], values[3], values[5], values[7]); + + // step 3: apply twiddle factors + let val5b = self.rotate90.rotate_both(val47[1]); + let val5c = vaddq_f32(val5b, val47[1]); + val47[1] = vmulq_f32(val5c, self.root2_dual); + val47[2] = self.rotate90.rotate_both(val47[2]); + let val7b = self.rotate90.rotate_both(val47[3]); + let val7c = vsubq_f32(val7b, val47[3]); + val47[3] = vmulq_f32(val7c, self.root2_dual); + + // step 4: transpose -- skipped because we're going to do the next FFTs non-contiguously + + // step 5: row FFTs + let out0 = parallel_fft2_interleaved_f32(val03[0], val47[0]); + let out1 = parallel_fft2_interleaved_f32(val03[1], val47[1]); + let out2 = parallel_fft2_interleaved_f32(val03[2], val47[2]); + let out3 = parallel_fft2_interleaved_f32(val03[3], val47[3]); + + // step 6: rearrange and copy to buffer + [ + out0[0], out1[0], out2[0], out3[0], out0[1], out1[1], out2[1], out3[1], + ] + } +} + +// ___ __ _ _ _ _ _ +// ( _ ) / /_ | || | | |__ (_) |_ +// / _ \ _____ | '_ \| || |_| '_ \| | __| +// | (_) | |_____| | (_) |__ _| |_) | | |_ +// \___/ \___/ |_| |_.__/|_|\__| +// + +pub struct NeonF64Butterfly8 { + root2: float64x2_t, + direction: FftDirection, + bf4: NeonF64Butterfly4, + rotate90: Rotate90F64, +} + +boilerplate_fft_neon_f64_butterfly!(NeonF64Butterfly8, 8, |this: &NeonF64Butterfly8<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF64Butterfly8, 8, |this: &NeonF64Butterfly8<_>| this + .direction); +impl NeonF64Butterfly8 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f64::(); + let bf4 = NeonF64Butterfly4::new(direction); + let root2 = unsafe { vmovq_n_f64(0.5f64.sqrt()) }; + let rotate90 = if direction == FftDirection::Inverse { + Rotate90F64::new(true) + } else { + Rotate90F64::new(false) + }; + Self { + root2, + direction, + bf4, + rotate90, + } + } + + #[inline(always)] + unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let values = read_complex_to_array!(input, {0, 1, 2, 3, 4, 5, 6, 7}); + + let out = self.perform_fft_direct(values); + + write_complex_to_array!(out, output, {0, 1, 2, 3, 4, 5, 6, 7}); + } + + #[inline(always)] + unsafe fn perform_fft_direct(&self, values: [float64x2_t; 8]) -> [float64x2_t; 8] { 
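+        // A note on the twiddle step below, assuming the usual exp(-2*pi*i*k*n/N) forward
+        // convention: splitting 8 = 4 x 2 over the even and odd inputs leaves the odd-column
+        // outputs multiplied by w8^k for k = 0..4, i.e.
+        //   w8^0 = 1,  w8^1 = (1 - i)/sqrt(2),  w8^2 = -i,  w8^3 = -(1 + i)/sqrt(2)
+        // (conjugated for the inverse direction). That is why a 90-degree rotation plus a single
+        // multiply by sqrt(0.5) is all the twiddle application needs:
+        //   w8^1 * x = (x + rotate(x)) * sqrt(0.5)
+        //   w8^3 * x = (rotate(x) - x) * sqrt(0.5)
+        // where rotate() corresponds to multiplication by -i (or by +i when the direction is
+        // inverse, matching how rotate90 is constructed in new()).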
+ // we're going to hardcode a step of mixed radix + // step 1: copy and reorder the input into the scratch + // and + // step 2: column FFTs + let val03 = self + .bf4 + .perform_fft_direct(values[0], values[2], values[4], values[6]); + let mut val47 = self + .bf4 + .perform_fft_direct(values[1], values[3], values[5], values[7]); + + // step 3: apply twiddle factors + let val5b = self.rotate90.rotate(val47[1]); + let val5c = vaddq_f64(val5b, val47[1]); + val47[1] = vmulq_f64(val5c, self.root2); + val47[2] = self.rotate90.rotate(val47[2]); + let val7b = self.rotate90.rotate(val47[3]); + let val7c = vsubq_f64(val7b, val47[3]); + val47[3] = vmulq_f64(val7c, self.root2); + + // step 4: transpose -- skipped because we're going to do the next FFTs non-contiguously + + // step 5: row FFTs + let out0 = solo_fft2_f64(val03[0], val47[0]); + let out1 = solo_fft2_f64(val03[1], val47[1]); + let out2 = solo_fft2_f64(val03[2], val47[2]); + let out3 = solo_fft2_f64(val03[3], val47[3]); + + // step 6: rearrange and copy to buffer + [ + out0[0], out1[0], out2[0], out3[0], out0[1], out1[1], out2[1], out3[1], + ] + } +} + +// ___ _________ _ _ _ +// / _ \ |___ /___ \| |__ (_) |_ +// | (_) | _____ |_ \ __) | '_ \| | __| +// \__, | |_____| ___) / __/| |_) | | |_ +// /_/ |____/_____|_.__/|_|\__| +// +pub struct NeonF32Butterfly9 { + direction: FftDirection, + _phantom: std::marker::PhantomData, + bf3: NeonF32Butterfly3, + twiddle1: float32x4_t, + twiddle2: float32x4_t, + twiddle4: float32x4_t, +} + +boilerplate_fft_neon_f32_butterfly!(NeonF32Butterfly9, 9, |this: &NeonF32Butterfly9<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF32Butterfly9, 9, |this: &NeonF32Butterfly9<_>| this + .direction); +impl NeonF32Butterfly9 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f32::(); + let bf3 = NeonF32Butterfly3::new(direction); + let tw1: Complex = twiddles::compute_twiddle(1, 9, direction); + let tw2: Complex = twiddles::compute_twiddle(2, 9, direction); + let tw4: Complex = twiddles::compute_twiddle(4, 9, direction); + let twiddle1 = unsafe { vld1q_f32([tw1.re, tw1.im, tw1.re, tw1.im].as_ptr()) }; + let twiddle2 = unsafe { vld1q_f32([tw2.re, tw2.im, tw2.re, tw2.im].as_ptr()) }; + let twiddle4 = unsafe { vld1q_f32([tw4.re, tw4.im, tw4.re, tw4.im].as_ptr()) }; + + Self { + direction, + _phantom: std::marker::PhantomData, + bf3, + twiddle1, + twiddle2, + twiddle4, + } + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + // A single Neon 9-point will need a lot of shuffling, let's just reuse the dual one + let values = read_partial1_complex_to_array!(input, {0,1,2,3,4,5,6,7,8}); + + let out = self.perform_parallel_fft_direct(values); + + for n in 0..9 { + output.store_partial_lo_complex(out[n], n); + } + } + + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let input_packed = read_complex_to_array!(input, {0, 2, 4, 6, 8, 10, 12, 14, 16}); + + let values = [ + extract_lo_hi_f32(input_packed[0], input_packed[4]), + extract_hi_lo_f32(input_packed[0], input_packed[5]), + extract_lo_hi_f32(input_packed[1], input_packed[5]), + extract_hi_lo_f32(input_packed[1], input_packed[6]), + extract_lo_hi_f32(input_packed[2], input_packed[6]), + extract_hi_lo_f32(input_packed[2], input_packed[7]), + extract_lo_hi_f32(input_packed[3], input_packed[7]), + extract_hi_lo_f32(input_packed[3], input_packed[8]), + 
extract_lo_hi_f32(input_packed[4], input_packed[8]), + ]; + + let out = self.perform_parallel_fft_direct(values); + + let out_packed = [ + extract_lo_lo_f32(out[0], out[1]), + extract_lo_lo_f32(out[2], out[3]), + extract_lo_lo_f32(out[4], out[5]), + extract_lo_lo_f32(out[6], out[7]), + extract_lo_hi_f32(out[8], out[0]), + extract_hi_hi_f32(out[1], out[2]), + extract_hi_hi_f32(out[3], out[4]), + extract_hi_hi_f32(out[5], out[6]), + extract_hi_hi_f32(out[7], out[8]), + ]; + + write_complex_to_array_strided!(out_packed, output, 2, {0,1,2,3,4,5,6,7,8}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_direct( + &self, + values: [float32x4_t; 9], + ) -> [float32x4_t; 9] { + // Algorithm: 3x3 mixed radix + + // Size-3 FFTs down the columns + let mid0 = self + .bf3 + .perform_parallel_fft_direct(values[0], values[3], values[6]); + let mut mid1 = self + .bf3 + .perform_parallel_fft_direct(values[1], values[4], values[7]); + let mut mid2 = self + .bf3 + .perform_parallel_fft_direct(values[2], values[5], values[8]); + + // Apply twiddle factors. Note that we're re-using twiddle2 + mid1[1] = mul_complex_f32(self.twiddle1, mid1[1]); + mid1[2] = mul_complex_f32(self.twiddle2, mid1[2]); + mid2[1] = mul_complex_f32(self.twiddle2, mid2[1]); + mid2[2] = mul_complex_f32(self.twiddle4, mid2[2]); + + let [output0, output1, output2] = self + .bf3 + .perform_parallel_fft_direct(mid0[0], mid1[0], mid2[0]); + let [output3, output4, output5] = self + .bf3 + .perform_parallel_fft_direct(mid0[1], mid1[1], mid2[1]); + let [output6, output7, output8] = self + .bf3 + .perform_parallel_fft_direct(mid0[2], mid1[2], mid2[2]); + + [ + output0, output3, output6, output1, output4, output7, output2, output5, output8, + ] + } +} + +// ___ __ _ _ _ _ _ +// / _ \ / /_ | || | | |__ (_) |_ +// | (_) | _____ | '_ \| || |_| '_ \| | __| +// \__, | |_____| | (_) |__ _| |_) | | |_ +// /_/ \___/ |_| |_.__/|_|\__| +// + +pub struct NeonF64Butterfly9 { + direction: FftDirection, + _phantom: std::marker::PhantomData, + bf3: NeonF64Butterfly3, + twiddle1: float64x2_t, + twiddle2: float64x2_t, + twiddle4: float64x2_t, +} + +boilerplate_fft_neon_f64_butterfly!(NeonF64Butterfly9, 9, |this: &NeonF64Butterfly9<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF64Butterfly9, 9, |this: &NeonF64Butterfly9<_>| this + .direction); +impl NeonF64Butterfly9 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f64::(); + let bf3 = NeonF64Butterfly3::new(direction); + let tw1: Complex = twiddles::compute_twiddle(1, 9, direction); + let tw2: Complex = twiddles::compute_twiddle(2, 9, direction); + let tw4: Complex = twiddles::compute_twiddle(4, 9, direction); + let twiddle1 = unsafe { vld1q_f64([tw1.re, tw1.im].as_ptr()) }; + let twiddle2 = unsafe { vld1q_f64([tw2.re, tw2.im].as_ptr()) }; + let twiddle4 = unsafe { vld1q_f64([tw4.re, tw4.im].as_ptr()) }; + + Self { + direction, + _phantom: std::marker::PhantomData, + bf3, + twiddle1, + twiddle2, + twiddle4, + } + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let values = read_complex_to_array!(input, {0, 1, 2, 3, 4, 5, 6, 7, 8}); + + let out = self.perform_fft_direct(values); + + write_complex_to_array!(out, output, {0, 1, 2, 3, 4, 5, 6, 7, 8}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_direct(&self, values: [float64x2_t; 9]) -> [float64x2_t; 9] { + // Algorithm: 3x3 mixed radix + + // Size-3 FFTs down the columns + let mid0 = 
self.bf3.perform_fft_direct(values[0], values[3], values[6]); + let mut mid1 = self.bf3.perform_fft_direct(values[1], values[4], values[7]); + let mut mid2 = self.bf3.perform_fft_direct(values[2], values[5], values[8]); + + // Apply twiddle factors. Note that we're re-using twiddle2 + mid1[1] = mul_complex_f64(self.twiddle1, mid1[1]); + mid1[2] = mul_complex_f64(self.twiddle2, mid1[2]); + mid2[1] = mul_complex_f64(self.twiddle2, mid2[1]); + mid2[2] = mul_complex_f64(self.twiddle4, mid2[2]); + + let [output0, output1, output2] = self.bf3.perform_fft_direct(mid0[0], mid1[0], mid2[0]); + let [output3, output4, output5] = self.bf3.perform_fft_direct(mid0[1], mid1[1], mid2[1]); + let [output6, output7, output8] = self.bf3.perform_fft_direct(mid0[2], mid1[2], mid2[2]); + + [ + output0, output3, output6, output1, output4, output7, output2, output5, output8, + ] + } +} + +// _ ___ _________ _ _ _ +// / |/ _ \ |___ /___ \| |__ (_) |_ +// | | | | | _____ |_ \ __) | '_ \| | __| +// | | |_| | |_____| ___) / __/| |_) | | |_ +// |_|\___/ |____/_____|_.__/|_|\__| +// + +pub struct NeonF32Butterfly10 { + direction: FftDirection, + _phantom: std::marker::PhantomData, + bf5: NeonF32Butterfly5, +} + +boilerplate_fft_neon_f32_butterfly!(NeonF32Butterfly10, 10, |this: &NeonF32Butterfly10<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF32Butterfly10, 10, |this: &NeonF32Butterfly10<_>| this + .direction); +impl NeonF32Butterfly10 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f32::(); + let bf5 = NeonF32Butterfly5::new(direction); + Self { + direction, + _phantom: std::marker::PhantomData, + bf5, + } + } + + #[inline(always)] + unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let input_packed = read_complex_to_array!(input, {0, 2, 4, 6, 8}); + + let out = self.perform_fft_direct(input_packed); + + write_complex_to_array_strided!(out, output, 2, {0,1,2,3,4}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let input_packed = read_complex_to_array!(input, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18}); + + let values = interleave_complex_f32!(input_packed, 5, {0, 1, 2, 3, 4}); + + let out = self.perform_parallel_fft_direct(values); + + let out_sorted = separate_interleaved_complex_f32!(out, {0, 2, 4, 6, 8}); + + write_complex_to_array_strided!(out_sorted, output, 2, {0,1,2,3,4,5,6,7,8,9}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_direct(&self, values: [float32x4_t; 5]) -> [float32x4_t; 5] { + // Algorithm: 5x2 good-thomas + // Reorder and pack + let reord0 = extract_lo_hi_f32(values[0], values[2]); + let reord1 = extract_lo_hi_f32(values[1], values[3]); + let reord2 = extract_lo_hi_f32(values[2], values[4]); + let reord3 = extract_lo_hi_f32(values[3], values[0]); + let reord4 = extract_lo_hi_f32(values[4], values[1]); + + // Size-5 FFTs down the columns of our reordered array + let mids = self + .bf5 + .perform_parallel_fft_direct(reord0, reord1, reord2, reord3, reord4); + + // Since this is good-thomas algorithm, we don't need twiddle factors + + // Transpose the data and do size-2 FFTs down the columns + let [temp01, temp23] = parallel_fft2_contiguous_f32(mids[0], mids[1]); + let [temp45, temp67] = parallel_fft2_contiguous_f32(mids[2], mids[3]); + let temp89 = solo_fft2_f32(mids[4]); + + // Reorder + let out01 = extract_lo_hi_f32(temp01, temp23); + let out23 = extract_lo_hi_f32(temp45, temp67); + let out45 = 
extract_lo_lo_f32(temp89, temp23); + let out67 = extract_hi_lo_f32(temp01, temp67); + let out89 = extract_hi_hi_f32(temp45, temp89); + + [out01, out23, out45, out67, out89] + } + + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_direct( + &self, + values: [float32x4_t; 10], + ) -> [float32x4_t; 10] { + // Algorithm: 5x2 good-thomas + + // Size-5 FFTs down the columns of our reordered array + let mid0 = self + .bf5 + .perform_parallel_fft_direct(values[0], values[2], values[4], values[6], values[8]); + let mid1 = self + .bf5 + .perform_parallel_fft_direct(values[5], values[7], values[9], values[1], values[3]); + + // Since this is good-thomas algorithm, we don't need twiddle factors + + // Transpose the data and do size-2 FFTs down the columns + let [output0, output1] = parallel_fft2_interleaved_f32(mid0[0], mid1[0]); + let [output2, output3] = parallel_fft2_interleaved_f32(mid0[1], mid1[1]); + let [output4, output5] = parallel_fft2_interleaved_f32(mid0[2], mid1[2]); + let [output6, output7] = parallel_fft2_interleaved_f32(mid0[3], mid1[3]); + let [output8, output9] = parallel_fft2_interleaved_f32(mid0[4], mid1[4]); + + // Reorder and return + [ + output0, output3, output4, output7, output8, output1, output2, output5, output6, + output9, + ] + } +} + +// _ ___ __ _ _ _ _ _ +// / |/ _ \ / /_ | || | | |__ (_) |_ +// | | | | | _____ | '_ \| || |_| '_ \| | __| +// | | |_| | |_____| | (_) |__ _| |_) | | |_ +// |_|\___/ \___/ |_| |_.__/|_|\__| +// + +pub struct NeonF64Butterfly10 { + direction: FftDirection, + _phantom: std::marker::PhantomData, + bf2: NeonF64Butterfly2, + bf5: NeonF64Butterfly5, +} + +boilerplate_fft_neon_f64_butterfly!(NeonF64Butterfly10, 10, |this: &NeonF64Butterfly10<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF64Butterfly10, 10, |this: &NeonF64Butterfly10<_>| this + .direction); +impl NeonF64Butterfly10 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f64::(); + let bf2 = NeonF64Butterfly2::new(direction); + let bf5 = NeonF64Butterfly5::new(direction); + Self { + direction, + _phantom: std::marker::PhantomData, + bf2, + bf5, + } + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let values = read_complex_to_array!(input, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); + + let out = self.perform_fft_direct(values); + + write_complex_to_array!(out, output, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_direct(&self, values: [float64x2_t; 10]) -> [float64x2_t; 10] { + // Algorithm: 5x2 good-thomas + + // Size-5 FFTs down the columns of our reordered array + let mid0 = self + .bf5 + .perform_fft_direct(values[0], values[2], values[4], values[6], values[8]); + let mid1 = self + .bf5 + .perform_fft_direct(values[5], values[7], values[9], values[1], values[3]); + + // Since this is good-thomas algorithm, we don't need twiddle factors + + // Transpose the data and do size-2 FFTs down the columns + let [output0, output1] = self.bf2.perform_fft_direct(mid0[0], mid1[0]); + let [output2, output3] = self.bf2.perform_fft_direct(mid0[1], mid1[1]); + let [output4, output5] = self.bf2.perform_fft_direct(mid0[2], mid1[2]); + let [output6, output7] = self.bf2.perform_fft_direct(mid0[3], mid1[3]); + let [output8, output9] = self.bf2.perform_fft_direct(mid0[4], mid1[4]); + + // Reorder and return + [ + output0, output3, output4, output7, output8, output1, output2, output5, output6, + output9, + ] + } +} + +// _ ____ 
_________ _ _ _ +// / |___ \ |___ /___ \| |__ (_) |_ +// | | __) | _____ |_ \ __) | '_ \| | __| +// | |/ __/ |_____| ___) / __/| |_) | | |_ +// |_|_____| |____/_____|_.__/|_|\__| +// + +pub struct NeonF32Butterfly12 { + direction: FftDirection, + _phantom: std::marker::PhantomData, + bf3: NeonF32Butterfly3, + bf4: NeonF32Butterfly4, +} + +boilerplate_fft_neon_f32_butterfly!(NeonF32Butterfly12, 12, |this: &NeonF32Butterfly12<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF32Butterfly12, 12, |this: &NeonF32Butterfly12<_>| this + .direction); +impl NeonF32Butterfly12 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f32::(); + let bf3 = NeonF32Butterfly3::new(direction); + let bf4 = NeonF32Butterfly4::new(direction); + Self { + direction, + _phantom: std::marker::PhantomData, + bf3, + bf4, + } + } + + #[inline(always)] + unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let input_packed = read_complex_to_array!(input, {0, 2, 4, 6, 8, 10 }); + + let out = self.perform_fft_direct(input_packed); + + write_complex_to_array_strided!(out, output, 2, {0,1,2,3,4,5}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let input_packed = + read_complex_to_array!(input, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22}); + + let values = interleave_complex_f32!(input_packed, 6, {0, 1, 2, 3, 4, 5}); + + let out = self.perform_parallel_fft_direct(values); + + let out_sorted = separate_interleaved_complex_f32!(out, {0, 2, 4, 6, 8, 10}); + + write_complex_to_array_strided!(out_sorted, output, 2, {0,1,2,3,4,5,6,7,8,9, 10, 11}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_direct(&self, values: [float32x4_t; 6]) -> [float32x4_t; 6] { + // Algorithm: 4x3 good-thomas + + // Reorder and pack + let packed03 = extract_lo_hi_f32(values[0], values[1]); + let packed47 = extract_lo_hi_f32(values[2], values[3]); + let packed69 = extract_lo_hi_f32(values[3], values[4]); + let packed101 = extract_lo_hi_f32(values[5], values[0]); + let packed811 = extract_lo_hi_f32(values[4], values[5]); + let packed25 = extract_lo_hi_f32(values[1], values[2]); + + // Size-4 FFTs down the columns of our reordered array + let mid0 = self.bf4.perform_fft_direct(packed03, packed69); + let mid1 = self.bf4.perform_fft_direct(packed47, packed101); + let mid2 = self.bf4.perform_fft_direct(packed811, packed25); + + // Since this is good-thomas algorithm, we don't need twiddle factors + + // Transpose the data and do size-3 FFTs down the columns + let [temp03, temp14, temp25] = self + .bf3 + .perform_parallel_fft_direct(mid0[0], mid1[0], mid2[0]); + let [temp69, temp710, temp811] = self + .bf3 + .perform_parallel_fft_direct(mid0[1], mid1[1], mid2[1]); + + // Reorder and return + [ + extract_lo_hi_f32(temp03, temp14), + extract_lo_hi_f32(temp811, temp69), + extract_lo_hi_f32(temp14, temp25), + extract_lo_hi_f32(temp69, temp710), + extract_lo_hi_f32(temp25, temp03), + extract_lo_hi_f32(temp710, temp811), + ] + } + + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_direct( + &self, + values: [float32x4_t; 12], + ) -> [float32x4_t; 12] { + // Algorithm: 4x3 good-thomas + + // Size-4 FFTs down the columns of our reordered array + let mid0 = self + .bf4 + .perform_parallel_fft_direct(values[0], values[3], values[6], values[9]); + let mid1 = self + .bf4 + .perform_parallel_fft_direct(values[4], values[7], values[10], values[1]); + let mid2 = self + 
.bf4 + .perform_parallel_fft_direct(values[8], values[11], values[2], values[5]); + + // Since this is good-thomas algorithm, we don't need twiddle factors + + // Transpose the data and do size-3 FFTs down the columns + let [output0, output1, output2] = self + .bf3 + .perform_parallel_fft_direct(mid0[0], mid1[0], mid2[0]); + let [output3, output4, output5] = self + .bf3 + .perform_parallel_fft_direct(mid0[1], mid1[1], mid2[1]); + let [output6, output7, output8] = self + .bf3 + .perform_parallel_fft_direct(mid0[2], mid1[2], mid2[2]); + let [output9, output10, output11] = self + .bf3 + .perform_parallel_fft_direct(mid0[3], mid1[3], mid2[3]); + + // Reorder and return + [ + output0, output4, output8, output9, output1, output5, output6, output10, output2, + output3, output7, output11, + ] + } +} + +// _ ____ __ _ _ _ _ _ +// / |___ \ / /_ | || | | |__ (_) |_ +// | | __) | _____ | '_ \| || |_| '_ \| | __| +// | |/ __/ |_____| | (_) |__ _| |_) | | |_ +// |_|_____| \___/ |_| |_.__/|_|\__| +// + +pub struct NeonF64Butterfly12 { + direction: FftDirection, + _phantom: std::marker::PhantomData, + bf3: NeonF64Butterfly3, + bf4: NeonF64Butterfly4, +} + +boilerplate_fft_neon_f64_butterfly!(NeonF64Butterfly12, 12, |this: &NeonF64Butterfly12<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF64Butterfly12, 12, |this: &NeonF64Butterfly12<_>| this + .direction); +impl NeonF64Butterfly12 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f64::(); + let bf3 = NeonF64Butterfly3::new(direction); + let bf4 = NeonF64Butterfly4::new(direction); + Self { + direction, + _phantom: std::marker::PhantomData, + bf3, + bf4, + } + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let values = read_complex_to_array!(input, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); + + let out = self.perform_fft_direct(values); + + write_complex_to_array!(out, output, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_direct(&self, values: [float64x2_t; 12]) -> [float64x2_t; 12] { + // Algorithm: 4x3 good-thomas + + // Size-4 FFTs down the columns of our reordered array + let mid0 = self + .bf4 + .perform_fft_direct(values[0], values[3], values[6], values[9]); + let mid1 = self + .bf4 + .perform_fft_direct(values[4], values[7], values[10], values[1]); + let mid2 = self + .bf4 + .perform_fft_direct(values[8], values[11], values[2], values[5]); + + // Since this is good-thomas algorithm, we don't need twiddle factors + + // Transpose the data and do size-3 FFTs down the columns + let [output0, output1, output2] = self.bf3.perform_fft_direct(mid0[0], mid1[0], mid2[0]); + let [output3, output4, output5] = self.bf3.perform_fft_direct(mid0[1], mid1[1], mid2[1]); + let [output6, output7, output8] = self.bf3.perform_fft_direct(mid0[2], mid1[2], mid2[2]); + let [output9, output10, output11] = self.bf3.perform_fft_direct(mid0[3], mid1[3], mid2[3]); + + [ + output0, output4, output8, output9, output1, output5, output6, output10, output2, + output3, output7, output11, + ] + } +} + +// _ ____ _________ _ _ _ +// / | ___| |___ /___ \| |__ (_) |_ +// | |___ \ _____ |_ \ __) | '_ \| | __| +// | |___) | |_____| ___) / __/| |_) | | |_ +// |_|____/ |____/_____|_.__/|_|\__| +// +pub struct NeonF32Butterfly15 { + direction: FftDirection, + _phantom: std::marker::PhantomData, + bf3: NeonF32Butterfly3, + bf5: NeonF32Butterfly5, +} + +boilerplate_fft_neon_f32_butterfly!(NeonF32Butterfly15, 
15, |this: &NeonF32Butterfly15<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF32Butterfly15, 15, |this: &NeonF32Butterfly15<_>| this + .direction); +impl NeonF32Butterfly15 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f32::(); + let bf3 = NeonF32Butterfly3::new(direction); + let bf5 = NeonF32Butterfly5::new(direction); + Self { + direction, + _phantom: std::marker::PhantomData, + bf3, + bf5, + } + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + // A single Neon 15-point will need a lot of shuffling, let's just reuse the dual one + let values = read_partial1_complex_to_array!(input, {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14}); + + let out = self.perform_parallel_fft_direct(values); + + for n in 0..15 { + output.store_partial_lo_complex(out[n], n); + } + } + + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let input_packed = + read_complex_to_array!(input, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28}); + + let values = [ + extract_lo_hi_f32(input_packed[0], input_packed[7]), + extract_hi_lo_f32(input_packed[0], input_packed[8]), + extract_lo_hi_f32(input_packed[1], input_packed[8]), + extract_hi_lo_f32(input_packed[1], input_packed[9]), + extract_lo_hi_f32(input_packed[2], input_packed[9]), + extract_hi_lo_f32(input_packed[2], input_packed[10]), + extract_lo_hi_f32(input_packed[3], input_packed[10]), + extract_hi_lo_f32(input_packed[3], input_packed[11]), + extract_lo_hi_f32(input_packed[4], input_packed[11]), + extract_hi_lo_f32(input_packed[4], input_packed[12]), + extract_lo_hi_f32(input_packed[5], input_packed[12]), + extract_hi_lo_f32(input_packed[5], input_packed[13]), + extract_lo_hi_f32(input_packed[6], input_packed[13]), + extract_hi_lo_f32(input_packed[6], input_packed[14]), + extract_lo_hi_f32(input_packed[7], input_packed[14]), + ]; + + let out = self.perform_parallel_fft_direct(values); + + let out_packed = [ + extract_lo_lo_f32(out[0], out[1]), + extract_lo_lo_f32(out[2], out[3]), + extract_lo_lo_f32(out[4], out[5]), + extract_lo_lo_f32(out[6], out[7]), + extract_lo_lo_f32(out[8], out[9]), + extract_lo_lo_f32(out[10], out[11]), + extract_lo_lo_f32(out[12], out[13]), + extract_lo_hi_f32(out[14], out[0]), + extract_hi_hi_f32(out[1], out[2]), + extract_hi_hi_f32(out[3], out[4]), + extract_hi_hi_f32(out[5], out[6]), + extract_hi_hi_f32(out[7], out[8]), + extract_hi_hi_f32(out[9], out[10]), + extract_hi_hi_f32(out[11], out[12]), + extract_hi_hi_f32(out[13], out[14]), + ]; + + write_complex_to_array_strided!(out_packed, output, 2, {0,1,2,3,4,5,6,7,8,9, 10, 11, 12, 13, 14}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_direct( + &self, + values: [float32x4_t; 15], + ) -> [float32x4_t; 15] { + // Algorithm: 5x3 good-thomas + + // Size-5 FFTs down the columns of our reordered array + let mid0 = self + .bf5 + .perform_parallel_fft_direct(values[0], values[3], values[6], values[9], values[12]); + let mid1 = self + .bf5 + .perform_parallel_fft_direct(values[5], values[8], values[11], values[14], values[2]); + let mid2 = self + .bf5 + .perform_parallel_fft_direct(values[10], values[13], values[1], values[4], values[7]); + + // Since this is good-thomas algorithm, we don't need twiddle factors + + // Transpose the data and do size-3 FFTs down the columns + let [output0, output1, output2] = self + .bf3 + 
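Each `float32x4_t` in the parallel path above packs two interleaved `Complex<f32>` values, one from each of the two transforms computed at once. The `extract_*` helpers are defined elsewhere in this module; judging from how they are used here, "lo" and "hi" refer to the first and second complex value in a vector, so a hedged sketch of their meaning could look like this (illustrative only, not the crate's actual implementation):

```rust
use core::arch::aarch64::*;

// Sketch: take the first Complex<f32> (lanes 0-1) of `a` and the second
// (lanes 2-3) of `b`, and vice versa.
#[inline(always)]
unsafe fn extract_lo_hi(a: float32x4_t, b: float32x4_t) -> float32x4_t {
    vcombine_f32(vget_low_f32(a), vget_high_f32(b))
}

#[inline(always)]
unsafe fn extract_hi_lo(a: float32x4_t, b: float32x4_t) -> float32x4_t {
    vcombine_f32(vget_high_f32(a), vget_low_f32(b))
}
```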
.perform_parallel_fft_direct(mid0[0], mid1[0], mid2[0]); + let [output3, output4, output5] = self + .bf3 + .perform_parallel_fft_direct(mid0[1], mid1[1], mid2[1]); + let [output6, output7, output8] = self + .bf3 + .perform_parallel_fft_direct(mid0[2], mid1[2], mid2[2]); + let [output9, output10, output11] = self + .bf3 + .perform_parallel_fft_direct(mid0[3], mid1[3], mid2[3]); + let [output12, output13, output14] = self + .bf3 + .perform_parallel_fft_direct(mid0[4], mid1[4], mid2[4]); + + [ + output0, output4, output8, output9, output13, output2, output3, output7, output11, + output12, output1, output5, output6, output10, output14, + ] + } +} + +// _ ____ __ _ _ _ _ _ +// / | ___| / /_ | || | | |__ (_) |_ +// | |___ \ _____ | '_ \| || |_| '_ \| | __| +// | |___) | |_____| | (_) |__ _| |_) | | |_ +// |_|____/ \___/ |_| |_.__/|_|\__| +// + +pub struct NeonF64Butterfly15 { + direction: FftDirection, + _phantom: std::marker::PhantomData, + bf3: NeonF64Butterfly3, + bf5: NeonF64Butterfly5, +} + +boilerplate_fft_neon_f64_butterfly!(NeonF64Butterfly15, 15, |this: &NeonF64Butterfly15<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF64Butterfly15, 15, |this: &NeonF64Butterfly15<_>| this + .direction); +impl NeonF64Butterfly15 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f64::(); + let bf3 = NeonF64Butterfly3::new(direction); + let bf5 = NeonF64Butterfly5::new(direction); + Self { + direction, + _phantom: std::marker::PhantomData, + bf3, + bf5, + } + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let values = + read_complex_to_array!(input, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); + + let out = self.perform_fft_direct(values); + + write_complex_to_array!(out, output, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_direct(&self, values: [float64x2_t; 15]) -> [float64x2_t; 15] { + // Algorithm: 5x3 good-thomas + + // Size-5 FFTs down the columns of our reordered array + let mid0 = self + .bf5 + .perform_fft_direct(values[0], values[3], values[6], values[9], values[12]); + let mid1 = self + .bf5 + .perform_fft_direct(values[5], values[8], values[11], values[14], values[2]); + let mid2 = self + .bf5 + .perform_fft_direct(values[10], values[13], values[1], values[4], values[7]); + + // Since this is good-thomas algorithm, we don't need twiddle factors + + // Transpose the data and do size-3 FFTs down the columns + let [output0, output1, output2] = self.bf3.perform_fft_direct(mid0[0], mid1[0], mid2[0]); + let [output3, output4, output5] = self.bf3.perform_fft_direct(mid0[1], mid1[1], mid2[1]); + let [output6, output7, output8] = self.bf3.perform_fft_direct(mid0[2], mid1[2], mid2[2]); + let [output9, output10, output11] = self.bf3.perform_fft_direct(mid0[3], mid1[3], mid2[3]); + let [output12, output13, output14] = self.bf3.perform_fft_direct(mid0[4], mid1[4], mid2[4]); + + [ + output0, output4, output8, output9, output13, output2, output3, output7, output11, + output12, output1, output5, output6, output10, output14, + ] + } +} + +// _ __ _________ _ _ _ +// / |/ /_ |___ /___ \| |__ (_) |_ +// | | '_ \ _____ |_ \ __) | '_ \| | __| +// | | (_) | |_____| ___) / __/| |_) | | |_ +// |_|\___/ |____/_____|_.__/|_|\__| +// + +pub struct NeonF32Butterfly16 { + direction: FftDirection, + bf4: NeonF32Butterfly4, + bf8: NeonF32Butterfly8, + rotate90: Rotate90F32, + twiddle01: float32x4_t, + twiddle23: 
float32x4_t, + twiddle01conj: float32x4_t, + twiddle23conj: float32x4_t, + twiddle1: float32x4_t, + twiddle2: float32x4_t, + twiddle3: float32x4_t, + twiddle1c: float32x4_t, + twiddle2c: float32x4_t, + twiddle3c: float32x4_t, +} + +boilerplate_fft_neon_f32_butterfly!(NeonF32Butterfly16, 16, |this: &NeonF32Butterfly16<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF32Butterfly16, 16, |this: &NeonF32Butterfly16<_>| this + .direction); +impl NeonF32Butterfly16 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f32::(); + let bf8 = NeonF32Butterfly8::new(direction); + let bf4 = NeonF32Butterfly4::new(direction); + let rotate90 = if direction == FftDirection::Inverse { + Rotate90F32::new(true) + } else { + Rotate90F32::new(false) + }; + let tw1: Complex = twiddles::compute_twiddle(1, 16, direction); + let tw2: Complex = twiddles::compute_twiddle(2, 16, direction); + let tw3: Complex = twiddles::compute_twiddle(3, 16, direction); + let twiddle01 = unsafe { vld1q_f32([1.0, 0.0, tw1.re, tw1.im].as_ptr()) }; + let twiddle23 = unsafe { vld1q_f32([tw2.re, tw2.im, tw3.re, tw3.im].as_ptr()) }; + let twiddle01conj = unsafe { vld1q_f32([1.0, 0.0, tw1.re, -tw1.im].as_ptr()) }; + let twiddle23conj = unsafe { vld1q_f32([tw2.re, -tw2.im, tw3.re, -tw3.im].as_ptr()) }; + let twiddle1 = unsafe { vld1q_f32([tw1.re, tw1.im, tw1.re, tw1.im].as_ptr()) }; + let twiddle2 = unsafe { vld1q_f32([tw2.re, tw2.im, tw2.re, tw2.im].as_ptr()) }; + let twiddle3 = unsafe { vld1q_f32([tw3.re, tw3.im, tw3.re, tw3.im].as_ptr()) }; + let twiddle1c = unsafe { vld1q_f32([tw1.re, -tw1.im, tw1.re, -tw1.im].as_ptr()) }; + let twiddle2c = unsafe { vld1q_f32([tw2.re, -tw2.im, tw2.re, -tw2.im].as_ptr()) }; + let twiddle3c = unsafe { vld1q_f32([tw3.re, -tw3.im, tw3.re, -tw3.im].as_ptr()) }; + Self { + direction, + bf4, + bf8, + rotate90, + twiddle01, + twiddle23, + twiddle01conj, + twiddle23conj, + twiddle1, + twiddle2, + twiddle3, + twiddle1c, + twiddle2c, + twiddle3c, + } + } + + #[inline(always)] + unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let input_packed = read_complex_to_array!(input, {0, 2, 4, 6, 8, 10, 12, 14 }); + + let out = self.perform_fft_direct(input_packed); + + write_complex_to_array_strided!(out, output, 2, {0,1,2,3,4,5,6,7}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let input_packed = read_complex_to_array!(input, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}); + + let values = interleave_complex_f32!(input_packed, 8, {0, 1, 2, 3 ,4 ,5 ,6 ,7}); + + let out = self.perform_parallel_fft_direct(values); + + let out_sorted = separate_interleaved_complex_f32!(out, {0, 2, 4, 6, 8, 10, 12, 14}); + + write_complex_to_array_strided!(out_sorted, output, 2, {0,1,2,3,4,5,6,7,8,9, 10, 11,12,13,14, 15}); + } + + #[inline(always)] + unsafe fn perform_fft_direct(&self, input: [float32x4_t; 8]) -> [float32x4_t; 8] { + // we're going to hardcode a step of split radix + // step 1: copy and reorder the input into the scratch + let in0002 = extract_lo_lo_f32(input[0], input[1]); + let in0406 = extract_lo_lo_f32(input[2], input[3]); + let in0810 = extract_lo_lo_f32(input[4], input[5]); + let in1214 = extract_lo_lo_f32(input[6], input[7]); + + let in0105 = extract_hi_hi_f32(input[0], input[2]); + let in0913 = extract_hi_hi_f32(input[4], input[6]); + let in1503 = extract_hi_hi_f32(input[7], input[1]); + let in0711 = 
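The constructor above precomputes each twiddle in two layouts: `twiddle01`/`twiddle23` pack two different twiddles into one vector for the single-FFT path, while `twiddle1`..`twiddle3` (and their conjugates) repeat one twiddle in both complex lanes for the two-FFTs-at-once path. A hedged sketch of the values being loaded, assuming RustFFT's usual convention that `compute_twiddle(k, n, Forward)` is e^(-2πi·k/n); `twiddle` below is an illustrative stand-in:

```rust
use num_complex::Complex;
use std::f32::consts::PI;

fn twiddle(k: usize, n: usize) -> Complex<f32> {
    let angle = -2.0 * PI * (k as f32) / (n as f32);
    Complex::new(angle.cos(), angle.sin())
}

fn main() {
    let tw1 = twiddle(1, 16);
    // What the `twiddle01` load reads: lanes 0-1 hold w^0 = 1 + 0i,
    // lanes 2-3 hold w^1.
    let packed_twiddle01 = [1.0f32, 0.0, tw1.re, tw1.im];
    // The `twiddle1` variant repeats w^1 in both complex lanes instead.
    let splat_twiddle1 = [tw1.re, tw1.im, tw1.re, tw1.im];
    println!("{:?} {:?}", packed_twiddle01, splat_twiddle1);
}
```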
extract_hi_hi_f32(input[3], input[5]); + + let in_evens = [in0002, in0406, in0810, in1214]; + + // step 2: column FFTs + let evens = self.bf8.perform_fft_direct(in_evens); + let mut odds1 = self.bf4.perform_fft_direct(in0105, in0913); + let mut odds3 = self.bf4.perform_fft_direct(in1503, in0711); + + // step 3: apply twiddle factors + odds1[0] = mul_complex_f32(odds1[0], self.twiddle01); + odds3[0] = mul_complex_f32(odds3[0], self.twiddle01conj); + + odds1[1] = mul_complex_f32(odds1[1], self.twiddle23); + odds3[1] = mul_complex_f32(odds3[1], self.twiddle23conj); + + // step 4: cross FFTs + let mut temp0 = parallel_fft2_interleaved_f32(odds1[0], odds3[0]); + let mut temp1 = parallel_fft2_interleaved_f32(odds1[1], odds3[1]); + + // apply the butterfly 4 twiddle factor, which is just a rotation + temp0[1] = self.rotate90.rotate_both(temp0[1]); + temp1[1] = self.rotate90.rotate_both(temp1[1]); + + //step 5: copy/add/subtract data back to buffer + [ + vaddq_f32(evens[0], temp0[0]), + vaddq_f32(evens[1], temp1[0]), + vaddq_f32(evens[2], temp0[1]), + vaddq_f32(evens[3], temp1[1]), + vsubq_f32(evens[0], temp0[0]), + vsubq_f32(evens[1], temp1[0]), + vsubq_f32(evens[2], temp0[1]), + vsubq_f32(evens[3], temp1[1]), + ] + } + + #[inline(always)] + unsafe fn perform_parallel_fft_direct(&self, input: [float32x4_t; 16]) -> [float32x4_t; 16] { + // we're going to hardcode a step of split radix + // step 1: copy and reorder the input into the scratch + // and + // step 2: column FFTs + let evens = self.bf8.perform_parallel_fft_direct([ + input[0], input[2], input[4], input[6], input[8], input[10], input[12], input[14], + ]); + let mut odds1 = self + .bf4 + .perform_parallel_fft_direct(input[1], input[5], input[9], input[13]); + let mut odds3 = self + .bf4 + .perform_parallel_fft_direct(input[15], input[3], input[7], input[11]); + + // step 3: apply twiddle factors + odds1[1] = mul_complex_f32(odds1[1], self.twiddle1); + odds3[1] = mul_complex_f32(odds3[1], self.twiddle1c); + + odds1[2] = mul_complex_f32(odds1[2], self.twiddle2); + odds3[2] = mul_complex_f32(odds3[2], self.twiddle2c); + + odds1[3] = mul_complex_f32(odds1[3], self.twiddle3); + odds3[3] = mul_complex_f32(odds3[3], self.twiddle3c); + + // step 4: cross FFTs + let mut temp0 = parallel_fft2_interleaved_f32(odds1[0], odds3[0]); + let mut temp1 = parallel_fft2_interleaved_f32(odds1[1], odds3[1]); + let mut temp2 = parallel_fft2_interleaved_f32(odds1[2], odds3[2]); + let mut temp3 = parallel_fft2_interleaved_f32(odds1[3], odds3[3]); + + // apply the butterfly 4 twiddle factor, which is just a rotation + temp0[1] = self.rotate90.rotate_both(temp0[1]); + temp1[1] = self.rotate90.rotate_both(temp1[1]); + temp2[1] = self.rotate90.rotate_both(temp2[1]); + temp3[1] = self.rotate90.rotate_both(temp3[1]); + + //step 5: copy/add/subtract data back to buffer + [ + vaddq_f32(evens[0], temp0[0]), + vaddq_f32(evens[1], temp1[0]), + vaddq_f32(evens[2], temp2[0]), + vaddq_f32(evens[3], temp3[0]), + vaddq_f32(evens[4], temp0[1]), + vaddq_f32(evens[5], temp1[1]), + vaddq_f32(evens[6], temp2[1]), + vaddq_f32(evens[7], temp3[1]), + vsubq_f32(evens[0], temp0[0]), + vsubq_f32(evens[1], temp1[0]), + vsubq_f32(evens[2], temp2[0]), + vsubq_f32(evens[3], temp3[0]), + vsubq_f32(evens[4], temp0[1]), + vsubq_f32(evens[5], temp1[1]), + vsubq_f32(evens[6], temp2[1]), + vsubq_f32(evens[7], temp3[1]), + ] + } +} + +// _ __ __ _ _ _ _ _ +// / |/ /_ / /_ | || | | |__ (_) |_ +// | | '_ \ _____ | '_ \| || |_| '_ \| | __| +// | | (_) | |_____| | (_) |__ _| |_) | | |_ +// |_|\___/ \___/ 
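Both size-16 butterflies (the f32 one above and the f64 one below) hard-code one conjugate-pair split-radix step: a half-size FFT of the even samples plus quarter-size FFTs of x[4m+1] and x[4m-1]. A scalar sketch of the recombination performed in steps 3 through 5, for the forward direction (the inverse direction conjugates the twiddles and the rotation); all names here are illustrative, not crate APIs:

```rust
use num_complex::Complex;

// e:  FFT of the even samples, length n/2
// z1: FFT of x[4m + 1], length n/4
// z3: FFT of x[4m - 1 mod n], length n/4
// w:  e^(-2*pi*i / n)
fn split_radix_recombine(
    e: &[Complex<f64>],
    z1: &[Complex<f64>],
    z3: &[Complex<f64>],
    w: Complex<f64>,
) -> Vec<Complex<f64>> {
    let q = z1.len(); // n / 4
    let n = 4 * q;
    let mut x = vec![Complex::new(0.0, 0.0); n];
    let mut wk = Complex::new(1.0, 0.0); // w^0
    for k in 0..q {
        // "step 3": apply w^k to z1 and its conjugate w^-k to z3
        let a = wk * z1[k];
        let b = wk.conj() * z3[k];
        // "step 4": length-2 FFT across the two odd branches, then rotate the
        // difference by -90 degrees (multiply by -i)
        let sum = a + b;
        let diff = Complex::new(0.0, -1.0) * (a - b);
        // "step 5": combine with the even-sample FFT
        x[k] = e[k] + sum;
        x[k + q] = e[k + q] + diff;
        x[k + 2 * q] = e[k] - sum;
        x[k + 3 * q] = e[k + q] - diff;
        wk = wk * w; // advance to w^(k+1)
    }
    x
}
```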
|_| |_.__/|_|\__| +// + +pub struct NeonF64Butterfly16 { + direction: FftDirection, + bf4: NeonF64Butterfly4, + bf8: NeonF64Butterfly8, + rotate90: Rotate90F64, + twiddle1: float64x2_t, + twiddle2: float64x2_t, + twiddle3: float64x2_t, + twiddle1c: float64x2_t, + twiddle2c: float64x2_t, + twiddle3c: float64x2_t, +} + +boilerplate_fft_neon_f64_butterfly!(NeonF64Butterfly16, 16, |this: &NeonF64Butterfly16<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF64Butterfly16, 16, |this: &NeonF64Butterfly16<_>| this + .direction); +impl NeonF64Butterfly16 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f64::(); + let bf8 = NeonF64Butterfly8::new(direction); + let bf4 = NeonF64Butterfly4::new(direction); + let rotate90 = if direction == FftDirection::Inverse { + Rotate90F64::new(true) + } else { + Rotate90F64::new(false) + }; + let twiddle1 = unsafe { + vld1q_f64(&twiddles::compute_twiddle::(1, 16, direction) as *const _ as *const f64) + }; + let twiddle2 = unsafe { + vld1q_f64(&twiddles::compute_twiddle::(2, 16, direction) as *const _ as *const f64) + }; + let twiddle3 = unsafe { + vld1q_f64(&twiddles::compute_twiddle::(3, 16, direction) as *const _ as *const f64) + }; + let twiddle1c = unsafe { + vld1q_f64( + &twiddles::compute_twiddle::(1, 16, direction).conj() as *const _ + as *const f64, + ) + }; + let twiddle2c = unsafe { + vld1q_f64( + &twiddles::compute_twiddle::(2, 16, direction).conj() as *const _ + as *const f64, + ) + }; + let twiddle3c = unsafe { + vld1q_f64( + &twiddles::compute_twiddle::(3, 16, direction).conj() as *const _ + as *const f64, + ) + }; + + Self { + direction, + bf4, + bf8, + rotate90, + twiddle1, + twiddle2, + twiddle3, + twiddle1c, + twiddle2c, + twiddle3c, + } + } + + #[inline(always)] + unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let values = + read_complex_to_array!(input, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); + + let out = self.perform_fft_direct(values); + + write_complex_to_array!(out, output, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); + } + + #[inline(always)] + unsafe fn perform_fft_direct(&self, input: [float64x2_t; 16]) -> [float64x2_t; 16] { + // we're going to hardcode a step of split radix + + // step 1: copy and reorder the input into the scratch + // and + // step 2: column FFTs + let evens = self.bf8.perform_fft_direct([ + input[0], input[2], input[4], input[6], input[8], input[10], input[12], input[14], + ]); + let mut odds1 = self + .bf4 + .perform_fft_direct(input[1], input[5], input[9], input[13]); + let mut odds3 = self + .bf4 + .perform_fft_direct(input[15], input[3], input[7], input[11]); + + // step 3: apply twiddle factors + odds1[1] = mul_complex_f64(odds1[1], self.twiddle1); + odds3[1] = mul_complex_f64(odds3[1], self.twiddle1c); + + odds1[2] = mul_complex_f64(odds1[2], self.twiddle2); + odds3[2] = mul_complex_f64(odds3[2], self.twiddle2c); + + odds1[3] = mul_complex_f64(odds1[3], self.twiddle3); + odds3[3] = mul_complex_f64(odds3[3], self.twiddle3c); + + // step 4: cross FFTs + let mut temp0 = solo_fft2_f64(odds1[0], odds3[0]); + let mut temp1 = solo_fft2_f64(odds1[1], odds3[1]); + let mut temp2 = solo_fft2_f64(odds1[2], odds3[2]); + let mut temp3 = solo_fft2_f64(odds1[3], odds3[3]); + + // apply the butterfly 4 twiddle factor, which is just a rotation + temp0[1] = self.rotate90.rotate(temp0[1]); + temp1[1] = self.rotate90.rotate(temp1[1]); + temp2[1] = self.rotate90.rotate(temp2[1]); + temp3[1] = 
self.rotate90.rotate(temp3[1]); + + //step 5: copy/add/subtract data back to buffer + [ + vaddq_f64(evens[0], temp0[0]), + vaddq_f64(evens[1], temp1[0]), + vaddq_f64(evens[2], temp2[0]), + vaddq_f64(evens[3], temp3[0]), + vaddq_f64(evens[4], temp0[1]), + vaddq_f64(evens[5], temp1[1]), + vaddq_f64(evens[6], temp2[1]), + vaddq_f64(evens[7], temp3[1]), + vsubq_f64(evens[0], temp0[0]), + vsubq_f64(evens[1], temp1[0]), + vsubq_f64(evens[2], temp2[0]), + vsubq_f64(evens[3], temp3[0]), + vsubq_f64(evens[4], temp0[1]), + vsubq_f64(evens[5], temp1[1]), + vsubq_f64(evens[6], temp2[1]), + vsubq_f64(evens[7], temp3[1]), + ] + } +} + +// _________ _________ _ _ _ +// |___ /___ \ |___ /___ \| |__ (_) |_ +// |_ \ __) | _____ |_ \ __) | '_ \| | __| +// ___) / __/ |_____| ___) / __/| |_) | | |_ +// |____/_____| |____/_____|_.__/|_|\__| +// + +pub struct NeonF32Butterfly32 { + direction: FftDirection, + bf8: NeonF32Butterfly8, + bf16: NeonF32Butterfly16, + rotate90: Rotate90F32, + twiddle01: float32x4_t, + twiddle23: float32x4_t, + twiddle45: float32x4_t, + twiddle67: float32x4_t, + twiddle01conj: float32x4_t, + twiddle23conj: float32x4_t, + twiddle45conj: float32x4_t, + twiddle67conj: float32x4_t, + twiddle1: float32x4_t, + twiddle2: float32x4_t, + twiddle3: float32x4_t, + twiddle4: float32x4_t, + twiddle5: float32x4_t, + twiddle6: float32x4_t, + twiddle7: float32x4_t, + twiddle1c: float32x4_t, + twiddle2c: float32x4_t, + twiddle3c: float32x4_t, + twiddle4c: float32x4_t, + twiddle5c: float32x4_t, + twiddle6c: float32x4_t, + twiddle7c: float32x4_t, +} + +boilerplate_fft_neon_f32_butterfly!(NeonF32Butterfly32, 32, |this: &NeonF32Butterfly32<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF32Butterfly32, 32, |this: &NeonF32Butterfly32<_>| this + .direction); +impl NeonF32Butterfly32 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f32::(); + let bf8 = NeonF32Butterfly8::new(direction); + let bf16 = NeonF32Butterfly16::new(direction); + let rotate90 = if direction == FftDirection::Inverse { + Rotate90F32::new(true) + } else { + Rotate90F32::new(false) + }; + let tw1: Complex = twiddles::compute_twiddle(1, 32, direction); + let tw2: Complex = twiddles::compute_twiddle(2, 32, direction); + let tw3: Complex = twiddles::compute_twiddle(3, 32, direction); + let tw4: Complex = twiddles::compute_twiddle(4, 32, direction); + let tw5: Complex = twiddles::compute_twiddle(5, 32, direction); + let tw6: Complex = twiddles::compute_twiddle(6, 32, direction); + let tw7: Complex = twiddles::compute_twiddle(7, 32, direction); + let twiddle01 = unsafe { vld1q_f32([1.0, 0.0, tw1.re, tw1.im].as_ptr()) }; + let twiddle23 = unsafe { vld1q_f32([tw2.re, tw2.im, tw3.re, tw3.im].as_ptr()) }; + let twiddle45 = unsafe { vld1q_f32([tw4.re, tw4.im, tw5.re, tw5.im].as_ptr()) }; + let twiddle67 = unsafe { vld1q_f32([tw6.re, tw6.im, tw7.re, tw7.im].as_ptr()) }; + let twiddle01conj = unsafe { vld1q_f32([1.0, 0.0, tw1.re, -tw1.im].as_ptr()) }; + let twiddle23conj = unsafe { vld1q_f32([tw2.re, -tw2.im, tw3.re, -tw3.im].as_ptr()) }; + let twiddle45conj = unsafe { vld1q_f32([tw4.re, -tw4.im, tw5.re, -tw5.im].as_ptr()) }; + let twiddle67conj = unsafe { vld1q_f32([tw6.re, -tw6.im, tw7.re, -tw7.im].as_ptr()) }; + let twiddle1 = unsafe { vld1q_f32([tw1.re, tw1.im, tw1.re, tw1.im].as_ptr()) }; + let twiddle2 = unsafe { vld1q_f32([tw2.re, tw2.im, tw2.re, tw2.im].as_ptr()) }; + let twiddle3 = unsafe { vld1q_f32([tw3.re, tw3.im, tw3.re, tw3.im].as_ptr()) }; + let twiddle4 = unsafe { vld1q_f32([tw4.re, 
tw4.im, tw4.re, tw4.im].as_ptr()) }; + let twiddle5 = unsafe { vld1q_f32([tw5.re, tw5.im, tw5.re, tw5.im].as_ptr()) }; + let twiddle6 = unsafe { vld1q_f32([tw6.re, tw6.im, tw6.re, tw6.im].as_ptr()) }; + let twiddle7 = unsafe { vld1q_f32([tw7.re, tw7.im, tw7.re, tw7.im].as_ptr()) }; + let twiddle1c = unsafe { vld1q_f32([tw1.re, -tw1.im, tw1.re, -tw1.im].as_ptr()) }; + let twiddle2c = unsafe { vld1q_f32([tw2.re, -tw2.im, tw2.re, -tw2.im].as_ptr()) }; + let twiddle3c = unsafe { vld1q_f32([tw3.re, -tw3.im, tw3.re, -tw3.im].as_ptr()) }; + let twiddle4c = unsafe { vld1q_f32([tw4.re, -tw4.im, tw4.re, -tw4.im].as_ptr()) }; + let twiddle5c = unsafe { vld1q_f32([tw5.re, -tw5.im, tw5.re, -tw5.im].as_ptr()) }; + let twiddle6c = unsafe { vld1q_f32([tw6.re, -tw6.im, tw6.re, -tw6.im].as_ptr()) }; + let twiddle7c = unsafe { vld1q_f32([tw7.re, -tw7.im, tw7.re, -tw7.im].as_ptr()) }; + Self { + direction, + bf8, + bf16, + rotate90, + twiddle01, + twiddle23, + twiddle45, + twiddle67, + twiddle01conj, + twiddle23conj, + twiddle45conj, + twiddle67conj, + twiddle1, + twiddle2, + twiddle3, + twiddle4, + twiddle5, + twiddle6, + twiddle7, + twiddle1c, + twiddle2c, + twiddle3c, + twiddle4c, + twiddle5c, + twiddle6c, + twiddle7c, + } + } + + #[inline(always)] + unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let input_packed = read_complex_to_array!(input, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 }); + + let out = self.perform_fft_direct(input_packed); + + write_complex_to_array_strided!(out, output, 2, {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let input_packed = read_complex_to_array!(input, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62}); + + let values = interleave_complex_f32!(input_packed, 16, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); + + let out = self.perform_parallel_fft_direct(values); + + let out_sorted = separate_interleaved_complex_f32!(out, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}); + + write_complex_to_array_strided!(out_sorted, output, 2, {0,1,2,3,4,5,6,7,8,9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }); + } + + #[inline(always)] + unsafe fn perform_fft_direct(&self, input: [float32x4_t; 16]) -> [float32x4_t; 16] { + // we're going to hardcode a step of split radix + + // step 1: copy and reorder the input into the scratch + let in0002 = extract_lo_lo_f32(input[0], input[1]); + let in0406 = extract_lo_lo_f32(input[2], input[3]); + let in0810 = extract_lo_lo_f32(input[4], input[5]); + let in1214 = extract_lo_lo_f32(input[6], input[7]); + let in1618 = extract_lo_lo_f32(input[8], input[9]); + let in2022 = extract_lo_lo_f32(input[10], input[11]); + let in2426 = extract_lo_lo_f32(input[12], input[13]); + let in2830 = extract_lo_lo_f32(input[14], input[15]); + + let in0105 = extract_hi_hi_f32(input[0], input[2]); + let in0913 = extract_hi_hi_f32(input[4], input[6]); + let in1721 = extract_hi_hi_f32(input[8], input[10]); + let in2529 = extract_hi_hi_f32(input[12], input[14]); + + let in3103 = extract_hi_hi_f32(input[15], input[1]); + let in0711 = extract_hi_hi_f32(input[3], input[5]); + let in1519 = extract_hi_hi_f32(input[7], input[9]); + let in2327 = extract_hi_hi_f32(input[11], input[13]); + + let in_evens = [ + in0002, in0406, in0810, in1214, 
in1618, in2022, in2426, in2830, + ]; + + // step 2: column FFTs + let evens = self.bf16.perform_fft_direct(in_evens); + let mut odds1 = self + .bf8 + .perform_fft_direct([in0105, in0913, in1721, in2529]); + let mut odds3 = self + .bf8 + .perform_fft_direct([in3103, in0711, in1519, in2327]); + + // step 3: apply twiddle factors + odds1[0] = mul_complex_f32(odds1[0], self.twiddle01); + odds3[0] = mul_complex_f32(odds3[0], self.twiddle01conj); + + odds1[1] = mul_complex_f32(odds1[1], self.twiddle23); + odds3[1] = mul_complex_f32(odds3[1], self.twiddle23conj); + + odds1[2] = mul_complex_f32(odds1[2], self.twiddle45); + odds3[2] = mul_complex_f32(odds3[2], self.twiddle45conj); + + odds1[3] = mul_complex_f32(odds1[3], self.twiddle67); + odds3[3] = mul_complex_f32(odds3[3], self.twiddle67conj); + + // step 4: cross FFTs + let mut temp0 = parallel_fft2_interleaved_f32(odds1[0], odds3[0]); + let mut temp1 = parallel_fft2_interleaved_f32(odds1[1], odds3[1]); + let mut temp2 = parallel_fft2_interleaved_f32(odds1[2], odds3[2]); + let mut temp3 = parallel_fft2_interleaved_f32(odds1[3], odds3[3]); + + // apply the butterfly 4 twiddle factor, which is just a rotation + temp0[1] = self.rotate90.rotate_both(temp0[1]); + temp1[1] = self.rotate90.rotate_both(temp1[1]); + temp2[1] = self.rotate90.rotate_both(temp2[1]); + temp3[1] = self.rotate90.rotate_both(temp3[1]); + + //step 5: copy/add/subtract data back to buffer + [ + vaddq_f32(evens[0], temp0[0]), + vaddq_f32(evens[1], temp1[0]), + vaddq_f32(evens[2], temp2[0]), + vaddq_f32(evens[3], temp3[0]), + vaddq_f32(evens[4], temp0[1]), + vaddq_f32(evens[5], temp1[1]), + vaddq_f32(evens[6], temp2[1]), + vaddq_f32(evens[7], temp3[1]), + vsubq_f32(evens[0], temp0[0]), + vsubq_f32(evens[1], temp1[0]), + vsubq_f32(evens[2], temp2[0]), + vsubq_f32(evens[3], temp3[0]), + vsubq_f32(evens[4], temp0[1]), + vsubq_f32(evens[5], temp1[1]), + vsubq_f32(evens[6], temp2[1]), + vsubq_f32(evens[7], temp3[1]), + ] + } + + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_direct( + &self, + input: [float32x4_t; 32], + ) -> [float32x4_t; 32] { + // we're going to hardcode a step of split radix + + // step 1: copy and reorder the input into the scratch + // and + // step 2: column FFTs + let evens = self.bf16.perform_parallel_fft_direct([ + input[0], input[2], input[4], input[6], input[8], input[10], input[12], input[14], + input[16], input[18], input[20], input[22], input[24], input[26], input[28], input[30], + ]); + let mut odds1 = self.bf8.perform_parallel_fft_direct([ + input[1], input[5], input[9], input[13], input[17], input[21], input[25], input[29], + ]); + let mut odds3 = self.bf8.perform_parallel_fft_direct([ + input[31], input[3], input[7], input[11], input[15], input[19], input[23], input[27], + ]); + + // step 3: apply twiddle factors + odds1[1] = mul_complex_f32(odds1[1], self.twiddle1); + odds3[1] = mul_complex_f32(odds3[1], self.twiddle1c); + + odds1[2] = mul_complex_f32(odds1[2], self.twiddle2); + odds3[2] = mul_complex_f32(odds3[2], self.twiddle2c); + + odds1[3] = mul_complex_f32(odds1[3], self.twiddle3); + odds3[3] = mul_complex_f32(odds3[3], self.twiddle3c); + + odds1[4] = mul_complex_f32(odds1[4], self.twiddle4); + odds3[4] = mul_complex_f32(odds3[4], self.twiddle4c); + + odds1[5] = mul_complex_f32(odds1[5], self.twiddle5); + odds3[5] = mul_complex_f32(odds3[5], self.twiddle5c); + + odds1[6] = mul_complex_f32(odds1[6], self.twiddle6); + odds3[6] = mul_complex_f32(odds3[6], self.twiddle6c); + + odds1[7] = mul_complex_f32(odds1[7], self.twiddle7); + 
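As with the size-16 butterflies, the size-32 code splits the input into one half-length FFT of the even samples and two quarter-length FFTs of the odd samples. The hard-coded index lists above, including `input[31]` leading the second odd branch, follow from the standard split-radix index sets; `split_radix_index_sets` is an illustrative helper, not crate code:

```rust
// Index sets fed to the sub-FFTs for a power-of-two size n (here n = 32).
// The second odd branch uses x[4m - 1 mod n], so its first element is
// x[n - 1] = x[31].
fn split_radix_index_sets(n: usize) -> (Vec<usize>, Vec<usize>, Vec<usize>) {
    let evens: Vec<usize> = (0..n / 2).map(|m| 2 * m).collect();
    let odds1: Vec<usize> = (0..n / 4).map(|m| 4 * m + 1).collect();
    let odds3: Vec<usize> = (0..n / 4).map(|m| (4 * m + n - 1) % n).collect();
    (evens, odds1, odds3)
}
// split_radix_index_sets(32).2 == [31, 3, 7, 11, 15, 19, 23, 27]
```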
odds3[7] = mul_complex_f32(odds3[7], self.twiddle7c); + + // step 4: cross FFTs + let mut temp0 = parallel_fft2_interleaved_f32(odds1[0], odds3[0]); + let mut temp1 = parallel_fft2_interleaved_f32(odds1[1], odds3[1]); + let mut temp2 = parallel_fft2_interleaved_f32(odds1[2], odds3[2]); + let mut temp3 = parallel_fft2_interleaved_f32(odds1[3], odds3[3]); + let mut temp4 = parallel_fft2_interleaved_f32(odds1[4], odds3[4]); + let mut temp5 = parallel_fft2_interleaved_f32(odds1[5], odds3[5]); + let mut temp6 = parallel_fft2_interleaved_f32(odds1[6], odds3[6]); + let mut temp7 = parallel_fft2_interleaved_f32(odds1[7], odds3[7]); + + // apply the butterfly 4 twiddle factor, which is just a rotation + temp0[1] = self.rotate90.rotate_both(temp0[1]); + temp1[1] = self.rotate90.rotate_both(temp1[1]); + temp2[1] = self.rotate90.rotate_both(temp2[1]); + temp3[1] = self.rotate90.rotate_both(temp3[1]); + temp4[1] = self.rotate90.rotate_both(temp4[1]); + temp5[1] = self.rotate90.rotate_both(temp5[1]); + temp6[1] = self.rotate90.rotate_both(temp6[1]); + temp7[1] = self.rotate90.rotate_both(temp7[1]); + + //step 5: copy/add/subtract data back to buffer + [ + vaddq_f32(evens[0], temp0[0]), + vaddq_f32(evens[1], temp1[0]), + vaddq_f32(evens[2], temp2[0]), + vaddq_f32(evens[3], temp3[0]), + vaddq_f32(evens[4], temp4[0]), + vaddq_f32(evens[5], temp5[0]), + vaddq_f32(evens[6], temp6[0]), + vaddq_f32(evens[7], temp7[0]), + vaddq_f32(evens[8], temp0[1]), + vaddq_f32(evens[9], temp1[1]), + vaddq_f32(evens[10], temp2[1]), + vaddq_f32(evens[11], temp3[1]), + vaddq_f32(evens[12], temp4[1]), + vaddq_f32(evens[13], temp5[1]), + vaddq_f32(evens[14], temp6[1]), + vaddq_f32(evens[15], temp7[1]), + vsubq_f32(evens[0], temp0[0]), + vsubq_f32(evens[1], temp1[0]), + vsubq_f32(evens[2], temp2[0]), + vsubq_f32(evens[3], temp3[0]), + vsubq_f32(evens[4], temp4[0]), + vsubq_f32(evens[5], temp5[0]), + vsubq_f32(evens[6], temp6[0]), + vsubq_f32(evens[7], temp7[0]), + vsubq_f32(evens[8], temp0[1]), + vsubq_f32(evens[9], temp1[1]), + vsubq_f32(evens[10], temp2[1]), + vsubq_f32(evens[11], temp3[1]), + vsubq_f32(evens[12], temp4[1]), + vsubq_f32(evens[13], temp5[1]), + vsubq_f32(evens[14], temp6[1]), + vsubq_f32(evens[15], temp7[1]), + ] + } +} + +// _________ __ _ _ _ _ _ +// |___ /___ \ / /_ | || | | |__ (_) |_ +// |_ \ __) | _____ | '_ \| || |_| '_ \| | __| +// ___) / __/ |_____| | (_) |__ _| |_) | | |_ +// |____/_____| \___/ |_| |_.__/|_|\__| +// + +pub struct NeonF64Butterfly32 { + direction: FftDirection, + bf8: NeonF64Butterfly8, + bf16: NeonF64Butterfly16, + rotate90: Rotate90F64, + twiddle1: float64x2_t, + twiddle2: float64x2_t, + twiddle3: float64x2_t, + twiddle4: float64x2_t, + twiddle5: float64x2_t, + twiddle6: float64x2_t, + twiddle7: float64x2_t, + twiddle1c: float64x2_t, + twiddle2c: float64x2_t, + twiddle3c: float64x2_t, + twiddle4c: float64x2_t, + twiddle5c: float64x2_t, + twiddle6c: float64x2_t, + twiddle7c: float64x2_t, +} + +boilerplate_fft_neon_f64_butterfly!(NeonF64Butterfly32, 32, |this: &NeonF64Butterfly32<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF64Butterfly32, 32, |this: &NeonF64Butterfly32<_>| this + .direction); +impl NeonF64Butterfly32 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f64::(); + let bf8 = NeonF64Butterfly8::new(direction); + let bf16 = NeonF64Butterfly16::new(direction); + let rotate90 = if direction == FftDirection::Inverse { + Rotate90F64::new(true) + } else { + Rotate90F64::new(false) + }; + let twiddle1 = unsafe { + 
vld1q_f64(&twiddles::compute_twiddle::(1, 32, direction) as *const _ as *const f64) + }; + let twiddle2 = unsafe { + vld1q_f64(&twiddles::compute_twiddle::(2, 32, direction) as *const _ as *const f64) + }; + let twiddle3 = unsafe { + vld1q_f64(&twiddles::compute_twiddle::(3, 32, direction) as *const _ as *const f64) + }; + let twiddle4 = unsafe { + vld1q_f64(&twiddles::compute_twiddle::(4, 32, direction) as *const _ as *const f64) + }; + let twiddle5 = unsafe { + vld1q_f64(&twiddles::compute_twiddle::(5, 32, direction) as *const _ as *const f64) + }; + let twiddle6 = unsafe { + vld1q_f64(&twiddles::compute_twiddle::(6, 32, direction) as *const _ as *const f64) + }; + let twiddle7 = unsafe { + vld1q_f64(&twiddles::compute_twiddle::(7, 32, direction) as *const _ as *const f64) + }; + let twiddle1c = unsafe { + vld1q_f64( + &twiddles::compute_twiddle::(1, 32, direction).conj() as *const _ + as *const f64, + ) + }; + let twiddle2c = unsafe { + vld1q_f64( + &twiddles::compute_twiddle::(2, 32, direction).conj() as *const _ + as *const f64, + ) + }; + let twiddle3c = unsafe { + vld1q_f64( + &twiddles::compute_twiddle::(3, 32, direction).conj() as *const _ + as *const f64, + ) + }; + let twiddle4c = unsafe { + vld1q_f64( + &twiddles::compute_twiddle::(4, 32, direction).conj() as *const _ + as *const f64, + ) + }; + let twiddle5c = unsafe { + vld1q_f64( + &twiddles::compute_twiddle::(5, 32, direction).conj() as *const _ + as *const f64, + ) + }; + let twiddle6c = unsafe { + vld1q_f64( + &twiddles::compute_twiddle::(6, 32, direction).conj() as *const _ + as *const f64, + ) + }; + let twiddle7c = unsafe { + vld1q_f64( + &twiddles::compute_twiddle::(7, 32, direction).conj() as *const _ + as *const f64, + ) + }; + + Self { + direction, + bf8, + bf16, + rotate90, + twiddle1, + twiddle2, + twiddle3, + twiddle4, + twiddle5, + twiddle6, + twiddle7, + twiddle1c, + twiddle2c, + twiddle3c, + twiddle4c, + twiddle5c, + twiddle6c, + twiddle7c, + } + } + + #[inline(always)] + unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let values = read_complex_to_array!(input, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); + + let out = self.perform_fft_direct(values); + + write_complex_to_array!(out, output, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); + } + + #[inline(always)] + unsafe fn perform_fft_direct(&self, input: [float64x2_t; 32]) -> [float64x2_t; 32] { + // we're going to hardcode a step of split radix + + // step 1: copy and reorder the input into the scratch + // and + // step 2: column FFTs + let evens = self.bf16.perform_fft_direct([ + input[0], input[2], input[4], input[6], input[8], input[10], input[12], input[14], + input[16], input[18], input[20], input[22], input[24], input[26], input[28], input[30], + ]); + let mut odds1 = self.bf8.perform_fft_direct([ + input[1], input[5], input[9], input[13], input[17], input[21], input[25], input[29], + ]); + let mut odds3 = self.bf8.perform_fft_direct([ + input[31], input[3], input[7], input[11], input[15], input[19], input[23], input[27], + ]); + + // step 3: apply twiddle factors + odds1[1] = mul_complex_f64(odds1[1], self.twiddle1); + odds3[1] = mul_complex_f64(odds3[1], self.twiddle1c); + + odds1[2] = mul_complex_f64(odds1[2], self.twiddle2); + odds3[2] = mul_complex_f64(odds3[2], self.twiddle2c); + + odds1[3] = mul_complex_f64(odds1[3], self.twiddle3); + odds3[3] = 
mul_complex_f64(odds3[3], self.twiddle3c); + + odds1[4] = mul_complex_f64(odds1[4], self.twiddle4); + odds3[4] = mul_complex_f64(odds3[4], self.twiddle4c); + + odds1[5] = mul_complex_f64(odds1[5], self.twiddle5); + odds3[5] = mul_complex_f64(odds3[5], self.twiddle5c); + + odds1[6] = mul_complex_f64(odds1[6], self.twiddle6); + odds3[6] = mul_complex_f64(odds3[6], self.twiddle6c); + + odds1[7] = mul_complex_f64(odds1[7], self.twiddle7); + odds3[7] = mul_complex_f64(odds3[7], self.twiddle7c); + + // step 4: cross FFTs + let mut temp0 = solo_fft2_f64(odds1[0], odds3[0]); + let mut temp1 = solo_fft2_f64(odds1[1], odds3[1]); + let mut temp2 = solo_fft2_f64(odds1[2], odds3[2]); + let mut temp3 = solo_fft2_f64(odds1[3], odds3[3]); + let mut temp4 = solo_fft2_f64(odds1[4], odds3[4]); + let mut temp5 = solo_fft2_f64(odds1[5], odds3[5]); + let mut temp6 = solo_fft2_f64(odds1[6], odds3[6]); + let mut temp7 = solo_fft2_f64(odds1[7], odds3[7]); + + // apply the butterfly 4 twiddle factor, which is just a rotation + temp0[1] = self.rotate90.rotate(temp0[1]); + temp1[1] = self.rotate90.rotate(temp1[1]); + temp2[1] = self.rotate90.rotate(temp2[1]); + temp3[1] = self.rotate90.rotate(temp3[1]); + temp4[1] = self.rotate90.rotate(temp4[1]); + temp5[1] = self.rotate90.rotate(temp5[1]); + temp6[1] = self.rotate90.rotate(temp6[1]); + temp7[1] = self.rotate90.rotate(temp7[1]); + + //step 5: copy/add/subtract data back to buffer + [ + vaddq_f64(evens[0], temp0[0]), + vaddq_f64(evens[1], temp1[0]), + vaddq_f64(evens[2], temp2[0]), + vaddq_f64(evens[3], temp3[0]), + vaddq_f64(evens[4], temp4[0]), + vaddq_f64(evens[5], temp5[0]), + vaddq_f64(evens[6], temp6[0]), + vaddq_f64(evens[7], temp7[0]), + vaddq_f64(evens[8], temp0[1]), + vaddq_f64(evens[9], temp1[1]), + vaddq_f64(evens[10], temp2[1]), + vaddq_f64(evens[11], temp3[1]), + vaddq_f64(evens[12], temp4[1]), + vaddq_f64(evens[13], temp5[1]), + vaddq_f64(evens[14], temp6[1]), + vaddq_f64(evens[15], temp7[1]), + vsubq_f64(evens[0], temp0[0]), + vsubq_f64(evens[1], temp1[0]), + vsubq_f64(evens[2], temp2[0]), + vsubq_f64(evens[3], temp3[0]), + vsubq_f64(evens[4], temp4[0]), + vsubq_f64(evens[5], temp5[0]), + vsubq_f64(evens[6], temp6[0]), + vsubq_f64(evens[7], temp7[0]), + vsubq_f64(evens[8], temp0[1]), + vsubq_f64(evens[9], temp1[1]), + vsubq_f64(evens[10], temp2[1]), + vsubq_f64(evens[11], temp3[1]), + vsubq_f64(evens[12], temp4[1]), + vsubq_f64(evens[13], temp5[1]), + vsubq_f64(evens[14], temp6[1]), + vsubq_f64(evens[15], temp7[1]), + ] + } +} + +#[cfg(test)] +mod unit_tests { + use super::*; + use crate::algorithm::Dft; + use crate::test_utils::{check_fft_algorithm, compare_vectors}; + + //the tests for all butterflies will be identical except for the identifiers used and size + //so it's ideal for a macro + macro_rules! 
test_butterfly_32_func { + ($test_name:ident, $struct_name:ident, $size:expr) => { + #[test] + fn $test_name() { + let butterfly = $struct_name::new(FftDirection::Forward); + check_fft_algorithm::(&butterfly, $size, FftDirection::Forward); + + let butterfly_direction = $struct_name::new(FftDirection::Inverse); + check_fft_algorithm::(&butterfly_direction, $size, FftDirection::Inverse); + } + }; + } + test_butterfly_32_func!(test_neonf32_butterfly2, NeonF32Butterfly2, 2); + test_butterfly_32_func!(test_neonf32_butterfly3, NeonF32Butterfly3, 3); + test_butterfly_32_func!(test_neonf32_butterfly4, NeonF32Butterfly4, 4); + test_butterfly_32_func!(test_neonf32_butterfly5, NeonF32Butterfly5, 5); + test_butterfly_32_func!(test_neonf32_butterfly6, NeonF32Butterfly6, 6); + test_butterfly_32_func!(test_neonf32_butterfly8, NeonF32Butterfly8, 8); + test_butterfly_32_func!(test_neonf32_butterfly9, NeonF32Butterfly9, 9); + test_butterfly_32_func!(test_neonf32_butterfly10, NeonF32Butterfly10, 10); + test_butterfly_32_func!(test_neonf32_butterfly12, NeonF32Butterfly12, 12); + test_butterfly_32_func!(test_neonf32_butterfly15, NeonF32Butterfly15, 15); + test_butterfly_32_func!(test_neonf32_butterfly16, NeonF32Butterfly16, 16); + test_butterfly_32_func!(test_neonf32_butterfly32, NeonF32Butterfly32, 32); + + //the tests for all butterflies will be identical except for the identifiers used and size + //so it's ideal for a macro + macro_rules! test_butterfly_64_func { + ($test_name:ident, $struct_name:ident, $size:expr) => { + #[test] + fn $test_name() { + let butterfly = $struct_name::new(FftDirection::Forward); + check_fft_algorithm::(&butterfly, $size, FftDirection::Forward); + + let butterfly_direction = $struct_name::new(FftDirection::Inverse); + check_fft_algorithm::(&butterfly_direction, $size, FftDirection::Inverse); + } + }; + } + test_butterfly_64_func!(test_neonf64_butterfly2, NeonF64Butterfly2, 2); + test_butterfly_64_func!(test_neonf64_butterfly3, NeonF64Butterfly3, 3); + test_butterfly_64_func!(test_neonf64_butterfly4, NeonF64Butterfly4, 4); + test_butterfly_64_func!(test_neonf64_butterfly5, NeonF64Butterfly5, 5); + test_butterfly_64_func!(test_neonf64_butterfly6, NeonF64Butterfly6, 6); + test_butterfly_64_func!(test_neonf64_butterfly8, NeonF64Butterfly8, 8); + test_butterfly_64_func!(test_neonf64_butterfly9, NeonF64Butterfly9, 9); + test_butterfly_64_func!(test_neonf64_butterfly10, NeonF64Butterfly10, 10); + test_butterfly_64_func!(test_neonf64_butterfly12, NeonF64Butterfly12, 12); + test_butterfly_64_func!(test_neonf64_butterfly15, NeonF64Butterfly15, 15); + test_butterfly_64_func!(test_neonf64_butterfly16, NeonF64Butterfly16, 16); + test_butterfly_64_func!(test_neonf64_butterfly32, NeonF64Butterfly32, 32); + + #[test] + fn test_solo_fft2_32() { + unsafe { + let val1 = Complex::::new(1.0, 2.5); + let val2 = Complex::::new(3.2, 4.2); + + let mut val = vec![val1, val2]; + + let in_packed = vld1q_f32(val.as_ptr() as *const f32); + + let dft = Dft::new(2, FftDirection::Forward); + + let bf2 = NeonF32Butterfly2::::new(FftDirection::Forward); + + dft.process(&mut val); + let res_packed = bf2.perform_fft_direct(in_packed); + + let res = std::mem::transmute::; 2]>(res_packed); + assert_eq!(val[0], res[0]); + assert_eq!(val[1], res[1]); + } + } + + #[test] + fn test_parallel_fft2_32() { + unsafe { + let val_a1 = Complex::::new(1.0, 2.5); + let val_a2 = Complex::::new(3.2, 4.2); + + let val_b1 = Complex::::new(6.0, 24.5); + let val_b2 = Complex::::new(4.3, 34.2); + + let mut val_a = vec![val_a1, val_a2]; + 
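These tests, like the `vld1q_*` loads from `&Complex<T>` pointers earlier in the file, lean on the same layout fact: `num_complex::Complex<T>` is `#[repr(C)]` with `re` followed by `im`, so two `Complex<f32>` values occupy exactly the 16 bytes of a `float32x4_t` (and one `Complex<f64>` matches a `float64x2_t`). A hedged round-trip sketch of that assumption; `round_trip` is illustrative only:

```rust
use core::arch::aarch64::*;
use num_complex::Complex;

// Load two Complex<f32> into a vector and view the result back as complex
// numbers, relying on the [re0, im0, re1, im1] lane layout.
unsafe fn round_trip(pair: [Complex<f32>; 2]) -> [Complex<f32>; 2] {
    let v: float32x4_t = vld1q_f32(pair.as_ptr() as *const f32);
    std::mem::transmute::<float32x4_t, [Complex<f32>; 2]>(v)
}
```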
let mut val_b = vec![val_b1, val_b2]; + + let p1 = vld1q_f32(val_a.as_ptr() as *const f32); + let p2 = vld1q_f32(val_b.as_ptr() as *const f32); + + let dft = Dft::new(2, FftDirection::Forward); + + let bf2 = NeonF32Butterfly2::::new(FftDirection::Forward); + + dft.process(&mut val_a); + dft.process(&mut val_b); + let res_both = bf2.perform_parallel_fft_direct(p1, p2); + + let res = std::mem::transmute::<[float32x4_t; 2], [Complex; 4]>(res_both); + let neon_res_a = [res[0], res[2]]; + let neon_res_b = [res[1], res[3]]; + assert!(compare_vectors(&val_a, &neon_res_a)); + assert!(compare_vectors(&val_b, &neon_res_b)); + } + } +} diff --git a/src/neon/neon_common.rs b/src/neon/neon_common.rs new file mode 100644 index 00000000..22f68241 --- /dev/null +++ b/src/neon/neon_common.rs @@ -0,0 +1,346 @@ +use std::any::TypeId; + +// Calculate the sum of an expression consisting of just plus and minus, like `value = a + b - c + d`. +// The expression is rewritten to `value = a + (b - (c - d))` (note the flipped sign on d). +// After this the `$add` and `$sub` functions are used to make the calculation. +// For f32 using `_mm_add_ps` and `_mm_sub_ps`, the expression `value = a + b - c + d` becomes: +// ```let value = _mm_add_ps(a, _mm_sub_ps(b, _mm_sub_ps(c, d)));``` +// Only plus and minus are supported, and all the terms must be plain scalar variables. +// Using array indices, like `value = temp[0] + temp[1]` is not supported. +macro_rules! calc_sum { + ($add:ident, $sub:ident, + $acc:tt + $($rest:tt)*)=> { + $add($acc, calc_sum!($add, $sub, + $($rest)*)) + }; + ($add:ident, $sub:ident, + $acc:tt - $($rest:tt)*)=> { + $sub($acc, calc_sum!($add, $sub, - $($rest)*)) + }; + ($add:ident, $sub:ident, - $acc:tt + $($rest:tt)*)=> { + $sub($acc, calc_sum!($add, $sub, + $($rest)*)) + }; + ($add:ident, $sub:ident, - $acc:tt - $($rest:tt)*)=> { + $add($acc, calc_sum!($add, $sub, - $($rest)*)) + }; + ($add:ident, $sub:ident, $acc:tt + $($rest:tt)*)=> { + $add($acc, calc_sum!($add, $sub, + $($rest)*)) + }; + ($add:ident, $sub:ident, $acc:tt - $($rest:tt)*)=> { + $sub($acc, calc_sum!($add, $sub, - $($rest)*)) + }; + ($add:ident, $sub:ident, + $val:tt) => {$val}; + ($add:ident, $sub:ident, - $val:tt) => {$val}; +} + +// Calculate the sum of an expression consisting of just plus and minus, like a + b - c + d +macro_rules! calc_f32 { + ($($tokens:tt)*) => { calc_sum!(vaddq_f32, vsubq_f32, $($tokens)*)}; +} + +// Calculate the sum of an expression consisting of just plus and minus, like a + b - c + d +macro_rules! calc_f64 { + ($($tokens:tt)*) => { calc_sum!(vaddq_f64, vsubq_f64, $($tokens)*)}; +} + +// Helper function to assert we have the right float type +pub fn assert_f32() { + let id_f32 = TypeId::of::(); + let id_t = TypeId::of::(); + assert!(id_t == id_f32, "Wrong float type, must be f32"); +} + +// Helper function to assert we have the right float type +pub fn assert_f64() { + let id_f64 = TypeId::of::(); + let id_t = TypeId::of::(); + assert!(id_t == id_f64, "Wrong float type, must be f64"); +} + +// Shuffle elements to interleave two contiguous sets of f32, from an array of simd vectors to a new array of simd vectors +macro_rules! 
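The `calc_sum!`/`calc_f32!` helpers above fold a plus/minus expression into nested Neon add/sub intrinsics, flipping the sign of trailing terms as the comment describes. A worked expansion, shown as an equivalent hand-written function (illustrative only):

```rust
use core::arch::aarch64::*;

// calc_f32!(a + b - c + d) rewrites the expression as a + (b - (c - d)) and
// emits one intrinsic per operator:
unsafe fn expanded(
    a: float32x4_t,
    b: float32x4_t,
    c: float32x4_t,
    d: float32x4_t,
) -> float32x4_t {
    vaddq_f32(a, vsubq_f32(b, vsubq_f32(c, d)))
}
```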
interleave_complex_f32 { + ($input:ident, $offset:literal, { $($idx:literal),* }) => { + [ + $( + extract_lo_lo_f32($input[$idx], $input[$idx+$offset]), + extract_hi_hi_f32($input[$idx], $input[$idx+$offset]), + )* + ] + } +} + +// Shuffle elements to interleave two contiguous sets of f32, from an array of simd vectors to a new array of simd vectors +// This statement: +// ``` +// let values = separate_interleaved_complex_f32!(input, {0, 2, 4}); +// ``` +// is equivalent to: +// ``` +// let values = [ +// extract_lo_lo_f32(input[0], input[1]), +// extract_lo_lo_f32(input[2], input[3]), +// extract_lo_lo_f32(input[4], input[5]), +// extract_hi_hi_f32(input[0], input[1]), +// extract_hi_hi_f32(input[2], input[3]), +// extract_hi_hi_f32(input[4], input[5]), +// ]; +macro_rules! separate_interleaved_complex_f32 { + ($input:ident, { $($idx:literal),* }) => { + [ + $( + extract_lo_lo_f32($input[$idx], $input[$idx+1]), + )* + $( + extract_hi_hi_f32($input[$idx], $input[$idx+1]), + )* + ] + } +} + +macro_rules! boilerplate_fft_neon_oop { + ($struct_name:ident, $len_fn:expr) => { + impl Fft for $struct_name { + fn process_outofplace_with_scratch( + &self, + input: &mut [Complex], + output: &mut [Complex], + _scratch: &mut [Complex], + ) { + if self.len() == 0 { + return; + } + + if input.len() < self.len() || output.len() != input.len() { + // We want to trigger a panic, but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0); + return; // Unreachable, because fft_error_outofplace asserts, but it helps codegen to put it here + } + + let result = unsafe { + array_utils::iter_chunks_zipped( + input, + output, + self.len(), + |in_chunk, out_chunk| { + self.perform_fft_out_of_place(in_chunk, out_chunk, &mut []) + }, + ) + }; + + if result.is_err() { + // We want to trigger a panic, because the buffer sizes weren't cleanly divisible by the FFT size, + // but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0); + } + } + fn process_with_scratch(&self, buffer: &mut [Complex], scratch: &mut [Complex]) { + if self.len() == 0 { + return; + } + + let required_scratch = self.get_inplace_scratch_len(); + if scratch.len() < required_scratch || buffer.len() < self.len() { + // We want to trigger a panic, but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_inplace( + self.len(), + buffer.len(), + self.get_inplace_scratch_len(), + scratch.len(), + ); + return; // Unreachable, because fft_error_inplace asserts, but it helps codegen to put it here + } + + let scratch = &mut scratch[..required_scratch]; + let result = unsafe { + array_utils::iter_chunks(buffer, self.len(), |chunk| { + self.perform_fft_out_of_place(chunk, scratch, &mut []); + chunk.copy_from_slice(scratch); + }) + }; + if result.is_err() { + // We want to trigger a panic, because the buffer sizes weren't cleanly divisible by the FFT size, + // but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_inplace( + self.len(), + buffer.len(), + self.get_inplace_scratch_len(), + scratch.len(), + ); + } + } + #[inline(always)] + fn get_inplace_scratch_len(&self) -> 
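A worked expansion of `interleave_complex_f32!` as defined above: each listed index is paired with the vector `offset` slots later, and a lo/lo plus hi/hi shuffle is emitted per pair, while `separate_interleaved_complex_f32!` performs the reverse regrouping. The `extract_*` calls below are this module's own helpers; the snippet only shows what the macro expands to:

```rust
// Expansion of `interleave_complex_f32!(input, 2, {0, 1})` per the macro above:
let values = [
    extract_lo_lo_f32(input[0], input[2]),
    extract_hi_hi_f32(input[0], input[2]),
    extract_lo_lo_f32(input[1], input[3]),
    extract_hi_hi_f32(input[1], input[3]),
];
```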
usize { + self.len() + } + #[inline(always)] + fn get_outofplace_scratch_len(&self) -> usize { + 0 + } + } + impl Length for $struct_name { + #[inline(always)] + fn len(&self) -> usize { + $len_fn(self) + } + } + impl Direction for $struct_name { + #[inline(always)] + fn fft_direction(&self) -> FftDirection { + self.direction + } + } + }; +} + +/* Not used now, but maybe later for the mixed radixes etc +macro_rules! boilerplate_sse_fft { + ($struct_name:ident, $len_fn:expr, $inplace_scratch_len_fn:expr, $out_of_place_scratch_len_fn:expr) => { + impl Fft for $struct_name { + fn process_outofplace_with_scratch( + &self, + input: &mut [Complex], + output: &mut [Complex], + scratch: &mut [Complex], + ) { + if self.len() == 0 { + return; + } + + let required_scratch = self.get_outofplace_scratch_len(); + if scratch.len() < required_scratch + || input.len() < self.len() + || output.len() != input.len() + { + // We want to trigger a panic, but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_outofplace( + self.len(), + input.len(), + output.len(), + self.get_outofplace_scratch_len(), + scratch.len(), + ); + return; // Unreachable, because fft_error_outofplace asserts, but it helps codegen to put it here + } + + let scratch = &mut scratch[..required_scratch]; + let result = array_utils::iter_chunks_zipped( + input, + output, + self.len(), + |in_chunk, out_chunk| { + self.perform_fft_out_of_place(in_chunk, out_chunk, scratch) + }, + ); + + if result.is_err() { + // We want to trigger a panic, because the buffer sizes weren't cleanly divisible by the FFT size, + // but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_outofplace( + self.len(), + input.len(), + output.len(), + self.get_outofplace_scratch_len(), + scratch.len(), + ); + } + } + fn process_with_scratch(&self, buffer: &mut [Complex], scratch: &mut [Complex]) { + if self.len() == 0 { + return; + } + + let required_scratch = self.get_inplace_scratch_len(); + if scratch.len() < required_scratch || buffer.len() < self.len() { + // We want to trigger a panic, but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_inplace( + self.len(), + buffer.len(), + self.get_inplace_scratch_len(), + scratch.len(), + ); + return; // Unreachable, because fft_error_inplace asserts, but it helps codegen to put it here + } + + let scratch = &mut scratch[..required_scratch]; + let result = array_utils::iter_chunks(buffer, self.len(), |chunk| { + self.perform_fft_inplace(chunk, scratch) + }); + + if result.is_err() { + // We want to trigger a panic, because the buffer sizes weren't cleanly divisible by the FFT size, + // but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_inplace( + self.len(), + buffer.len(), + self.get_inplace_scratch_len(), + scratch.len(), + ); + } + } + #[inline(always)] + fn get_inplace_scratch_len(&self) -> usize { + $inplace_scratch_len_fn(self) + } + #[inline(always)] + fn get_outofplace_scratch_len(&self) -> usize { + $out_of_place_scratch_len_fn(self) + } + } + impl Length for $struct_name { + #[inline(always)] + fn len(&self) -> usize { + $len_fn(self) + } + } + impl Direction for $struct_name { + #[inline(always)] + fn fft_direction(&self) -> 
FftDirection { + self.direction + } + } + }; +} +*/ + +#[cfg(test)] +mod unit_tests { + use core::arch::aarch64::*; + + #[test] + fn test_calc_f32() { + unsafe { + let a = vld1q_f32([1.0, 1.0, 1.0, 1.0].as_ptr()); + let b = vld1q_f32([2.0, 2.0, 2.0, 2.0].as_ptr()); + let c = vld1q_f32([3.0, 3.0, 3.0, 3.0].as_ptr()); + let d = vld1q_f32([4.0, 4.0, 4.0, 4.0].as_ptr()); + let e = vld1q_f32([5.0, 5.0, 5.0, 5.0].as_ptr()); + let f = vld1q_f32([6.0, 6.0, 6.0, 6.0].as_ptr()); + let g = vld1q_f32([7.0, 7.0, 7.0, 7.0].as_ptr()); + let h = vld1q_f32([8.0, 8.0, 8.0, 8.0].as_ptr()); + let i = vld1q_f32([9.0, 9.0, 9.0, 9.0].as_ptr()); + let expected: f32 = 1.0 + 2.0 - 3.0 + 4.0 - 5.0 + 6.0 - 7.0 - 8.0 + 9.0; + let res = calc_f32!(a + b - c + d - e + f - g - h + i); + let sum = std::mem::transmute::(res); + assert_eq!(sum[0], expected); + assert_eq!(sum[1], expected); + assert_eq!(sum[2], expected); + assert_eq!(sum[3], expected); + } + } + #[test] + fn test_calc_f64() { + unsafe { + let a = vld1q_f64([1.0, 1.0].as_ptr()); + let b = vld1q_f64([2.0, 2.0].as_ptr()); + let c = vld1q_f64([3.0, 3.0].as_ptr()); + let d = vld1q_f64([4.0, 4.0].as_ptr()); + let e = vld1q_f64([5.0, 5.0].as_ptr()); + let f = vld1q_f64([6.0, 6.0].as_ptr()); + let g = vld1q_f64([7.0, 7.0].as_ptr()); + let h = vld1q_f64([8.0, 8.0].as_ptr()); + let i = vld1q_f64([9.0, 9.0].as_ptr()); + let expected: f64 = 1.0 + 2.0 - 3.0 + 4.0 - 5.0 + 6.0 - 7.0 - 8.0 + 9.0; + let res = calc_f64!(a + b - c + d - e + f - g - h + i); + let sum = std::mem::transmute::(res); + assert_eq!(sum[0], expected); + assert_eq!(sum[1], expected); + } + } +} diff --git a/src/neon/neon_planner.rs b/src/neon/neon_planner.rs new file mode 100644 index 00000000..d6d39fc8 --- /dev/null +++ b/src/neon/neon_planner.rs @@ -0,0 +1,836 @@ +use num_integer::gcd; +use std::any::TypeId; +use std::collections::HashMap; + +use std::sync::Arc; + +use crate::{common::FftNum, fft_cache::FftCache, FftDirection}; + +use crate::algorithm::*; +use crate::neon::neon_butterflies::*; +use crate::neon::neon_prime_butterflies::*; +use crate::neon::neon_radix4::*; +use crate::Fft; + +use crate::math_utils::{PrimeFactor, PrimeFactors}; + +const MIN_RADIX4_BITS: u32 = 6; // smallest size to consider radix 4 an option is 2^6 = 64 +const MAX_RADER_PRIME_FACTOR: usize = 23; // don't use Raders if the inner fft length has prime factor larger than this +const MIN_BLUESTEIN_MIXED_RADIX_LEN: usize = 90; // only use mixed radix for the inner fft of Bluestein if length is larger than this + +/// A Recipe is a structure that describes the design of a FFT, without actually creating it. +/// It is used as a middle step in the planning process. 
+#[derive(Debug, PartialEq, Clone)] +pub enum Recipe { + Dft(usize), + MixedRadix { + left_fft: Arc, + right_fft: Arc, + }, + #[allow(dead_code)] + GoodThomasAlgorithm { + left_fft: Arc, + right_fft: Arc, + }, + MixedRadixSmall { + left_fft: Arc, + right_fft: Arc, + }, + GoodThomasAlgorithmSmall { + left_fft: Arc, + right_fft: Arc, + }, + RadersAlgorithm { + inner_fft: Arc, + }, + BluesteinsAlgorithm { + len: usize, + inner_fft: Arc, + }, + Radix4(usize), + Butterfly1, + Butterfly2, + Butterfly3, + Butterfly4, + Butterfly5, + Butterfly6, + Butterfly7, + Butterfly8, + Butterfly9, + Butterfly10, + Butterfly11, + Butterfly12, + Butterfly13, + Butterfly15, + Butterfly16, + Butterfly17, + Butterfly19, + Butterfly23, + Butterfly29, + Butterfly31, + Butterfly32, +} + +impl Recipe { + pub fn len(&self) -> usize { + match self { + Recipe::Dft(length) => *length, + Recipe::Radix4(length) => *length, + Recipe::Butterfly1 => 1, + Recipe::Butterfly2 => 2, + Recipe::Butterfly3 => 3, + Recipe::Butterfly4 => 4, + Recipe::Butterfly5 => 5, + Recipe::Butterfly6 => 6, + Recipe::Butterfly7 => 7, + Recipe::Butterfly8 => 8, + Recipe::Butterfly9 => 9, + Recipe::Butterfly10 => 10, + Recipe::Butterfly11 => 11, + Recipe::Butterfly12 => 12, + Recipe::Butterfly13 => 13, + Recipe::Butterfly15 => 15, + Recipe::Butterfly16 => 16, + Recipe::Butterfly17 => 17, + Recipe::Butterfly19 => 19, + Recipe::Butterfly23 => 23, + Recipe::Butterfly29 => 29, + Recipe::Butterfly31 => 31, + Recipe::Butterfly32 => 32, + Recipe::MixedRadix { + left_fft, + right_fft, + } => left_fft.len() * right_fft.len(), + Recipe::GoodThomasAlgorithm { + left_fft, + right_fft, + } => left_fft.len() * right_fft.len(), + Recipe::MixedRadixSmall { + left_fft, + right_fft, + } => left_fft.len() * right_fft.len(), + Recipe::GoodThomasAlgorithmSmall { + left_fft, + right_fft, + } => left_fft.len() * right_fft.len(), + Recipe::RadersAlgorithm { inner_fft } => inner_fft.len() + 1, + Recipe::BluesteinsAlgorithm { len, .. } => *len, + } + } +} + +/// The Neon FFT planner creates new FFT algorithm instances using a mix of scalar and Neon accelerated algorithms. +/// It is supported when using the 64-bit AArch64 instruction set. +/// +/// RustFFT has several FFT algorithms available. For a given FFT size, the `FftPlannerNeon` decides which of the +/// available FFT algorithms to use and then initializes them. +/// +/// ~~~ +/// // Perform a forward Fft of size 1234 +/// use std::sync::Arc; +/// use rustfft::{FftPlannerNeon, num_complex::Complex}; +/// +/// if let Ok(mut planner) = FftPlannerNeon::new() { +/// let fft = planner.plan_fft_forward(1234); +/// +/// let mut buffer = vec![Complex{ re: 0.0f32, im: 0.0f32 }; 1234]; +/// fft.process(&mut buffer); +/// +/// // The FFT instance returned by the planner has the type `Arc>`, +/// // where T is the numeric type, ie f32 or f64, so it's cheap to clone +/// let fft_clone = Arc::clone(&fft); +/// } +/// ~~~ +/// +/// If you plan on creating multiple FFT instances, it is recommended to reuse the same planner for all of them. This +/// is because the planner re-uses internal data across FFT instances wherever possible, saving memory and reducing +/// setup time. (FFT instances created with one planner will never re-use data and buffers with FFT instances created +/// by a different planner) +/// +/// Each FFT instance owns [`Arc`s](std::sync::Arc) to its internal data, rather than borrowing it from the planner, so it's perfectly +/// safe to drop the planner after creating Fft instances. 
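A `Recipe` is only a description: the composite variants hold child recipes behind `Arc`s, and `len()` multiplies the children's lengths without constructing any FFT. A small hypothetical check, written as if it lived in this module's unit tests since `Recipe` is internal to the planner:

```rust
use std::sync::Arc;

#[test]
fn recipe_len_is_product_of_children() {
    // An 8 * 32 mixed-radix recipe reports length 256 before anything is built.
    let recipe = Recipe::MixedRadix {
        left_fft: Arc::new(Recipe::Butterfly8),
        right_fft: Arc::new(Recipe::Butterfly32),
    };
    assert_eq!(recipe.len(), 256);
}
```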
+pub struct FftPlannerNeon<T: FftNum> {
+    algorithm_cache: FftCache<T>,
+    recipe_cache: HashMap<usize, Arc<Recipe>>,
+}
+
+impl<T: FftNum> FftPlannerNeon<T> {
+    /// Creates a new `FftPlannerNeon` instance.
+    ///
+    /// Returns `Ok(planner_instance)` if this machine has the required instruction sets.
+    /// Returns `Err(())` if some instruction sets are missing.
+    pub fn new() -> Result<Self, ()> {
+        if is_aarch64_feature_detected!("neon") {
+            // Ideally, we would implement the planner with specialization.
+            // Specialization won't be on stable rust for a long time though, so in the meantime, we can hack around it.
+            //
+            // We use TypeID to determine if T is f32, f64, or neither. If neither, we don't want to do any Neon acceleration
+            // If it's f32 or f64, then construct and return a Neon planner instance.
+            //
+            // All Neon accelerated algorithms come in separate versions for f32 and f64. The type is checked when a new one is created, and if it does not
+            // match the type the FFT is meant for, it will panic. This will never be a problem if using a planner to construct the FFTs.
+            //
+            // An annoying snag with this setup is that we frequently have to transmute buffers from &mut [Complex<T>] to &mut [Complex<f32 or f64>] or vice versa.
+            // We know this is safe because we assert everywhere that Type(f32 or f64)==Type(T), so it's just a matter of "doing it right" every time.
+            // These transmutes are required because the FFT algorithm's input will come through the FFT trait, which may only be bounded by FftNum.
+            // So the buffers will have the type &mut [Complex<T>].
+            let id_f32 = TypeId::of::<f32>();
+            let id_f64 = TypeId::of::<f64>();
+            let id_t = TypeId::of::<T>();
+
+            if id_t == id_f32 || id_t == id_f64 {
+                return Ok(Self {
+                    algorithm_cache: FftCache::new(),
+                    recipe_cache: HashMap::new(),
+                });
+            }
+        }
+        Err(())
+    }
+
+    /// Returns a `Fft` instance which uses Neon instructions to compute FFTs of size `len`.
+    ///
+    /// If the provided `direction` is `FftDirection::Forward`, the returned instance will compute forward FFTs. If it's `FftDirection::Inverse`, it will compute inverse FFTs.
+    ///
+    /// If this is called multiple times, the planner will attempt to re-use internal data between calls, reducing memory usage and FFT initialization time.
+    pub fn plan_fft(&mut self, len: usize, direction: FftDirection) -> Arc<dyn Fft<T>> {
+        // Step 1: Create a "recipe" for this FFT, which will tell us exactly which combination of algorithms to use
+        let recipe = self.design_fft_for_len(len);
+
+        // Step 2: Use our recipe to construct a Fft trait object
+        self.build_fft(&recipe, direction)
+    }
+
+    /// Returns a `Fft` instance which uses Neon instructions to compute forward FFTs of size `len`
+    ///
+    /// If this is called multiple times, the planner will attempt to re-use internal data between calls, reducing memory usage and FFT initialization time.
+    pub fn plan_fft_forward(&mut self, len: usize) -> Arc<dyn Fft<T>> {
+        self.plan_fft(len, FftDirection::Forward)
+    }
+
+    /// Returns a `Fft` instance which uses Neon instructions to compute inverse FFTs of size `len`.
+    ///
+    /// If this is called multiple times, the planner will attempt to re-use internal data between calls, reducing memory usage and FFT initialization time.
+ pub fn plan_fft_inverse(&mut self, len: usize) -> Arc> { + self.plan_fft(len, FftDirection::Inverse) + } + + // Make a recipe for a length + fn design_fft_for_len(&mut self, len: usize) -> Arc { + if len < 1 { + Arc::new(Recipe::Dft(len)) + } else if let Some(recipe) = self.recipe_cache.get(&len) { + Arc::clone(&recipe) + } else { + let factors = PrimeFactors::compute(len); + let recipe = self.design_fft_with_factors(len, factors); + self.recipe_cache.insert(len, Arc::clone(&recipe)); + recipe + } + } + + // Create the fft from a recipe, take from cache if possible + fn build_fft(&mut self, recipe: &Recipe, direction: FftDirection) -> Arc> { + let len = recipe.len(); + if let Some(instance) = self.algorithm_cache.get(len, direction) { + instance + } else { + let fft = self.build_new_fft(recipe, direction); + self.algorithm_cache.insert(&fft); + fft + } + } + + // Create a new fft from a recipe + fn build_new_fft(&mut self, recipe: &Recipe, direction: FftDirection) -> Arc> { + let id_f32 = TypeId::of::(); + let id_f64 = TypeId::of::(); + let id_t = TypeId::of::(); + + match recipe { + Recipe::Dft(len) => Arc::new(Dft::new(*len, direction)) as Arc>, + Recipe::Radix4(len) => { + if id_t == id_f32 { + Arc::new(Neon32Radix4::new(*len, direction)) as Arc> + } else if id_t == id_f64 { + Arc::new(Neon64Radix4::new(*len, direction)) as Arc> + } else { + panic!("Not f32 or f64"); + } + } + Recipe::Butterfly1 => { + if id_t == id_f32 { + Arc::new(NeonF32Butterfly1::new(direction)) as Arc> + } else if id_t == id_f64 { + Arc::new(NeonF64Butterfly1::new(direction)) as Arc> + } else { + panic!("Not f32 or f64"); + } + } + Recipe::Butterfly2 => { + if id_t == id_f32 { + Arc::new(NeonF32Butterfly2::new(direction)) as Arc> + } else if id_t == id_f64 { + Arc::new(NeonF64Butterfly2::new(direction)) as Arc> + } else { + panic!("Not f32 or f64"); + } + } + Recipe::Butterfly3 => { + if id_t == id_f32 { + Arc::new(NeonF32Butterfly3::new(direction)) as Arc> + } else if id_t == id_f64 { + Arc::new(NeonF64Butterfly3::new(direction)) as Arc> + } else { + panic!("Not f32 or f64"); + } + } + Recipe::Butterfly4 => { + if id_t == id_f32 { + Arc::new(NeonF32Butterfly4::new(direction)) as Arc> + } else if id_t == id_f64 { + Arc::new(NeonF64Butterfly4::new(direction)) as Arc> + } else { + panic!("Not f32 or f64"); + } + } + Recipe::Butterfly5 => { + if id_t == id_f32 { + Arc::new(NeonF32Butterfly5::new(direction)) as Arc> + } else if id_t == id_f64 { + Arc::new(NeonF64Butterfly5::new(direction)) as Arc> + } else { + panic!("Not f32 or f64"); + } + } + Recipe::Butterfly6 => { + if id_t == id_f32 { + Arc::new(NeonF32Butterfly6::new(direction)) as Arc> + } else if id_t == id_f64 { + Arc::new(NeonF64Butterfly6::new(direction)) as Arc> + } else { + panic!("Not f32 or f64"); + } + } + Recipe::Butterfly7 => { + if id_t == id_f32 { + Arc::new(NeonF32Butterfly7::new(direction)) as Arc> + } else if id_t == id_f64 { + Arc::new(NeonF64Butterfly7::new(direction)) as Arc> + } else { + panic!("Not f32 or f64"); + } + } + Recipe::Butterfly8 => { + if id_t == id_f32 { + Arc::new(NeonF32Butterfly8::new(direction)) as Arc> + } else if id_t == id_f64 { + Arc::new(NeonF64Butterfly8::new(direction)) as Arc> + } else { + panic!("Not f32 or f64"); + } + } + Recipe::Butterfly9 => { + if id_t == id_f32 { + Arc::new(NeonF32Butterfly9::new(direction)) as Arc> + } else if id_t == id_f64 { + Arc::new(NeonF64Butterfly9::new(direction)) as Arc> + } else { + panic!("Not f32 or f64"); + } + } + Recipe::Butterfly10 => { + if id_t == id_f32 { + 
Arc::new(NeonF32Butterfly10::new(direction)) as Arc> + } else if id_t == id_f64 { + Arc::new(NeonF64Butterfly10::new(direction)) as Arc> + } else { + panic!("Not f32 or f64"); + } + } + Recipe::Butterfly11 => { + if id_t == id_f32 { + Arc::new(NeonF32Butterfly11::new(direction)) as Arc> + } else if id_t == id_f64 { + Arc::new(NeonF64Butterfly11::new(direction)) as Arc> + } else { + panic!("Not f32 or f64"); + } + } + Recipe::Butterfly12 => { + if id_t == id_f32 { + Arc::new(NeonF32Butterfly12::new(direction)) as Arc> + } else if id_t == id_f64 { + Arc::new(NeonF64Butterfly12::new(direction)) as Arc> + } else { + panic!("Not f32 or f64"); + } + } + Recipe::Butterfly13 => { + if id_t == id_f32 { + Arc::new(NeonF32Butterfly13::new(direction)) as Arc> + } else if id_t == id_f64 { + Arc::new(NeonF64Butterfly13::new(direction)) as Arc> + } else { + panic!("Not f32 or f64"); + } + } + Recipe::Butterfly15 => { + if id_t == id_f32 { + Arc::new(NeonF32Butterfly15::new(direction)) as Arc> + } else if id_t == id_f64 { + Arc::new(NeonF64Butterfly15::new(direction)) as Arc> + } else { + panic!("Not f32 or f64"); + } + } + Recipe::Butterfly16 => { + if id_t == id_f32 { + Arc::new(NeonF32Butterfly16::new(direction)) as Arc> + } else if id_t == id_f64 { + Arc::new(NeonF64Butterfly16::new(direction)) as Arc> + } else { + panic!("Not f32 or f64"); + } + } + Recipe::Butterfly17 => { + if id_t == id_f32 { + Arc::new(NeonF32Butterfly17::new(direction)) as Arc> + } else if id_t == id_f64 { + Arc::new(NeonF64Butterfly17::new(direction)) as Arc> + } else { + panic!("Not f32 or f64"); + } + } + Recipe::Butterfly19 => { + if id_t == id_f32 { + Arc::new(NeonF32Butterfly19::new(direction)) as Arc> + } else if id_t == id_f64 { + Arc::new(NeonF64Butterfly19::new(direction)) as Arc> + } else { + panic!("Not f32 or f64"); + } + } + Recipe::Butterfly23 => { + if id_t == id_f32 { + Arc::new(NeonF32Butterfly23::new(direction)) as Arc> + } else if id_t == id_f64 { + Arc::new(NeonF64Butterfly23::new(direction)) as Arc> + } else { + panic!("Not f32 or f64"); + } + } + Recipe::Butterfly29 => { + if id_t == id_f32 { + Arc::new(NeonF32Butterfly29::new(direction)) as Arc> + } else if id_t == id_f64 { + Arc::new(NeonF64Butterfly29::new(direction)) as Arc> + } else { + panic!("Not f32 or f64"); + } + } + Recipe::Butterfly31 => { + if id_t == id_f32 { + Arc::new(NeonF32Butterfly31::new(direction)) as Arc> + } else if id_t == id_f64 { + Arc::new(NeonF64Butterfly31::new(direction)) as Arc> + } else { + panic!("Not f32 or f64"); + } + } + Recipe::Butterfly32 => { + if id_t == id_f32 { + Arc::new(NeonF32Butterfly32::new(direction)) as Arc> + } else if id_t == id_f64 { + Arc::new(NeonF64Butterfly32::new(direction)) as Arc> + } else { + panic!("Not f32 or f64"); + } + } + Recipe::MixedRadix { + left_fft, + right_fft, + } => { + let left_fft = self.build_fft(&left_fft, direction); + let right_fft = self.build_fft(&right_fft, direction); + Arc::new(MixedRadix::new(left_fft, right_fft)) as Arc> + } + Recipe::GoodThomasAlgorithm { + left_fft, + right_fft, + } => { + let left_fft = self.build_fft(&left_fft, direction); + let right_fft = self.build_fft(&right_fft, direction); + Arc::new(GoodThomasAlgorithm::new(left_fft, right_fft)) as Arc> + } + Recipe::MixedRadixSmall { + left_fft, + right_fft, + } => { + let left_fft = self.build_fft(&left_fft, direction); + let right_fft = self.build_fft(&right_fft, direction); + Arc::new(MixedRadixSmall::new(left_fft, right_fft)) as Arc> + } + Recipe::GoodThomasAlgorithmSmall { + left_fft, + right_fft, + } 
=> { + let left_fft = self.build_fft(&left_fft, direction); + let right_fft = self.build_fft(&right_fft, direction); + Arc::new(GoodThomasAlgorithmSmall::new(left_fft, right_fft)) as Arc> + } + Recipe::RadersAlgorithm { inner_fft } => { + let inner_fft = self.build_fft(&inner_fft, direction); + Arc::new(RadersAlgorithm::new(inner_fft)) as Arc> + } + Recipe::BluesteinsAlgorithm { len, inner_fft } => { + let inner_fft = self.build_fft(&inner_fft, direction); + Arc::new(BluesteinsAlgorithm::new(*len, inner_fft)) as Arc> + } + } + } + + fn design_fft_with_factors(&mut self, len: usize, factors: PrimeFactors) -> Arc { + if let Some(fft_instance) = self.design_butterfly_algorithm(len) { + fft_instance + } else if factors.is_prime() { + self.design_prime(len) + } else if len.trailing_zeros() >= MIN_RADIX4_BITS { + if len.is_power_of_two() { + Arc::new(Recipe::Radix4(len)) + } else { + let non_power_of_two = factors + .remove_factors(PrimeFactor { + value: 2, + count: len.trailing_zeros(), + }) + .unwrap(); + let power_of_two = PrimeFactors::compute(1 << len.trailing_zeros()); + self.design_mixed_radix(power_of_two, non_power_of_two) + } + } else { + // Can we do this as a mixed radix with just two butterflies? + // Loop through and find all combinations + // If more than one is found, keep the one where the factors are closer together. + // For example length 20 where 10x2 and 5x4 are possible, we use 5x4. + let butterflies: [usize; 20] = [ + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 19, 23, 29, 31, 32, + ]; + let mut bf_left = 0; + let mut bf_right = 0; + // If the length is below 14, or over 1024 we don't need to try this. + if len > 13 && len <= 1024 { + for (n, bf_l) in butterflies.iter().enumerate() { + if len % bf_l == 0 { + let bf_r = len / bf_l; + if butterflies.iter().skip(n).any(|&m| m == bf_r) { + bf_left = *bf_l; + bf_right = bf_r; + } + } + } + if bf_left > 0 { + let fact_l = PrimeFactors::compute(bf_left); + let fact_r = PrimeFactors::compute(bf_right); + return self.design_mixed_radix(fact_l, fact_r); + } + } + // Not possible with just butterflies, go with the general solution. + let (left_factors, right_factors) = factors.partition_factors(); + self.design_mixed_radix(left_factors, right_factors) + } + } + + fn design_mixed_radix( + &mut self, + left_factors: PrimeFactors, + right_factors: PrimeFactors, + ) -> Arc { + let left_len = left_factors.get_product(); + let right_len = right_factors.get_product(); + + //neither size is a butterfly, so go with the normal algorithm + let left_fft = self.design_fft_with_factors(left_len, left_factors); + let right_fft = self.design_fft_with_factors(right_len, right_factors); + + //if both left_len and right_len are small, use algorithms optimized for small FFTs + if left_len < 33 && right_len < 33 { + // for small FFTs, if gcd is 1, good-thomas is faster + if gcd(left_len, right_len) == 1 { + Arc::new(Recipe::GoodThomasAlgorithmSmall { + left_fft, + right_fft, + }) + } else { + Arc::new(Recipe::MixedRadixSmall { + left_fft, + right_fft, + }) + } + } else { + Arc::new(Recipe::MixedRadix { + left_fft, + right_fft, + }) + } + } + + // Returns Some(instance) if we have a butterfly available for this size. 
Returns None if there is no butterfly available for this size + fn design_butterfly_algorithm(&mut self, len: usize) -> Option> { + match len { + 1 => Some(Arc::new(Recipe::Butterfly1)), + 2 => Some(Arc::new(Recipe::Butterfly2)), + 3 => Some(Arc::new(Recipe::Butterfly3)), + 4 => Some(Arc::new(Recipe::Butterfly4)), + 5 => Some(Arc::new(Recipe::Butterfly5)), + 6 => Some(Arc::new(Recipe::Butterfly6)), + 7 => Some(Arc::new(Recipe::Butterfly7)), + 8 => Some(Arc::new(Recipe::Butterfly8)), + 9 => Some(Arc::new(Recipe::Butterfly9)), + 10 => Some(Arc::new(Recipe::Butterfly10)), + 11 => Some(Arc::new(Recipe::Butterfly11)), + 12 => Some(Arc::new(Recipe::Butterfly12)), + 13 => Some(Arc::new(Recipe::Butterfly13)), + 15 => Some(Arc::new(Recipe::Butterfly15)), + 16 => Some(Arc::new(Recipe::Butterfly16)), + 17 => Some(Arc::new(Recipe::Butterfly17)), + 19 => Some(Arc::new(Recipe::Butterfly19)), + 23 => Some(Arc::new(Recipe::Butterfly23)), + 29 => Some(Arc::new(Recipe::Butterfly29)), + 31 => Some(Arc::new(Recipe::Butterfly31)), + 32 => Some(Arc::new(Recipe::Butterfly32)), + _ => None, + } + } + + fn design_prime(&mut self, len: usize) -> Arc { + let inner_fft_len_rader = len - 1; + let raders_factors = PrimeFactors::compute(inner_fft_len_rader); + // If any of the prime factors is too large, Rader's gets slow and Bluestein's is the better choice + if raders_factors + .get_other_factors() + .iter() + .any(|val| val.value > MAX_RADER_PRIME_FACTOR) + { + let inner_fft_len_pow2 = (2 * len - 1).checked_next_power_of_two().unwrap(); + // for long ffts a mixed radix inner fft is faster than a longer radix4 + let min_inner_len = 2 * len - 1; + let mixed_radix_len = 3 * inner_fft_len_pow2 / 4; + let inner_fft = + if mixed_radix_len >= min_inner_len && len >= MIN_BLUESTEIN_MIXED_RADIX_LEN { + let mixed_radix_factors = PrimeFactors::compute(mixed_radix_len); + self.design_fft_with_factors(mixed_radix_len, mixed_radix_factors) + } else { + Arc::new(Recipe::Radix4(inner_fft_len_pow2)) + }; + Arc::new(Recipe::BluesteinsAlgorithm { len, inner_fft }) + } else { + let inner_fft = self.design_fft_with_factors(inner_fft_len_rader, raders_factors); + Arc::new(Recipe::RadersAlgorithm { inner_fft }) + } + } +} + +#[cfg(test)] +mod unit_tests { + use super::*; + + fn is_mixedradix(plan: &Recipe) -> bool { + match plan { + &Recipe::MixedRadix { .. } => true, + _ => false, + } + } + + fn is_mixedradixsmall(plan: &Recipe) -> bool { + match plan { + &Recipe::MixedRadixSmall { .. } => true, + _ => false, + } + } + + fn is_goodthomassmall(plan: &Recipe) -> bool { + match plan { + &Recipe::GoodThomasAlgorithmSmall { .. } => true, + _ => false, + } + } + + fn is_raders(plan: &Recipe) -> bool { + match plan { + &Recipe::RadersAlgorithm { .. } => true, + _ => false, + } + } + + fn is_bluesteins(plan: &Recipe) -> bool { + match plan { + &Recipe::BluesteinsAlgorithm { .. 
} => true, + _ => false, + } + } + + #[test] + fn test_plan_neon_trivial() { + // Length 0 and 1 should use Dft + let mut planner = FftPlannerNeon::::new().unwrap(); + for len in 0..1 { + let plan = planner.design_fft_for_len(len); + assert_eq!(*plan, Recipe::Dft(len)); + assert_eq!(plan.len(), len, "Recipe reports wrong length"); + } + } + + #[test] + fn test_plan_neon_largepoweroftwo() { + // Powers of 2 above 6 should use Radix4 + let mut planner = FftPlannerNeon::::new().unwrap(); + for pow in 6..32 { + let len = 1 << pow; + let plan = planner.design_fft_for_len(len); + assert_eq!(*plan, Recipe::Radix4(len)); + assert_eq!(plan.len(), len, "Recipe reports wrong length"); + } + } + + #[test] + fn test_plan_neon_butterflies() { + // Check that all butterflies are used + let mut planner = FftPlannerNeon::::new().unwrap(); + assert_eq!(*planner.design_fft_for_len(2), Recipe::Butterfly2); + assert_eq!(*planner.design_fft_for_len(3), Recipe::Butterfly3); + assert_eq!(*planner.design_fft_for_len(4), Recipe::Butterfly4); + assert_eq!(*planner.design_fft_for_len(5), Recipe::Butterfly5); + assert_eq!(*planner.design_fft_for_len(6), Recipe::Butterfly6); + assert_eq!(*planner.design_fft_for_len(7), Recipe::Butterfly7); + assert_eq!(*planner.design_fft_for_len(8), Recipe::Butterfly8); + assert_eq!(*planner.design_fft_for_len(9), Recipe::Butterfly9); + assert_eq!(*planner.design_fft_for_len(10), Recipe::Butterfly10); + assert_eq!(*planner.design_fft_for_len(11), Recipe::Butterfly11); + assert_eq!(*planner.design_fft_for_len(12), Recipe::Butterfly12); + assert_eq!(*planner.design_fft_for_len(13), Recipe::Butterfly13); + assert_eq!(*planner.design_fft_for_len(15), Recipe::Butterfly15); + assert_eq!(*planner.design_fft_for_len(16), Recipe::Butterfly16); + assert_eq!(*planner.design_fft_for_len(17), Recipe::Butterfly17); + assert_eq!(*planner.design_fft_for_len(19), Recipe::Butterfly19); + assert_eq!(*planner.design_fft_for_len(23), Recipe::Butterfly23); + assert_eq!(*planner.design_fft_for_len(29), Recipe::Butterfly29); + assert_eq!(*planner.design_fft_for_len(31), Recipe::Butterfly31); + assert_eq!(*planner.design_fft_for_len(32), Recipe::Butterfly32); + } + + #[test] + fn test_plan_neon_mixedradix() { + // Products of several different primes should become MixedRadix + let mut planner = FftPlannerNeon::::new().unwrap(); + for pow2 in 2..5 { + for pow3 in 2..5 { + for pow5 in 2..5 { + for pow7 in 2..5 { + let len = 2usize.pow(pow2) + * 3usize.pow(pow3) + * 5usize.pow(pow5) + * 7usize.pow(pow7); + let plan = planner.design_fft_for_len(len); + assert!(is_mixedradix(&plan), "Expected MixedRadix, got {:?}", plan); + assert_eq!(plan.len(), len, "Recipe reports wrong length"); + } + } + } + } + } + + #[test] + fn test_plan_neon_mixedradixsmall() { + // Products of two "small" lengths < 31 that have a common divisor >1, and isn't a power of 2 should be MixedRadixSmall + let mut planner = FftPlannerNeon::::new().unwrap(); + for len in [5 * 20, 5 * 25].iter() { + let plan = planner.design_fft_for_len(*len); + assert!( + is_mixedradixsmall(&plan), + "Expected MixedRadixSmall, got {:?}", + plan + ); + assert_eq!(plan.len(), *len, "Recipe reports wrong length"); + } + } + + #[test] + fn test_plan_neon_goodthomasbutterfly() { + let mut planner = FftPlannerNeon::::new().unwrap(); + for len in [3 * 7, 5 * 7, 11 * 13, 2 * 29].iter() { + let plan = planner.design_fft_for_len(*len); + assert!( + is_goodthomassmall(&plan), + "Expected GoodThomasAlgorithmSmall, got {:?}", + plan + ); + assert_eq!(plan.len(), *len, "Recipe 
reports wrong length"); + } + } + + #[test] + fn test_plan_neon_bluestein_vs_rader() { + let difficultprimes: [usize; 11] = [59, 83, 107, 149, 167, 173, 179, 359, 719, 1439, 2879]; + let easyprimes: [usize; 24] = [ + 53, 61, 67, 71, 73, 79, 89, 97, 101, 103, 109, 113, 127, 131, 137, 139, 151, 157, 163, + 181, 191, 193, 197, 199, + ]; + + let mut planner = FftPlannerNeon::::new().unwrap(); + for len in difficultprimes.iter() { + let plan = planner.design_fft_for_len(*len); + assert!( + is_bluesteins(&plan), + "Expected BluesteinsAlgorithm, got {:?}", + plan + ); + assert_eq!(plan.len(), *len, "Recipe reports wrong length"); + } + for len in easyprimes.iter() { + let plan = planner.design_fft_for_len(*len); + assert!(is_raders(&plan), "Expected RadersAlgorithm, got {:?}", plan); + assert_eq!(plan.len(), *len, "Recipe reports wrong length"); + } + } + + #[test] + fn test_neon_fft_cache() { + { + // Check that FFTs are reused if they're both forward + let mut planner = FftPlannerNeon::::new().unwrap(); + let fft_a = planner.plan_fft(1234, FftDirection::Forward); + let fft_b = planner.plan_fft(1234, FftDirection::Forward); + assert!(Arc::ptr_eq(&fft_a, &fft_b), "Existing fft was not reused"); + } + { + // Check that FFTs are reused if they're both inverse + let mut planner = FftPlannerNeon::::new().unwrap(); + let fft_a = planner.plan_fft(1234, FftDirection::Inverse); + let fft_b = planner.plan_fft(1234, FftDirection::Inverse); + assert!(Arc::ptr_eq(&fft_a, &fft_b), "Existing fft was not reused"); + } + { + // Check that FFTs are NOT resued if they don't both have the same direction + let mut planner = FftPlannerNeon::::new().unwrap(); + let fft_a = planner.plan_fft(1234, FftDirection::Forward); + let fft_b = planner.plan_fft(1234, FftDirection::Inverse); + assert!( + !Arc::ptr_eq(&fft_a, &fft_b), + "Existing fft was reused, even though directions don't match" + ); + } + } + + #[test] + fn test_neon_recipe_cache() { + // Check that all butterflies are used + let mut planner = FftPlannerNeon::::new().unwrap(); + let fft_a = planner.design_fft_for_len(1234); + let fft_b = planner.design_fft_for_len(1234); + assert!( + Arc::ptr_eq(&fft_a, &fft_b), + "Existing recipe was not reused" + ); + } +} diff --git a/src/neon/neon_prime_butterflies.rs b/src/neon/neon_prime_butterflies.rs new file mode 100644 index 00000000..574eb7d1 --- /dev/null +++ b/src/neon/neon_prime_butterflies.rs @@ -0,0 +1,6180 @@ +use core::arch::aarch64::*; +use num_complex::Complex; + +use crate::{common::FftNum, FftDirection}; + +use crate::array_utils; +use crate::array_utils::{RawSlice, RawSliceMut}; +use crate::common::{fft_error_inplace, fft_error_outofplace}; +use crate::twiddles; +use crate::{Direction, Fft, Length}; + +use super::neon_common::{assert_f32, assert_f64}; +use super::neon_utils::*; +use super::neon_vector::{NeonArray, NeonArrayMut}; +use super::neon_butterflies::{parallel_fft2_interleaved_f32, solo_fft2_f64}; + +// Auto-generated prime length butterflies +// The code here is mostly autogenerated by the python script tools/gen_sse_butterflies.py, and then translated from SSE to Neon. +// +// The algorithm is derived directly from the definition of the DFT, by eliminating any repeated calculations. +// See the comments in src/algorithm/butterflies.rs for a detailed description. +// +// The script generates the code for performing a single f64 fft, as well as dual f32 fft. +// It also generates the code for reading and writing the input and output. +// The single 32-bit ffts reuse the dual ffts. 
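+
+// Editor's note: the sketch below is not part of the generated code. It is a plain scalar
+// reference, assuming only `num_complex::Complex` and the crate's `twiddles::compute_twiddle`
+// as used elsewhere in this file, showing the symmetry the generated butterflies exploit:
+// for an odd prime length N, pairing x[j] with x[N-j] means the real parts of the twiddles
+// only ever multiply sums and the imaginary parts only ever multiply differences, and each
+// output pair (y[k], y[N-k]) is recovered from one shared term plus/minus one 90-degree-rotated
+// term. The function name is hypothetical and exists purely for illustration.
+#[allow(dead_code)]
+fn scalar_prime_dft_reference(input: &[Complex<f64>], direction: FftDirection) -> Vec<Complex<f64>> {
+    let n = input.len();
+    let mut output = vec![Complex::new(0.0, 0.0); n];
+    // y[0] is just the sum of all inputs
+    output[0] = input.iter().fold(Complex::new(0.0, 0.0), |acc, x| acc + *x);
+    for k in 1..=n / 2 {
+        // t_a collects x[0] plus twiddle.re * (x[j] + x[n - j])
+        // t_b collects twiddle.im * (x[j] - x[n - j])
+        let mut t_a = input[0];
+        let mut t_b = Complex::new(0.0, 0.0);
+        for j in 1..=n / 2 {
+            let tw: Complex<f64> = twiddles::compute_twiddle((j * k) % n, n, direction);
+            t_a = t_a + (input[j] + input[n - j]) * tw.re;
+            t_b = t_b + (input[j] - input[n - j]) * tw.im;
+        }
+        // Multiplying t_b by i (a 90-degree rotation) yields the mirrored output pair
+        let t_b_rot = Complex::new(-t_b.im, t_b.re);
+        output[k] = t_a + t_b_rot;
+        output[n - k] = t_a - t_b_rot;
+    }
+    output
+}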
+ +// _____ _________ _ _ _ +// |___ | |___ /___ \| |__ (_) |_ +// / / _____ |_ \ __) | '_ \| | __| +// / / |_____| ___) / __/| |_) | | |_ +// /_/ |____/_____|_.__/|_|\__| +// + +pub struct NeonF32Butterfly7 { + direction: FftDirection, + _phantom: std::marker::PhantomData, + rotate: Rotate90F32, + twiddle1re: float32x4_t, + twiddle1im: float32x4_t, + twiddle2re: float32x4_t, + twiddle2im: float32x4_t, + twiddle3re: float32x4_t, + twiddle3im: float32x4_t, +} + +boilerplate_fft_neon_f32_butterfly!(NeonF32Butterfly7, 7, |this: &NeonF32Butterfly7<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF32Butterfly7, 7, |this: &NeonF32Butterfly7<_>| this + .direction); +impl NeonF32Butterfly7 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f32::(); + let rotate = Rotate90F32::new(true); + let tw1: Complex = twiddles::compute_twiddle(1, 7, direction); + let tw2: Complex = twiddles::compute_twiddle(2, 7, direction); + let tw3: Complex = twiddles::compute_twiddle(3, 7, direction); + let twiddle1re = unsafe { vmovq_n_f32(tw1.re) }; + let twiddle1im = unsafe { vmovq_n_f32(tw1.im) }; + let twiddle2re = unsafe { vmovq_n_f32(tw2.re) }; + let twiddle2im = unsafe { vmovq_n_f32(tw2.im) }; + let twiddle3re = unsafe { vmovq_n_f32(tw3.re) }; + let twiddle3im = unsafe { vmovq_n_f32(tw3.im) }; + + Self { + direction, + _phantom: std::marker::PhantomData, + rotate, + twiddle1re, + twiddle1im, + twiddle2re, + twiddle2im, + twiddle3re, + twiddle3im, + } + } + #[inline(always)] + pub(crate) unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let values = read_partial1_complex_to_array!(input, {0, 1, 2, 3, 4, 5, 6}); + + let out = self.perform_parallel_fft_direct(values); + + write_partial_lo_complex_to_array!(out, output, {0, 1, 2, 3, 4, 5, 6}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let input_packed = read_complex_to_array!(input, {0, 2, 4, 6, 8, 10, 12}); + + let values = [ + extract_lo_hi_f32(input_packed[0], input_packed[3]), + extract_hi_lo_f32(input_packed[0], input_packed[4]), + extract_lo_hi_f32(input_packed[1], input_packed[4]), + extract_hi_lo_f32(input_packed[1], input_packed[5]), + extract_lo_hi_f32(input_packed[2], input_packed[5]), + extract_hi_lo_f32(input_packed[2], input_packed[6]), + extract_lo_hi_f32(input_packed[3], input_packed[6]), + ]; + + let out = self.perform_parallel_fft_direct(values); + + let out_packed = [ + extract_lo_lo_f32(out[0], out[1]), + extract_lo_lo_f32(out[2], out[3]), + extract_lo_lo_f32(out[4], out[5]), + extract_lo_hi_f32(out[6], out[0]), + extract_hi_hi_f32(out[1], out[2]), + extract_hi_hi_f32(out[3], out[4]), + extract_hi_hi_f32(out[5], out[6]), + ]; + + write_complex_to_array_strided!(out_packed, output, 2, {0,1,2,3,4,5,6}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [float32x4_t; 7]) -> [float32x4_t; 7] { + let [x1p6, x1m6] = parallel_fft2_interleaved_f32(values[1], values[6]); + let [x2p5, x2m5] = parallel_fft2_interleaved_f32(values[2], values[5]); + let [x3p4, x3m4] = parallel_fft2_interleaved_f32(values[3], values[4]); + + let t_a1_1 = vmulq_f32(self.twiddle1re, x1p6); + let t_a1_2 = vmulq_f32(self.twiddle2re, x2p5); + let t_a1_3 = vmulq_f32(self.twiddle3re, x3p4); + let t_a2_1 = vmulq_f32(self.twiddle2re, x1p6); + let t_a2_2 = vmulq_f32(self.twiddle3re, x2p5); + let t_a2_3 = vmulq_f32(self.twiddle1re, x3p4); + let t_a3_1 = 
vmulq_f32(self.twiddle3re, x1p6); + let t_a3_2 = vmulq_f32(self.twiddle1re, x2p5); + let t_a3_3 = vmulq_f32(self.twiddle2re, x3p4); + + let t_b1_1 = vmulq_f32(self.twiddle1im, x1m6); + let t_b1_2 = vmulq_f32(self.twiddle2im, x2m5); + let t_b1_3 = vmulq_f32(self.twiddle3im, x3m4); + let t_b2_1 = vmulq_f32(self.twiddle2im, x1m6); + let t_b2_2 = vmulq_f32(self.twiddle3im, x2m5); + let t_b2_3 = vmulq_f32(self.twiddle1im, x3m4); + let t_b3_1 = vmulq_f32(self.twiddle3im, x1m6); + let t_b3_2 = vmulq_f32(self.twiddle1im, x2m5); + let t_b3_3 = vmulq_f32(self.twiddle2im, x3m4); + + let x0 = values[0]; + let t_a1 = calc_f32!(x0 + t_a1_1 + t_a1_2 + t_a1_3); + let t_a2 = calc_f32!(x0 + t_a2_1 + t_a2_2 + t_a2_3); + let t_a3 = calc_f32!(x0 + t_a3_1 + t_a3_2 + t_a3_3); + + let t_b1 = calc_f32!(t_b1_1 + t_b1_2 + t_b1_3); + let t_b2 = calc_f32!(t_b2_1 - t_b2_2 - t_b2_3); + let t_b3 = calc_f32!(t_b3_1 - t_b3_2 + t_b3_3); + + let t_b1_rot = self.rotate.rotate_both(t_b1); + let t_b2_rot = self.rotate.rotate_both(t_b2); + let t_b3_rot = self.rotate.rotate_both(t_b3); + + let y0 = calc_f32!(x0 + x1p6 + x2p5 + x3p4); + let [y1, y6] = parallel_fft2_interleaved_f32(t_a1, t_b1_rot); + let [y2, y5] = parallel_fft2_interleaved_f32(t_a2, t_b2_rot); + let [y3, y4] = parallel_fft2_interleaved_f32(t_a3, t_b3_rot); + [y0, y1, y2, y3, y4, y5, y6] + } +} + +// _____ __ _ _ _ _ _ +// |___ | / /_ | || | | |__ (_) |_ +// / / _____ | '_ \| || |_| '_ \| | __| +// / / |_____| | (_) |__ _| |_) | | |_ +// /_/ \___/ |_| |_.__/|_|\__| +// + +pub struct NeonF64Butterfly7 { + direction: FftDirection, + _phantom: std::marker::PhantomData, + rotate: Rotate90F64, + twiddle1re: float64x2_t, + twiddle1im: float64x2_t, + twiddle2re: float64x2_t, + twiddle2im: float64x2_t, + twiddle3re: float64x2_t, + twiddle3im: float64x2_t, +} + +boilerplate_fft_neon_f64_butterfly!(NeonF64Butterfly7, 7, |this: &NeonF64Butterfly7<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF64Butterfly7, 7, |this: &NeonF64Butterfly7<_>| this + .direction); +impl NeonF64Butterfly7 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f64::(); + let rotate = Rotate90F64::new(true); + let tw1: Complex = twiddles::compute_twiddle(1, 7, direction); + let tw2: Complex = twiddles::compute_twiddle(2, 7, direction); + let tw3: Complex = twiddles::compute_twiddle(3, 7, direction); + let twiddle1re = unsafe { vmovq_n_f64(tw1.re) }; + let twiddle1im = unsafe { vmovq_n_f64(tw1.im) }; + let twiddle2re = unsafe { vmovq_n_f64(tw2.re) }; + let twiddle2im = unsafe { vmovq_n_f64(tw2.im) }; + let twiddle3re = unsafe { vmovq_n_f64(tw3.re) }; + let twiddle3im = unsafe { vmovq_n_f64(tw3.im) }; + + Self { + direction, + _phantom: std::marker::PhantomData, + rotate, + twiddle1re, + twiddle1im, + twiddle2re, + twiddle2im, + twiddle3re, + twiddle3im, + } + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let values = read_complex_to_array!(input, {0, 1, 2, 3, 4, 5, 6}); + + let out = self.perform_fft_direct(values); + + write_complex_to_array!(out, output, {0, 1, 2, 3, 4, 5, 6}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_direct(&self, values: [float64x2_t; 7]) -> [float64x2_t; 7] { + let [x1p6, x1m6] = solo_fft2_f64(values[1], values[6]); + let [x2p5, x2m5] = solo_fft2_f64(values[2], values[5]); + let [x3p4, x3m4] = solo_fft2_f64(values[3], values[4]); + + let t_a1_1 = vmulq_f64(self.twiddle1re, x1p6); + let t_a1_2 = vmulq_f64(self.twiddle2re, x2p5); + 
let t_a1_3 = vmulq_f64(self.twiddle3re, x3p4); + let t_a2_1 = vmulq_f64(self.twiddle2re, x1p6); + let t_a2_2 = vmulq_f64(self.twiddle3re, x2p5); + let t_a2_3 = vmulq_f64(self.twiddle1re, x3p4); + let t_a3_1 = vmulq_f64(self.twiddle3re, x1p6); + let t_a3_2 = vmulq_f64(self.twiddle1re, x2p5); + let t_a3_3 = vmulq_f64(self.twiddle2re, x3p4); + + let t_b1_1 = vmulq_f64(self.twiddle1im, x1m6); + let t_b1_2 = vmulq_f64(self.twiddle2im, x2m5); + let t_b1_3 = vmulq_f64(self.twiddle3im, x3m4); + let t_b2_1 = vmulq_f64(self.twiddle2im, x1m6); + let t_b2_2 = vmulq_f64(self.twiddle3im, x2m5); + let t_b2_3 = vmulq_f64(self.twiddle1im, x3m4); + let t_b3_1 = vmulq_f64(self.twiddle3im, x1m6); + let t_b3_2 = vmulq_f64(self.twiddle1im, x2m5); + let t_b3_3 = vmulq_f64(self.twiddle2im, x3m4); + + let x0 = values[0]; + let t_a1 = calc_f64!(x0 + t_a1_1 + t_a1_2 + t_a1_3); + let t_a2 = calc_f64!(x0 + t_a2_1 + t_a2_2 + t_a2_3); + let t_a3 = calc_f64!(x0 + t_a3_1 + t_a3_2 + t_a3_3); + + let t_b1 = calc_f64!(t_b1_1 + t_b1_2 + t_b1_3); + let t_b2 = calc_f64!(t_b2_1 - t_b2_2 - t_b2_3); + let t_b3 = calc_f64!(t_b3_1 - t_b3_2 + t_b3_3); + + let t_b1_rot = self.rotate.rotate(t_b1); + let t_b2_rot = self.rotate.rotate(t_b2); + let t_b3_rot = self.rotate.rotate(t_b3); + + let y0 = calc_f64!(x0 + x1p6 + x2p5 + x3p4); + let [y1, y6] = solo_fft2_f64(t_a1, t_b1_rot); + let [y2, y5] = solo_fft2_f64(t_a2, t_b2_rot); + let [y3, y4] = solo_fft2_f64(t_a3, t_b3_rot); + [y0, y1, y2, y3, y4, y5, y6] + + } +} + +// _ _ _________ _ _ _ +// / / | |___ /___ \| |__ (_) |_ +// | | | _____ |_ \ __) | '_ \| | __| +// | | | |_____| ___) / __/| |_) | | |_ +// |_|_| |____/_____|_.__/|_|\__| +// + +pub struct NeonF32Butterfly11 { + direction: FftDirection, + _phantom: std::marker::PhantomData, + rotate: Rotate90F32, + twiddle1re: float32x4_t, + twiddle1im: float32x4_t, + twiddle2re: float32x4_t, + twiddle2im: float32x4_t, + twiddle3re: float32x4_t, + twiddle3im: float32x4_t, + twiddle4re: float32x4_t, + twiddle4im: float32x4_t, + twiddle5re: float32x4_t, + twiddle5im: float32x4_t, +} + +boilerplate_fft_neon_f32_butterfly!(NeonF32Butterfly11, 11, |this: &NeonF32Butterfly11<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF32Butterfly11, 11, |this: &NeonF32Butterfly11<_>| this + .direction); +impl NeonF32Butterfly11 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f32::(); + let rotate = Rotate90F32::new(true); + let tw1: Complex = twiddles::compute_twiddle(1, 11, direction); + let tw2: Complex = twiddles::compute_twiddle(2, 11, direction); + let tw3: Complex = twiddles::compute_twiddle(3, 11, direction); + let tw4: Complex = twiddles::compute_twiddle(4, 11, direction); + let tw5: Complex = twiddles::compute_twiddle(5, 11, direction); + let twiddle1re = unsafe { vmovq_n_f32(tw1.re) }; + let twiddle1im = unsafe { vmovq_n_f32(tw1.im) }; + let twiddle2re = unsafe { vmovq_n_f32(tw2.re) }; + let twiddle2im = unsafe { vmovq_n_f32(tw2.im) }; + let twiddle3re = unsafe { vmovq_n_f32(tw3.re) }; + let twiddle3im = unsafe { vmovq_n_f32(tw3.im) }; + let twiddle4re = unsafe { vmovq_n_f32(tw4.re) }; + let twiddle4im = unsafe { vmovq_n_f32(tw4.im) }; + let twiddle5re = unsafe { vmovq_n_f32(tw5.re) }; + let twiddle5im = unsafe { vmovq_n_f32(tw5.im) }; + + Self { + direction, + _phantom: std::marker::PhantomData, + rotate, + twiddle1re, + twiddle1im, + twiddle2re, + twiddle2im, + twiddle3re, + twiddle3im, + twiddle4re, + twiddle4im, + twiddle5re, + twiddle5im, + } + } + + #[inline(always)] + pub(crate) unsafe fn 
perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let values = read_partial1_complex_to_array!(input, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}); + + let out = self.perform_parallel_fft_direct(values); + + write_partial_lo_complex_to_array!(out, output, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let input_packed = read_complex_to_array!(input, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20}); + + let values = [ + extract_lo_hi_f32(input_packed[0], input_packed[5]), + extract_hi_lo_f32(input_packed[0], input_packed[6]), + extract_lo_hi_f32(input_packed[1], input_packed[6]), + extract_hi_lo_f32(input_packed[1], input_packed[7]), + extract_lo_hi_f32(input_packed[2], input_packed[7]), + extract_hi_lo_f32(input_packed[2], input_packed[8]), + extract_lo_hi_f32(input_packed[3], input_packed[8]), + extract_hi_lo_f32(input_packed[3], input_packed[9]), + extract_lo_hi_f32(input_packed[4], input_packed[9]), + extract_hi_lo_f32(input_packed[4], input_packed[10]), + extract_lo_hi_f32(input_packed[5], input_packed[10]), + ]; + + let out = self.perform_parallel_fft_direct(values); + + let out_packed = [ + extract_lo_lo_f32(out[0], out[1]), + extract_lo_lo_f32(out[2], out[3]), + extract_lo_lo_f32(out[4], out[5]), + extract_lo_lo_f32(out[6], out[7]), + extract_lo_lo_f32(out[8], out[9]), + extract_lo_hi_f32(out[10], out[0]), + extract_hi_hi_f32(out[1], out[2]), + extract_hi_hi_f32(out[3], out[4]), + extract_hi_hi_f32(out[5], out[6]), + extract_hi_hi_f32(out[7], out[8]), + extract_hi_hi_f32(out[9], out[10]), + ]; + + write_complex_to_array_strided!(out_packed, output, 2, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [float32x4_t; 11]) -> [float32x4_t; 11] { + let [x1p10, x1m10] = parallel_fft2_interleaved_f32(values[1], values[10]); + let [x2p9, x2m9] = parallel_fft2_interleaved_f32(values[2], values[9]); + let [x3p8, x3m8] = parallel_fft2_interleaved_f32(values[3], values[8]); + let [x4p7, x4m7] = parallel_fft2_interleaved_f32(values[4], values[7]); + let [x5p6, x5m6] = parallel_fft2_interleaved_f32(values[5], values[6]); + + let t_a1_1 = vmulq_f32(self.twiddle1re, x1p10); + let t_a1_2 = vmulq_f32(self.twiddle2re, x2p9); + let t_a1_3 = vmulq_f32(self.twiddle3re, x3p8); + let t_a1_4 = vmulq_f32(self.twiddle4re, x4p7); + let t_a1_5 = vmulq_f32(self.twiddle5re, x5p6); + let t_a2_1 = vmulq_f32(self.twiddle2re, x1p10); + let t_a2_2 = vmulq_f32(self.twiddle4re, x2p9); + let t_a2_3 = vmulq_f32(self.twiddle5re, x3p8); + let t_a2_4 = vmulq_f32(self.twiddle3re, x4p7); + let t_a2_5 = vmulq_f32(self.twiddle1re, x5p6); + let t_a3_1 = vmulq_f32(self.twiddle3re, x1p10); + let t_a3_2 = vmulq_f32(self.twiddle5re, x2p9); + let t_a3_3 = vmulq_f32(self.twiddle2re, x3p8); + let t_a3_4 = vmulq_f32(self.twiddle1re, x4p7); + let t_a3_5 = vmulq_f32(self.twiddle4re, x5p6); + let t_a4_1 = vmulq_f32(self.twiddle4re, x1p10); + let t_a4_2 = vmulq_f32(self.twiddle3re, x2p9); + let t_a4_3 = vmulq_f32(self.twiddle1re, x3p8); + let t_a4_4 = vmulq_f32(self.twiddle5re, x4p7); + let t_a4_5 = vmulq_f32(self.twiddle2re, x5p6); + let t_a5_1 = vmulq_f32(self.twiddle5re, x1p10); + let t_a5_2 = vmulq_f32(self.twiddle1re, x2p9); + let t_a5_3 = vmulq_f32(self.twiddle4re, x3p8); + let t_a5_4 = vmulq_f32(self.twiddle2re, x4p7); + let t_a5_5 = vmulq_f32(self.twiddle3re, x5p6); + + let t_b1_1 = 
vmulq_f32(self.twiddle1im, x1m10); + let t_b1_2 = vmulq_f32(self.twiddle2im, x2m9); + let t_b1_3 = vmulq_f32(self.twiddle3im, x3m8); + let t_b1_4 = vmulq_f32(self.twiddle4im, x4m7); + let t_b1_5 = vmulq_f32(self.twiddle5im, x5m6); + let t_b2_1 = vmulq_f32(self.twiddle2im, x1m10); + let t_b2_2 = vmulq_f32(self.twiddle4im, x2m9); + let t_b2_3 = vmulq_f32(self.twiddle5im, x3m8); + let t_b2_4 = vmulq_f32(self.twiddle3im, x4m7); + let t_b2_5 = vmulq_f32(self.twiddle1im, x5m6); + let t_b3_1 = vmulq_f32(self.twiddle3im, x1m10); + let t_b3_2 = vmulq_f32(self.twiddle5im, x2m9); + let t_b3_3 = vmulq_f32(self.twiddle2im, x3m8); + let t_b3_4 = vmulq_f32(self.twiddle1im, x4m7); + let t_b3_5 = vmulq_f32(self.twiddle4im, x5m6); + let t_b4_1 = vmulq_f32(self.twiddle4im, x1m10); + let t_b4_2 = vmulq_f32(self.twiddle3im, x2m9); + let t_b4_3 = vmulq_f32(self.twiddle1im, x3m8); + let t_b4_4 = vmulq_f32(self.twiddle5im, x4m7); + let t_b4_5 = vmulq_f32(self.twiddle2im, x5m6); + let t_b5_1 = vmulq_f32(self.twiddle5im, x1m10); + let t_b5_2 = vmulq_f32(self.twiddle1im, x2m9); + let t_b5_3 = vmulq_f32(self.twiddle4im, x3m8); + let t_b5_4 = vmulq_f32(self.twiddle2im, x4m7); + let t_b5_5 = vmulq_f32(self.twiddle3im, x5m6); + + let x0 = values[0]; + let t_a1 = calc_f32!(x0 + t_a1_1 + t_a1_2 + t_a1_3 + t_a1_4 + t_a1_5); + let t_a2 = calc_f32!(x0 + t_a2_1 + t_a2_2 + t_a2_3 + t_a2_4 + t_a2_5); + let t_a3 = calc_f32!(x0 + t_a3_1 + t_a3_2 + t_a3_3 + t_a3_4 + t_a3_5); + let t_a4 = calc_f32!(x0 + t_a4_1 + t_a4_2 + t_a4_3 + t_a4_4 + t_a4_5); + let t_a5 = calc_f32!(x0 + t_a5_1 + t_a5_2 + t_a5_3 + t_a5_4 + t_a5_5); + + let t_b1 = calc_f32!(t_b1_1 + t_b1_2 + t_b1_3 + t_b1_4 + t_b1_5); + let t_b2 = calc_f32!(t_b2_1 + t_b2_2 - t_b2_3 - t_b2_4 - t_b2_5); + let t_b3 = calc_f32!(t_b3_1 - t_b3_2 - t_b3_3 + t_b3_4 + t_b3_5); + let t_b4 = calc_f32!(t_b4_1 - t_b4_2 + t_b4_3 + t_b4_4 - t_b4_5); + let t_b5 = calc_f32!(t_b5_1 - t_b5_2 + t_b5_3 - t_b5_4 + t_b5_5); + + let t_b1_rot = self.rotate.rotate_both(t_b1); + let t_b2_rot = self.rotate.rotate_both(t_b2); + let t_b3_rot = self.rotate.rotate_both(t_b3); + let t_b4_rot = self.rotate.rotate_both(t_b4); + let t_b5_rot = self.rotate.rotate_both(t_b5); + + let y0 = calc_f32!(x0 + x1p10 + x2p9 + x3p8 + x4p7 + x5p6); + let [y1, y10] = parallel_fft2_interleaved_f32(t_a1, t_b1_rot); + let [y2, y9] = parallel_fft2_interleaved_f32(t_a2, t_b2_rot); + let [y3, y8] = parallel_fft2_interleaved_f32(t_a3, t_b3_rot); + let [y4, y7] = parallel_fft2_interleaved_f32(t_a4, t_b4_rot); + let [y5, y6] = parallel_fft2_interleaved_f32(t_a5, t_b5_rot); + [y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10] + + } +} + +// _ _ __ _ _ _ _ _ +// / / | / /_ | || | | |__ (_) |_ +// | | | _____ | '_ \| || |_| '_ \| | __| +// | | | |_____| | (_) |__ _| |_) | | |_ +// |_|_| \___/ |_| |_.__/|_|\__| +// + +pub struct NeonF64Butterfly11 { + direction: FftDirection, + _phantom: std::marker::PhantomData, + rotate: Rotate90F64, + twiddle1re: float64x2_t, + twiddle1im: float64x2_t, + twiddle2re: float64x2_t, + twiddle2im: float64x2_t, + twiddle3re: float64x2_t, + twiddle3im: float64x2_t, + twiddle4re: float64x2_t, + twiddle4im: float64x2_t, + twiddle5re: float64x2_t, + twiddle5im: float64x2_t, +} + +boilerplate_fft_neon_f64_butterfly!(NeonF64Butterfly11, 11, |this: &NeonF64Butterfly11<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF64Butterfly11, 11, |this: &NeonF64Butterfly11<_>| this + .direction); +impl NeonF64Butterfly11 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f64::(); + let 
rotate = Rotate90F64::new(true); + let tw1: Complex = twiddles::compute_twiddle(1, 11, direction); + let tw2: Complex = twiddles::compute_twiddle(2, 11, direction); + let tw3: Complex = twiddles::compute_twiddle(3, 11, direction); + let tw4: Complex = twiddles::compute_twiddle(4, 11, direction); + let tw5: Complex = twiddles::compute_twiddle(5, 11, direction); + let twiddle1re = unsafe { vmovq_n_f64(tw1.re) }; + let twiddle1im = unsafe { vmovq_n_f64(tw1.im) }; + let twiddle2re = unsafe { vmovq_n_f64(tw2.re) }; + let twiddle2im = unsafe { vmovq_n_f64(tw2.im) }; + let twiddle3re = unsafe { vmovq_n_f64(tw3.re) }; + let twiddle3im = unsafe { vmovq_n_f64(tw3.im) }; + let twiddle4re = unsafe { vmovq_n_f64(tw4.re) }; + let twiddle4im = unsafe { vmovq_n_f64(tw4.im) }; + let twiddle5re = unsafe { vmovq_n_f64(tw5.re) }; + let twiddle5im = unsafe { vmovq_n_f64(tw5.im) }; + + Self { + direction, + _phantom: std::marker::PhantomData, + rotate, + twiddle1re, + twiddle1im, + twiddle2re, + twiddle2im, + twiddle3re, + twiddle3im, + twiddle4re, + twiddle4im, + twiddle5re, + twiddle5im, + } + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let values = read_complex_to_array!(input, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}); + + let out = self.perform_fft_direct(values); + + write_complex_to_array!(out, output, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_direct(&self, values: [float64x2_t; 11]) -> [float64x2_t; 11] { + let [x1p10, x1m10] = solo_fft2_f64(values[1], values[10]); + let [x2p9, x2m9] = solo_fft2_f64(values[2], values[9]); + let [x3p8, x3m8] = solo_fft2_f64(values[3], values[8]); + let [x4p7, x4m7] = solo_fft2_f64(values[4], values[7]); + let [x5p6, x5m6] = solo_fft2_f64(values[5], values[6]); + + let t_a1_1 = vmulq_f64(self.twiddle1re, x1p10); + let t_a1_2 = vmulq_f64(self.twiddle2re, x2p9); + let t_a1_3 = vmulq_f64(self.twiddle3re, x3p8); + let t_a1_4 = vmulq_f64(self.twiddle4re, x4p7); + let t_a1_5 = vmulq_f64(self.twiddle5re, x5p6); + let t_a2_1 = vmulq_f64(self.twiddle2re, x1p10); + let t_a2_2 = vmulq_f64(self.twiddle4re, x2p9); + let t_a2_3 = vmulq_f64(self.twiddle5re, x3p8); + let t_a2_4 = vmulq_f64(self.twiddle3re, x4p7); + let t_a2_5 = vmulq_f64(self.twiddle1re, x5p6); + let t_a3_1 = vmulq_f64(self.twiddle3re, x1p10); + let t_a3_2 = vmulq_f64(self.twiddle5re, x2p9); + let t_a3_3 = vmulq_f64(self.twiddle2re, x3p8); + let t_a3_4 = vmulq_f64(self.twiddle1re, x4p7); + let t_a3_5 = vmulq_f64(self.twiddle4re, x5p6); + let t_a4_1 = vmulq_f64(self.twiddle4re, x1p10); + let t_a4_2 = vmulq_f64(self.twiddle3re, x2p9); + let t_a4_3 = vmulq_f64(self.twiddle1re, x3p8); + let t_a4_4 = vmulq_f64(self.twiddle5re, x4p7); + let t_a4_5 = vmulq_f64(self.twiddle2re, x5p6); + let t_a5_1 = vmulq_f64(self.twiddle5re, x1p10); + let t_a5_2 = vmulq_f64(self.twiddle1re, x2p9); + let t_a5_3 = vmulq_f64(self.twiddle4re, x3p8); + let t_a5_4 = vmulq_f64(self.twiddle2re, x4p7); + let t_a5_5 = vmulq_f64(self.twiddle3re, x5p6); + + let t_b1_1 = vmulq_f64(self.twiddle1im, x1m10); + let t_b1_2 = vmulq_f64(self.twiddle2im, x2m9); + let t_b1_3 = vmulq_f64(self.twiddle3im, x3m8); + let t_b1_4 = vmulq_f64(self.twiddle4im, x4m7); + let t_b1_5 = vmulq_f64(self.twiddle5im, x5m6); + let t_b2_1 = vmulq_f64(self.twiddle2im, x1m10); + let t_b2_2 = vmulq_f64(self.twiddle4im, x2m9); + let t_b2_3 = vmulq_f64(self.twiddle5im, x3m8); + let t_b2_4 = vmulq_f64(self.twiddle3im, x4m7); + let t_b2_5 = 
vmulq_f64(self.twiddle1im, x5m6); + let t_b3_1 = vmulq_f64(self.twiddle3im, x1m10); + let t_b3_2 = vmulq_f64(self.twiddle5im, x2m9); + let t_b3_3 = vmulq_f64(self.twiddle2im, x3m8); + let t_b3_4 = vmulq_f64(self.twiddle1im, x4m7); + let t_b3_5 = vmulq_f64(self.twiddle4im, x5m6); + let t_b4_1 = vmulq_f64(self.twiddle4im, x1m10); + let t_b4_2 = vmulq_f64(self.twiddle3im, x2m9); + let t_b4_3 = vmulq_f64(self.twiddle1im, x3m8); + let t_b4_4 = vmulq_f64(self.twiddle5im, x4m7); + let t_b4_5 = vmulq_f64(self.twiddle2im, x5m6); + let t_b5_1 = vmulq_f64(self.twiddle5im, x1m10); + let t_b5_2 = vmulq_f64(self.twiddle1im, x2m9); + let t_b5_3 = vmulq_f64(self.twiddle4im, x3m8); + let t_b5_4 = vmulq_f64(self.twiddle2im, x4m7); + let t_b5_5 = vmulq_f64(self.twiddle3im, x5m6); + + let x0 = values[0]; + let t_a1 = calc_f64!(x0 + t_a1_1 + t_a1_2 + t_a1_3 + t_a1_4 + t_a1_5); + let t_a2 = calc_f64!(x0 + t_a2_1 + t_a2_2 + t_a2_3 + t_a2_4 + t_a2_5); + let t_a3 = calc_f64!(x0 + t_a3_1 + t_a3_2 + t_a3_3 + t_a3_4 + t_a3_5); + let t_a4 = calc_f64!(x0 + t_a4_1 + t_a4_2 + t_a4_3 + t_a4_4 + t_a4_5); + let t_a5 = calc_f64!(x0 + t_a5_1 + t_a5_2 + t_a5_3 + t_a5_4 + t_a5_5); + + let t_b1 = calc_f64!(t_b1_1 + t_b1_2 + t_b1_3 + t_b1_4 + t_b1_5); + let t_b2 = calc_f64!(t_b2_1 + t_b2_2 - t_b2_3 - t_b2_4 - t_b2_5); + let t_b3 = calc_f64!(t_b3_1 - t_b3_2 - t_b3_3 + t_b3_4 + t_b3_5); + let t_b4 = calc_f64!(t_b4_1 - t_b4_2 + t_b4_3 + t_b4_4 - t_b4_5); + let t_b5 = calc_f64!(t_b5_1 - t_b5_2 + t_b5_3 - t_b5_4 + t_b5_5); + + let t_b1_rot = self.rotate.rotate(t_b1); + let t_b2_rot = self.rotate.rotate(t_b2); + let t_b3_rot = self.rotate.rotate(t_b3); + let t_b4_rot = self.rotate.rotate(t_b4); + let t_b5_rot = self.rotate.rotate(t_b5); + + let y0 = calc_f64!(x0 + x1p10 + x2p9 + x3p8 + x4p7 + x5p6); + let [y1, y10] = solo_fft2_f64(t_a1, t_b1_rot); + let [y2, y9] = solo_fft2_f64(t_a2, t_b2_rot); + let [y3, y8] = solo_fft2_f64(t_a3, t_b3_rot); + let [y4, y7] = solo_fft2_f64(t_a4, t_b4_rot); + let [y5, y6] = solo_fft2_f64(t_a5, t_b5_rot); + [y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10] + } +} + +// _ _____ _________ _ _ _ +// / |___ / |___ /___ \| |__ (_) |_ +// | | |_ \ _____ |_ \ __) | '_ \| | __| +// | |___) | |_____| ___) / __/| |_) | | |_ +// |_|____/ |____/_____|_.__/|_|\__| +// + +pub struct NeonF32Butterfly13 { + direction: FftDirection, + _phantom: std::marker::PhantomData, + rotate: Rotate90F32, + twiddle1re: float32x4_t, + twiddle1im: float32x4_t, + twiddle2re: float32x4_t, + twiddle2im: float32x4_t, + twiddle3re: float32x4_t, + twiddle3im: float32x4_t, + twiddle4re: float32x4_t, + twiddle4im: float32x4_t, + twiddle5re: float32x4_t, + twiddle5im: float32x4_t, + twiddle6re: float32x4_t, + twiddle6im: float32x4_t, +} + +boilerplate_fft_neon_f32_butterfly!(NeonF32Butterfly13, 13, |this: &NeonF32Butterfly13<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF32Butterfly13, 13, |this: &NeonF32Butterfly13<_>| this + .direction); +impl NeonF32Butterfly13 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f32::(); + let rotate = Rotate90F32::new(true); + let tw1: Complex = twiddles::compute_twiddle(1, 13, direction); + let tw2: Complex = twiddles::compute_twiddle(2, 13, direction); + let tw3: Complex = twiddles::compute_twiddle(3, 13, direction); + let tw4: Complex = twiddles::compute_twiddle(4, 13, direction); + let tw5: Complex = twiddles::compute_twiddle(5, 13, direction); + let tw6: Complex = twiddles::compute_twiddle(6, 13, direction); + let twiddle1re = unsafe { vmovq_n_f32(tw1.re) }; 
+ let twiddle1im = unsafe { vmovq_n_f32(tw1.im) }; + let twiddle2re = unsafe { vmovq_n_f32(tw2.re) }; + let twiddle2im = unsafe { vmovq_n_f32(tw2.im) }; + let twiddle3re = unsafe { vmovq_n_f32(tw3.re) }; + let twiddle3im = unsafe { vmovq_n_f32(tw3.im) }; + let twiddle4re = unsafe { vmovq_n_f32(tw4.re) }; + let twiddle4im = unsafe { vmovq_n_f32(tw4.im) }; + let twiddle5re = unsafe { vmovq_n_f32(tw5.re) }; + let twiddle5im = unsafe { vmovq_n_f32(tw5.im) }; + let twiddle6re = unsafe { vmovq_n_f32(tw6.re) }; + let twiddle6im = unsafe { vmovq_n_f32(tw6.im) }; + + Self { + direction, + _phantom: std::marker::PhantomData, + rotate, + twiddle1re, + twiddle1im, + twiddle2re, + twiddle2im, + twiddle3re, + twiddle3im, + twiddle4re, + twiddle4im, + twiddle5re, + twiddle5im, + twiddle6re, + twiddle6im, + } + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let values = read_partial1_complex_to_array!(input, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + + let out = self.perform_parallel_fft_direct(values); + + write_partial_lo_complex_to_array!(out, output, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let input_packed = read_complex_to_array!(input, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24}); + + let values = [ + extract_lo_hi_f32(input_packed[0], input_packed[6]), + extract_hi_lo_f32(input_packed[0], input_packed[7]), + extract_lo_hi_f32(input_packed[1], input_packed[7]), + extract_hi_lo_f32(input_packed[1], input_packed[8]), + extract_lo_hi_f32(input_packed[2], input_packed[8]), + extract_hi_lo_f32(input_packed[2], input_packed[9]), + extract_lo_hi_f32(input_packed[3], input_packed[9]), + extract_hi_lo_f32(input_packed[3], input_packed[10]), + extract_lo_hi_f32(input_packed[4], input_packed[10]), + extract_hi_lo_f32(input_packed[4], input_packed[11]), + extract_lo_hi_f32(input_packed[5], input_packed[11]), + extract_hi_lo_f32(input_packed[5], input_packed[12]), + extract_lo_hi_f32(input_packed[6], input_packed[12]), + ]; + + let out = self.perform_parallel_fft_direct(values); + + let out_packed = [ + extract_lo_lo_f32(out[0], out[1]), + extract_lo_lo_f32(out[2], out[3]), + extract_lo_lo_f32(out[4], out[5]), + extract_lo_lo_f32(out[6], out[7]), + extract_lo_lo_f32(out[8], out[9]), + extract_lo_lo_f32(out[10], out[11]), + extract_lo_hi_f32(out[12], out[0]), + extract_hi_hi_f32(out[1], out[2]), + extract_hi_hi_f32(out[3], out[4]), + extract_hi_hi_f32(out[5], out[6]), + extract_hi_hi_f32(out[7], out[8]), + extract_hi_hi_f32(out[9], out[10]), + extract_hi_hi_f32(out[11], out[12]), + ]; + + write_complex_to_array_strided!(out_packed, output, 2, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [float32x4_t; 13]) -> [float32x4_t; 13] { + let [x1p12, x1m12] = parallel_fft2_interleaved_f32(values[1], values[12]); + let [x2p11, x2m11] = parallel_fft2_interleaved_f32(values[2], values[11]); + let [x3p10, x3m10] = parallel_fft2_interleaved_f32(values[3], values[10]); + let [x4p9, x4m9] = parallel_fft2_interleaved_f32(values[4], values[9]); + let [x5p8, x5m8] = parallel_fft2_interleaved_f32(values[5], values[8]); + let [x6p7, x6m7] = parallel_fft2_interleaved_f32(values[6], values[7]); + + let t_a1_1 = vmulq_f32(self.twiddle1re, x1p12); + let t_a1_2 = vmulq_f32(self.twiddle2re, x2p11); + let t_a1_3 = 
vmulq_f32(self.twiddle3re, x3p10); + let t_a1_4 = vmulq_f32(self.twiddle4re, x4p9); + let t_a1_5 = vmulq_f32(self.twiddle5re, x5p8); + let t_a1_6 = vmulq_f32(self.twiddle6re, x6p7); + let t_a2_1 = vmulq_f32(self.twiddle2re, x1p12); + let t_a2_2 = vmulq_f32(self.twiddle4re, x2p11); + let t_a2_3 = vmulq_f32(self.twiddle6re, x3p10); + let t_a2_4 = vmulq_f32(self.twiddle5re, x4p9); + let t_a2_5 = vmulq_f32(self.twiddle3re, x5p8); + let t_a2_6 = vmulq_f32(self.twiddle1re, x6p7); + let t_a3_1 = vmulq_f32(self.twiddle3re, x1p12); + let t_a3_2 = vmulq_f32(self.twiddle6re, x2p11); + let t_a3_3 = vmulq_f32(self.twiddle4re, x3p10); + let t_a3_4 = vmulq_f32(self.twiddle1re, x4p9); + let t_a3_5 = vmulq_f32(self.twiddle2re, x5p8); + let t_a3_6 = vmulq_f32(self.twiddle5re, x6p7); + let t_a4_1 = vmulq_f32(self.twiddle4re, x1p12); + let t_a4_2 = vmulq_f32(self.twiddle5re, x2p11); + let t_a4_3 = vmulq_f32(self.twiddle1re, x3p10); + let t_a4_4 = vmulq_f32(self.twiddle3re, x4p9); + let t_a4_5 = vmulq_f32(self.twiddle6re, x5p8); + let t_a4_6 = vmulq_f32(self.twiddle2re, x6p7); + let t_a5_1 = vmulq_f32(self.twiddle5re, x1p12); + let t_a5_2 = vmulq_f32(self.twiddle3re, x2p11); + let t_a5_3 = vmulq_f32(self.twiddle2re, x3p10); + let t_a5_4 = vmulq_f32(self.twiddle6re, x4p9); + let t_a5_5 = vmulq_f32(self.twiddle1re, x5p8); + let t_a5_6 = vmulq_f32(self.twiddle4re, x6p7); + let t_a6_1 = vmulq_f32(self.twiddle6re, x1p12); + let t_a6_2 = vmulq_f32(self.twiddle1re, x2p11); + let t_a6_3 = vmulq_f32(self.twiddle5re, x3p10); + let t_a6_4 = vmulq_f32(self.twiddle2re, x4p9); + let t_a6_5 = vmulq_f32(self.twiddle4re, x5p8); + let t_a6_6 = vmulq_f32(self.twiddle3re, x6p7); + + let t_b1_1 = vmulq_f32(self.twiddle1im, x1m12); + let t_b1_2 = vmulq_f32(self.twiddle2im, x2m11); + let t_b1_3 = vmulq_f32(self.twiddle3im, x3m10); + let t_b1_4 = vmulq_f32(self.twiddle4im, x4m9); + let t_b1_5 = vmulq_f32(self.twiddle5im, x5m8); + let t_b1_6 = vmulq_f32(self.twiddle6im, x6m7); + let t_b2_1 = vmulq_f32(self.twiddle2im, x1m12); + let t_b2_2 = vmulq_f32(self.twiddle4im, x2m11); + let t_b2_3 = vmulq_f32(self.twiddle6im, x3m10); + let t_b2_4 = vmulq_f32(self.twiddle5im, x4m9); + let t_b2_5 = vmulq_f32(self.twiddle3im, x5m8); + let t_b2_6 = vmulq_f32(self.twiddle1im, x6m7); + let t_b3_1 = vmulq_f32(self.twiddle3im, x1m12); + let t_b3_2 = vmulq_f32(self.twiddle6im, x2m11); + let t_b3_3 = vmulq_f32(self.twiddle4im, x3m10); + let t_b3_4 = vmulq_f32(self.twiddle1im, x4m9); + let t_b3_5 = vmulq_f32(self.twiddle2im, x5m8); + let t_b3_6 = vmulq_f32(self.twiddle5im, x6m7); + let t_b4_1 = vmulq_f32(self.twiddle4im, x1m12); + let t_b4_2 = vmulq_f32(self.twiddle5im, x2m11); + let t_b4_3 = vmulq_f32(self.twiddle1im, x3m10); + let t_b4_4 = vmulq_f32(self.twiddle3im, x4m9); + let t_b4_5 = vmulq_f32(self.twiddle6im, x5m8); + let t_b4_6 = vmulq_f32(self.twiddle2im, x6m7); + let t_b5_1 = vmulq_f32(self.twiddle5im, x1m12); + let t_b5_2 = vmulq_f32(self.twiddle3im, x2m11); + let t_b5_3 = vmulq_f32(self.twiddle2im, x3m10); + let t_b5_4 = vmulq_f32(self.twiddle6im, x4m9); + let t_b5_5 = vmulq_f32(self.twiddle1im, x5m8); + let t_b5_6 = vmulq_f32(self.twiddle4im, x6m7); + let t_b6_1 = vmulq_f32(self.twiddle6im, x1m12); + let t_b6_2 = vmulq_f32(self.twiddle1im, x2m11); + let t_b6_3 = vmulq_f32(self.twiddle5im, x3m10); + let t_b6_4 = vmulq_f32(self.twiddle2im, x4m9); + let t_b6_5 = vmulq_f32(self.twiddle4im, x5m8); + let t_b6_6 = vmulq_f32(self.twiddle3im, x6m7); + + let x0 = values[0]; + let t_a1 = calc_f32!(x0 + t_a1_1 + t_a1_2 + t_a1_3 + t_a1_4 + t_a1_5 + 
t_a1_6); + let t_a2 = calc_f32!(x0 + t_a2_1 + t_a2_2 + t_a2_3 + t_a2_4 + t_a2_5 + t_a2_6); + let t_a3 = calc_f32!(x0 + t_a3_1 + t_a3_2 + t_a3_3 + t_a3_4 + t_a3_5 + t_a3_6); + let t_a4 = calc_f32!(x0 + t_a4_1 + t_a4_2 + t_a4_3 + t_a4_4 + t_a4_5 + t_a4_6); + let t_a5 = calc_f32!(x0 + t_a5_1 + t_a5_2 + t_a5_3 + t_a5_4 + t_a5_5 + t_a5_6); + let t_a6 = calc_f32!(x0 + t_a6_1 + t_a6_2 + t_a6_3 + t_a6_4 + t_a6_5 + t_a6_6); + + let t_b1 = calc_f32!(t_b1_1 + t_b1_2 + t_b1_3 + t_b1_4 + t_b1_5 + t_b1_6); + let t_b2 = calc_f32!(t_b2_1 + t_b2_2 + t_b2_3 - t_b2_4 - t_b2_5 - t_b2_6); + let t_b3 = calc_f32!(t_b3_1 + t_b3_2 - t_b3_3 - t_b3_4 + t_b3_5 + t_b3_6); + let t_b4 = calc_f32!(t_b4_1 - t_b4_2 - t_b4_3 + t_b4_4 - t_b4_5 - t_b4_6); + let t_b5 = calc_f32!(t_b5_1 - t_b5_2 + t_b5_3 - t_b5_4 - t_b5_5 + t_b5_6); + let t_b6 = calc_f32!(t_b6_1 - t_b6_2 + t_b6_3 - t_b6_4 + t_b6_5 - t_b6_6); + + let t_b1_rot = self.rotate.rotate_both(t_b1); + let t_b2_rot = self.rotate.rotate_both(t_b2); + let t_b3_rot = self.rotate.rotate_both(t_b3); + let t_b4_rot = self.rotate.rotate_both(t_b4); + let t_b5_rot = self.rotate.rotate_both(t_b5); + let t_b6_rot = self.rotate.rotate_both(t_b6); + + let y0 = calc_f32!(x0 + x1p12 + x2p11 + x3p10 + x4p9 + x5p8 + x6p7); + let [y1, y12] = parallel_fft2_interleaved_f32(t_a1, t_b1_rot); + let [y2, y11] = parallel_fft2_interleaved_f32(t_a2, t_b2_rot); + let [y3, y10] = parallel_fft2_interleaved_f32(t_a3, t_b3_rot); + let [y4, y9] = parallel_fft2_interleaved_f32(t_a4, t_b4_rot); + let [y5, y8] = parallel_fft2_interleaved_f32(t_a5, t_b5_rot); + let [y6, y7] = parallel_fft2_interleaved_f32(t_a6, t_b6_rot); + [y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12] + } +} + +// _ _____ __ _ _ _ _ _ +// / |___ / / /_ | || | | |__ (_) |_ +// | | |_ \ _____ | '_ \| || |_| '_ \| | __| +// | |___) | |_____| | (_) |__ _| |_) | | |_ +// |_|____/ \___/ |_| |_.__/|_|\__| +// + +pub struct NeonF64Butterfly13 { + direction: FftDirection, + _phantom: std::marker::PhantomData, + rotate: Rotate90F64, + twiddle1re: float64x2_t, + twiddle1im: float64x2_t, + twiddle2re: float64x2_t, + twiddle2im: float64x2_t, + twiddle3re: float64x2_t, + twiddle3im: float64x2_t, + twiddle4re: float64x2_t, + twiddle4im: float64x2_t, + twiddle5re: float64x2_t, + twiddle5im: float64x2_t, + twiddle6re: float64x2_t, + twiddle6im: float64x2_t, +} + +boilerplate_fft_neon_f64_butterfly!(NeonF64Butterfly13, 13, |this: &NeonF64Butterfly13<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF64Butterfly13, 13, |this: &NeonF64Butterfly13<_>| this + .direction); +impl NeonF64Butterfly13 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f64::(); + let rotate = Rotate90F64::new(true); + let tw1: Complex = twiddles::compute_twiddle(1, 13, direction); + let tw2: Complex = twiddles::compute_twiddle(2, 13, direction); + let tw3: Complex = twiddles::compute_twiddle(3, 13, direction); + let tw4: Complex = twiddles::compute_twiddle(4, 13, direction); + let tw5: Complex = twiddles::compute_twiddle(5, 13, direction); + let tw6: Complex = twiddles::compute_twiddle(6, 13, direction); + let twiddle1re = unsafe { vmovq_n_f64(tw1.re) }; + let twiddle1im = unsafe { vmovq_n_f64(tw1.im) }; + let twiddle2re = unsafe { vmovq_n_f64(tw2.re) }; + let twiddle2im = unsafe { vmovq_n_f64(tw2.im) }; + let twiddle3re = unsafe { vmovq_n_f64(tw3.re) }; + let twiddle3im = unsafe { vmovq_n_f64(tw3.im) }; + let twiddle4re = unsafe { vmovq_n_f64(tw4.re) }; + let twiddle4im = unsafe { vmovq_n_f64(tw4.im) }; + let twiddle5re = 
unsafe { vmovq_n_f64(tw5.re) }; + let twiddle5im = unsafe { vmovq_n_f64(tw5.im) }; + let twiddle6re = unsafe { vmovq_n_f64(tw6.re) }; + let twiddle6im = unsafe { vmovq_n_f64(tw6.im) }; + + Self { + direction, + _phantom: std::marker::PhantomData, + rotate, + twiddle1re, + twiddle1im, + twiddle2re, + twiddle2im, + twiddle3re, + twiddle3im, + twiddle4re, + twiddle4im, + twiddle5re, + twiddle5im, + twiddle6re, + twiddle6im, + } + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let values = read_complex_to_array!(input, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + + let out = self.perform_fft_direct(values); + + write_complex_to_array!(out, output, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_direct(&self, values: [float64x2_t; 13]) -> [float64x2_t; 13] { + let [x1p12, x1m12] = solo_fft2_f64(values[1], values[12]); + let [x2p11, x2m11] = solo_fft2_f64(values[2], values[11]); + let [x3p10, x3m10] = solo_fft2_f64(values[3], values[10]); + let [x4p9, x4m9] = solo_fft2_f64(values[4], values[9]); + let [x5p8, x5m8] = solo_fft2_f64(values[5], values[8]); + let [x6p7, x6m7] = solo_fft2_f64(values[6], values[7]); + + let t_a1_1 = vmulq_f64(self.twiddle1re, x1p12); + let t_a1_2 = vmulq_f64(self.twiddle2re, x2p11); + let t_a1_3 = vmulq_f64(self.twiddle3re, x3p10); + let t_a1_4 = vmulq_f64(self.twiddle4re, x4p9); + let t_a1_5 = vmulq_f64(self.twiddle5re, x5p8); + let t_a1_6 = vmulq_f64(self.twiddle6re, x6p7); + let t_a2_1 = vmulq_f64(self.twiddle2re, x1p12); + let t_a2_2 = vmulq_f64(self.twiddle4re, x2p11); + let t_a2_3 = vmulq_f64(self.twiddle6re, x3p10); + let t_a2_4 = vmulq_f64(self.twiddle5re, x4p9); + let t_a2_5 = vmulq_f64(self.twiddle3re, x5p8); + let t_a2_6 = vmulq_f64(self.twiddle1re, x6p7); + let t_a3_1 = vmulq_f64(self.twiddle3re, x1p12); + let t_a3_2 = vmulq_f64(self.twiddle6re, x2p11); + let t_a3_3 = vmulq_f64(self.twiddle4re, x3p10); + let t_a3_4 = vmulq_f64(self.twiddle1re, x4p9); + let t_a3_5 = vmulq_f64(self.twiddle2re, x5p8); + let t_a3_6 = vmulq_f64(self.twiddle5re, x6p7); + let t_a4_1 = vmulq_f64(self.twiddle4re, x1p12); + let t_a4_2 = vmulq_f64(self.twiddle5re, x2p11); + let t_a4_3 = vmulq_f64(self.twiddle1re, x3p10); + let t_a4_4 = vmulq_f64(self.twiddle3re, x4p9); + let t_a4_5 = vmulq_f64(self.twiddle6re, x5p8); + let t_a4_6 = vmulq_f64(self.twiddle2re, x6p7); + let t_a5_1 = vmulq_f64(self.twiddle5re, x1p12); + let t_a5_2 = vmulq_f64(self.twiddle3re, x2p11); + let t_a5_3 = vmulq_f64(self.twiddle2re, x3p10); + let t_a5_4 = vmulq_f64(self.twiddle6re, x4p9); + let t_a5_5 = vmulq_f64(self.twiddle1re, x5p8); + let t_a5_6 = vmulq_f64(self.twiddle4re, x6p7); + let t_a6_1 = vmulq_f64(self.twiddle6re, x1p12); + let t_a6_2 = vmulq_f64(self.twiddle1re, x2p11); + let t_a6_3 = vmulq_f64(self.twiddle5re, x3p10); + let t_a6_4 = vmulq_f64(self.twiddle2re, x4p9); + let t_a6_5 = vmulq_f64(self.twiddle4re, x5p8); + let t_a6_6 = vmulq_f64(self.twiddle3re, x6p7); + + let t_b1_1 = vmulq_f64(self.twiddle1im, x1m12); + let t_b1_2 = vmulq_f64(self.twiddle2im, x2m11); + let t_b1_3 = vmulq_f64(self.twiddle3im, x3m10); + let t_b1_4 = vmulq_f64(self.twiddle4im, x4m9); + let t_b1_5 = vmulq_f64(self.twiddle5im, x5m8); + let t_b1_6 = vmulq_f64(self.twiddle6im, x6m7); + let t_b2_1 = vmulq_f64(self.twiddle2im, x1m12); + let t_b2_2 = vmulq_f64(self.twiddle4im, x2m11); + let t_b2_3 = vmulq_f64(self.twiddle6im, x3m10); + let t_b2_4 = vmulq_f64(self.twiddle5im, x4m9); + let 
t_b2_5 = vmulq_f64(self.twiddle3im, x5m8); + let t_b2_6 = vmulq_f64(self.twiddle1im, x6m7); + let t_b3_1 = vmulq_f64(self.twiddle3im, x1m12); + let t_b3_2 = vmulq_f64(self.twiddle6im, x2m11); + let t_b3_3 = vmulq_f64(self.twiddle4im, x3m10); + let t_b3_4 = vmulq_f64(self.twiddle1im, x4m9); + let t_b3_5 = vmulq_f64(self.twiddle2im, x5m8); + let t_b3_6 = vmulq_f64(self.twiddle5im, x6m7); + let t_b4_1 = vmulq_f64(self.twiddle4im, x1m12); + let t_b4_2 = vmulq_f64(self.twiddle5im, x2m11); + let t_b4_3 = vmulq_f64(self.twiddle1im, x3m10); + let t_b4_4 = vmulq_f64(self.twiddle3im, x4m9); + let t_b4_5 = vmulq_f64(self.twiddle6im, x5m8); + let t_b4_6 = vmulq_f64(self.twiddle2im, x6m7); + let t_b5_1 = vmulq_f64(self.twiddle5im, x1m12); + let t_b5_2 = vmulq_f64(self.twiddle3im, x2m11); + let t_b5_3 = vmulq_f64(self.twiddle2im, x3m10); + let t_b5_4 = vmulq_f64(self.twiddle6im, x4m9); + let t_b5_5 = vmulq_f64(self.twiddle1im, x5m8); + let t_b5_6 = vmulq_f64(self.twiddle4im, x6m7); + let t_b6_1 = vmulq_f64(self.twiddle6im, x1m12); + let t_b6_2 = vmulq_f64(self.twiddle1im, x2m11); + let t_b6_3 = vmulq_f64(self.twiddle5im, x3m10); + let t_b6_4 = vmulq_f64(self.twiddle2im, x4m9); + let t_b6_5 = vmulq_f64(self.twiddle4im, x5m8); + let t_b6_6 = vmulq_f64(self.twiddle3im, x6m7); + + let x0 = values[0]; + let t_a1 = calc_f64!(x0 + t_a1_1 + t_a1_2 + t_a1_3 + t_a1_4 + t_a1_5 + t_a1_6); + let t_a2 = calc_f64!(x0 + t_a2_1 + t_a2_2 + t_a2_3 + t_a2_4 + t_a2_5 + t_a2_6); + let t_a3 = calc_f64!(x0 + t_a3_1 + t_a3_2 + t_a3_3 + t_a3_4 + t_a3_5 + t_a3_6); + let t_a4 = calc_f64!(x0 + t_a4_1 + t_a4_2 + t_a4_3 + t_a4_4 + t_a4_5 + t_a4_6); + let t_a5 = calc_f64!(x0 + t_a5_1 + t_a5_2 + t_a5_3 + t_a5_4 + t_a5_5 + t_a5_6); + let t_a6 = calc_f64!(x0 + t_a6_1 + t_a6_2 + t_a6_3 + t_a6_4 + t_a6_5 + t_a6_6); + + let t_b1 = calc_f64!(t_b1_1 + t_b1_2 + t_b1_3 + t_b1_4 + t_b1_5 + t_b1_6); + let t_b2 = calc_f64!(t_b2_1 + t_b2_2 + t_b2_3 - t_b2_4 - t_b2_5 - t_b2_6); + let t_b3 = calc_f64!(t_b3_1 + t_b3_2 - t_b3_3 - t_b3_4 + t_b3_5 + t_b3_6); + let t_b4 = calc_f64!(t_b4_1 - t_b4_2 - t_b4_3 + t_b4_4 - t_b4_5 - t_b4_6); + let t_b5 = calc_f64!(t_b5_1 - t_b5_2 + t_b5_3 - t_b5_4 - t_b5_5 + t_b5_6); + let t_b6 = calc_f64!(t_b6_1 - t_b6_2 + t_b6_3 - t_b6_4 + t_b6_5 - t_b6_6); + + let t_b1_rot = self.rotate.rotate(t_b1); + let t_b2_rot = self.rotate.rotate(t_b2); + let t_b3_rot = self.rotate.rotate(t_b3); + let t_b4_rot = self.rotate.rotate(t_b4); + let t_b5_rot = self.rotate.rotate(t_b5); + let t_b6_rot = self.rotate.rotate(t_b6); + + let y0 = calc_f64!(x0 + x1p12 + x2p11 + x3p10 + x4p9 + x5p8 + x6p7); + let [y1, y12] = solo_fft2_f64(t_a1, t_b1_rot); + let [y2, y11] = solo_fft2_f64(t_a2, t_b2_rot); + let [y3, y10] = solo_fft2_f64(t_a3, t_b3_rot); + let [y4, y9] = solo_fft2_f64(t_a4, t_b4_rot); + let [y5, y8] = solo_fft2_f64(t_a5, t_b5_rot); + let [y6, y7] = solo_fft2_f64(t_a6, t_b6_rot); + [y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12] + } +} + +// _ _____ _________ _ _ _ +// / |___ | |___ /___ \| |__ (_) |_ +// | | / / _____ |_ \ __) | '_ \| | __| +// | | / / |_____| ___) / __/| |_) | | |_ +// |_|/_/ |____/_____|_.__/|_|\__| +// + +pub struct NeonF32Butterfly17 { + direction: FftDirection, + _phantom: std::marker::PhantomData, + rotate: Rotate90F32, + twiddle1re: float32x4_t, + twiddle1im: float32x4_t, + twiddle2re: float32x4_t, + twiddle2im: float32x4_t, + twiddle3re: float32x4_t, + twiddle3im: float32x4_t, + twiddle4re: float32x4_t, + twiddle4im: float32x4_t, + twiddle5re: float32x4_t, + twiddle5im: float32x4_t, + twiddle6re: 
float32x4_t, + twiddle6im: float32x4_t, + twiddle7re: float32x4_t, + twiddle7im: float32x4_t, + twiddle8re: float32x4_t, + twiddle8im: float32x4_t, +} + +boilerplate_fft_neon_f32_butterfly!(NeonF32Butterfly17, 17, |this: &NeonF32Butterfly17<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF32Butterfly17, 17, |this: &NeonF32Butterfly17<_>| this + .direction); +impl NeonF32Butterfly17 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f32::(); + let rotate = Rotate90F32::new(true); + let tw1: Complex = twiddles::compute_twiddle(1, 17, direction); + let tw2: Complex = twiddles::compute_twiddle(2, 17, direction); + let tw3: Complex = twiddles::compute_twiddle(3, 17, direction); + let tw4: Complex = twiddles::compute_twiddle(4, 17, direction); + let tw5: Complex = twiddles::compute_twiddle(5, 17, direction); + let tw6: Complex = twiddles::compute_twiddle(6, 17, direction); + let tw7: Complex = twiddles::compute_twiddle(7, 17, direction); + let tw8: Complex = twiddles::compute_twiddle(8, 17, direction); + let twiddle1re = unsafe { vmovq_n_f32(tw1.re) }; + let twiddle1im = unsafe { vmovq_n_f32(tw1.im) }; + let twiddle2re = unsafe { vmovq_n_f32(tw2.re) }; + let twiddle2im = unsafe { vmovq_n_f32(tw2.im) }; + let twiddle3re = unsafe { vmovq_n_f32(tw3.re) }; + let twiddle3im = unsafe { vmovq_n_f32(tw3.im) }; + let twiddle4re = unsafe { vmovq_n_f32(tw4.re) }; + let twiddle4im = unsafe { vmovq_n_f32(tw4.im) }; + let twiddle5re = unsafe { vmovq_n_f32(tw5.re) }; + let twiddle5im = unsafe { vmovq_n_f32(tw5.im) }; + let twiddle6re = unsafe { vmovq_n_f32(tw6.re) }; + let twiddle6im = unsafe { vmovq_n_f32(tw6.im) }; + let twiddle7re = unsafe { vmovq_n_f32(tw7.re) }; + let twiddle7im = unsafe { vmovq_n_f32(tw7.im) }; + let twiddle8re = unsafe { vmovq_n_f32(tw8.re) }; + let twiddle8im = unsafe { vmovq_n_f32(tw8.im) }; + + Self { + direction, + _phantom: std::marker::PhantomData, + rotate, + twiddle1re, + twiddle1im, + twiddle2re, + twiddle2im, + twiddle3re, + twiddle3im, + twiddle4re, + twiddle4im, + twiddle5re, + twiddle5im, + twiddle6re, + twiddle6im, + twiddle7re, + twiddle7im, + twiddle8re, + twiddle8im, + } + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let values = read_partial1_complex_to_array!(input, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); + + let out = self.perform_parallel_fft_direct(values); + + write_partial_lo_complex_to_array!(out, output, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let input_packed = read_complex_to_array!(input, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32}); + + let values = [ + extract_lo_hi_f32(input_packed[0], input_packed[8]), + extract_hi_lo_f32(input_packed[0], input_packed[9]), + extract_lo_hi_f32(input_packed[1], input_packed[9]), + extract_hi_lo_f32(input_packed[1], input_packed[10]), + extract_lo_hi_f32(input_packed[2], input_packed[10]), + extract_hi_lo_f32(input_packed[2], input_packed[11]), + extract_lo_hi_f32(input_packed[3], input_packed[11]), + extract_hi_lo_f32(input_packed[3], input_packed[12]), + extract_lo_hi_f32(input_packed[4], input_packed[12]), + extract_hi_lo_f32(input_packed[4], input_packed[13]), + extract_lo_hi_f32(input_packed[5], input_packed[13]), + extract_hi_lo_f32(input_packed[5], input_packed[14]), + 
extract_lo_hi_f32(input_packed[6], input_packed[14]), + extract_hi_lo_f32(input_packed[6], input_packed[15]), + extract_lo_hi_f32(input_packed[7], input_packed[15]), + extract_hi_lo_f32(input_packed[7], input_packed[16]), + extract_lo_hi_f32(input_packed[8], input_packed[16]), + ]; + + let out = self.perform_parallel_fft_direct(values); + + let out_packed = [ + extract_lo_lo_f32(out[0], out[1]), + extract_lo_lo_f32(out[2], out[3]), + extract_lo_lo_f32(out[4], out[5]), + extract_lo_lo_f32(out[6], out[7]), + extract_lo_lo_f32(out[8], out[9]), + extract_lo_lo_f32(out[10], out[11]), + extract_lo_lo_f32(out[12], out[13]), + extract_lo_lo_f32(out[14], out[15]), + extract_lo_hi_f32(out[16], out[0]), + extract_hi_hi_f32(out[1], out[2]), + extract_hi_hi_f32(out[3], out[4]), + extract_hi_hi_f32(out[5], out[6]), + extract_hi_hi_f32(out[7], out[8]), + extract_hi_hi_f32(out[9], out[10]), + extract_hi_hi_f32(out[11], out[12]), + extract_hi_hi_f32(out[13], out[14]), + extract_hi_hi_f32(out[15], out[16]), + ]; + + write_complex_to_array_strided!(out_packed, output, 2, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [float32x4_t; 17]) -> [float32x4_t; 17] { + let [x1p16, x1m16] = parallel_fft2_interleaved_f32(values[1], values[16]); + let [x2p15, x2m15] = parallel_fft2_interleaved_f32(values[2], values[15]); + let [x3p14, x3m14] = parallel_fft2_interleaved_f32(values[3], values[14]); + let [x4p13, x4m13] = parallel_fft2_interleaved_f32(values[4], values[13]); + let [x5p12, x5m12] = parallel_fft2_interleaved_f32(values[5], values[12]); + let [x6p11, x6m11] = parallel_fft2_interleaved_f32(values[6], values[11]); + let [x7p10, x7m10] = parallel_fft2_interleaved_f32(values[7], values[10]); + let [x8p9, x8m9] = parallel_fft2_interleaved_f32(values[8], values[9]); + + let t_a1_1 = vmulq_f32(self.twiddle1re, x1p16); + let t_a1_2 = vmulq_f32(self.twiddle2re, x2p15); + let t_a1_3 = vmulq_f32(self.twiddle3re, x3p14); + let t_a1_4 = vmulq_f32(self.twiddle4re, x4p13); + let t_a1_5 = vmulq_f32(self.twiddle5re, x5p12); + let t_a1_6 = vmulq_f32(self.twiddle6re, x6p11); + let t_a1_7 = vmulq_f32(self.twiddle7re, x7p10); + let t_a1_8 = vmulq_f32(self.twiddle8re, x8p9); + let t_a2_1 = vmulq_f32(self.twiddle2re, x1p16); + let t_a2_2 = vmulq_f32(self.twiddle4re, x2p15); + let t_a2_3 = vmulq_f32(self.twiddle6re, x3p14); + let t_a2_4 = vmulq_f32(self.twiddle8re, x4p13); + let t_a2_5 = vmulq_f32(self.twiddle7re, x5p12); + let t_a2_6 = vmulq_f32(self.twiddle5re, x6p11); + let t_a2_7 = vmulq_f32(self.twiddle3re, x7p10); + let t_a2_8 = vmulq_f32(self.twiddle1re, x8p9); + let t_a3_1 = vmulq_f32(self.twiddle3re, x1p16); + let t_a3_2 = vmulq_f32(self.twiddle6re, x2p15); + let t_a3_3 = vmulq_f32(self.twiddle8re, x3p14); + let t_a3_4 = vmulq_f32(self.twiddle5re, x4p13); + let t_a3_5 = vmulq_f32(self.twiddle2re, x5p12); + let t_a3_6 = vmulq_f32(self.twiddle1re, x6p11); + let t_a3_7 = vmulq_f32(self.twiddle4re, x7p10); + let t_a3_8 = vmulq_f32(self.twiddle7re, x8p9); + let t_a4_1 = vmulq_f32(self.twiddle4re, x1p16); + let t_a4_2 = vmulq_f32(self.twiddle8re, x2p15); + let t_a4_3 = vmulq_f32(self.twiddle5re, x3p14); + let t_a4_4 = vmulq_f32(self.twiddle1re, x4p13); + let t_a4_5 = vmulq_f32(self.twiddle3re, x5p12); + let t_a4_6 = vmulq_f32(self.twiddle7re, x6p11); + let t_a4_7 = vmulq_f32(self.twiddle6re, x7p10); + let t_a4_8 = vmulq_f32(self.twiddle2re, x8p9); + let t_a5_1 = vmulq_f32(self.twiddle5re, x1p16); + let t_a5_2 = 
vmulq_f32(self.twiddle7re, x2p15); + let t_a5_3 = vmulq_f32(self.twiddle2re, x3p14); + let t_a5_4 = vmulq_f32(self.twiddle3re, x4p13); + let t_a5_5 = vmulq_f32(self.twiddle8re, x5p12); + let t_a5_6 = vmulq_f32(self.twiddle4re, x6p11); + let t_a5_7 = vmulq_f32(self.twiddle1re, x7p10); + let t_a5_8 = vmulq_f32(self.twiddle6re, x8p9); + let t_a6_1 = vmulq_f32(self.twiddle6re, x1p16); + let t_a6_2 = vmulq_f32(self.twiddle5re, x2p15); + let t_a6_3 = vmulq_f32(self.twiddle1re, x3p14); + let t_a6_4 = vmulq_f32(self.twiddle7re, x4p13); + let t_a6_5 = vmulq_f32(self.twiddle4re, x5p12); + let t_a6_6 = vmulq_f32(self.twiddle2re, x6p11); + let t_a6_7 = vmulq_f32(self.twiddle8re, x7p10); + let t_a6_8 = vmulq_f32(self.twiddle3re, x8p9); + let t_a7_1 = vmulq_f32(self.twiddle7re, x1p16); + let t_a7_2 = vmulq_f32(self.twiddle3re, x2p15); + let t_a7_3 = vmulq_f32(self.twiddle4re, x3p14); + let t_a7_4 = vmulq_f32(self.twiddle6re, x4p13); + let t_a7_5 = vmulq_f32(self.twiddle1re, x5p12); + let t_a7_6 = vmulq_f32(self.twiddle8re, x6p11); + let t_a7_7 = vmulq_f32(self.twiddle2re, x7p10); + let t_a7_8 = vmulq_f32(self.twiddle5re, x8p9); + let t_a8_1 = vmulq_f32(self.twiddle8re, x1p16); + let t_a8_2 = vmulq_f32(self.twiddle1re, x2p15); + let t_a8_3 = vmulq_f32(self.twiddle7re, x3p14); + let t_a8_4 = vmulq_f32(self.twiddle2re, x4p13); + let t_a8_5 = vmulq_f32(self.twiddle6re, x5p12); + let t_a8_6 = vmulq_f32(self.twiddle3re, x6p11); + let t_a8_7 = vmulq_f32(self.twiddle5re, x7p10); + let t_a8_8 = vmulq_f32(self.twiddle4re, x8p9); + + let t_b1_1 = vmulq_f32(self.twiddle1im, x1m16); + let t_b1_2 = vmulq_f32(self.twiddle2im, x2m15); + let t_b1_3 = vmulq_f32(self.twiddle3im, x3m14); + let t_b1_4 = vmulq_f32(self.twiddle4im, x4m13); + let t_b1_5 = vmulq_f32(self.twiddle5im, x5m12); + let t_b1_6 = vmulq_f32(self.twiddle6im, x6m11); + let t_b1_7 = vmulq_f32(self.twiddle7im, x7m10); + let t_b1_8 = vmulq_f32(self.twiddle8im, x8m9); + let t_b2_1 = vmulq_f32(self.twiddle2im, x1m16); + let t_b2_2 = vmulq_f32(self.twiddle4im, x2m15); + let t_b2_3 = vmulq_f32(self.twiddle6im, x3m14); + let t_b2_4 = vmulq_f32(self.twiddle8im, x4m13); + let t_b2_5 = vmulq_f32(self.twiddle7im, x5m12); + let t_b2_6 = vmulq_f32(self.twiddle5im, x6m11); + let t_b2_7 = vmulq_f32(self.twiddle3im, x7m10); + let t_b2_8 = vmulq_f32(self.twiddle1im, x8m9); + let t_b3_1 = vmulq_f32(self.twiddle3im, x1m16); + let t_b3_2 = vmulq_f32(self.twiddle6im, x2m15); + let t_b3_3 = vmulq_f32(self.twiddle8im, x3m14); + let t_b3_4 = vmulq_f32(self.twiddle5im, x4m13); + let t_b3_5 = vmulq_f32(self.twiddle2im, x5m12); + let t_b3_6 = vmulq_f32(self.twiddle1im, x6m11); + let t_b3_7 = vmulq_f32(self.twiddle4im, x7m10); + let t_b3_8 = vmulq_f32(self.twiddle7im, x8m9); + let t_b4_1 = vmulq_f32(self.twiddle4im, x1m16); + let t_b4_2 = vmulq_f32(self.twiddle8im, x2m15); + let t_b4_3 = vmulq_f32(self.twiddle5im, x3m14); + let t_b4_4 = vmulq_f32(self.twiddle1im, x4m13); + let t_b4_5 = vmulq_f32(self.twiddle3im, x5m12); + let t_b4_6 = vmulq_f32(self.twiddle7im, x6m11); + let t_b4_7 = vmulq_f32(self.twiddle6im, x7m10); + let t_b4_8 = vmulq_f32(self.twiddle2im, x8m9); + let t_b5_1 = vmulq_f32(self.twiddle5im, x1m16); + let t_b5_2 = vmulq_f32(self.twiddle7im, x2m15); + let t_b5_3 = vmulq_f32(self.twiddle2im, x3m14); + let t_b5_4 = vmulq_f32(self.twiddle3im, x4m13); + let t_b5_5 = vmulq_f32(self.twiddle8im, x5m12); + let t_b5_6 = vmulq_f32(self.twiddle4im, x6m11); + let t_b5_7 = vmulq_f32(self.twiddle1im, x7m10); + let t_b5_8 = vmulq_f32(self.twiddle6im, x8m9); + let t_b6_1 = 
vmulq_f32(self.twiddle6im, x1m16); + let t_b6_2 = vmulq_f32(self.twiddle5im, x2m15); + let t_b6_3 = vmulq_f32(self.twiddle1im, x3m14); + let t_b6_4 = vmulq_f32(self.twiddle7im, x4m13); + let t_b6_5 = vmulq_f32(self.twiddle4im, x5m12); + let t_b6_6 = vmulq_f32(self.twiddle2im, x6m11); + let t_b6_7 = vmulq_f32(self.twiddle8im, x7m10); + let t_b6_8 = vmulq_f32(self.twiddle3im, x8m9); + let t_b7_1 = vmulq_f32(self.twiddle7im, x1m16); + let t_b7_2 = vmulq_f32(self.twiddle3im, x2m15); + let t_b7_3 = vmulq_f32(self.twiddle4im, x3m14); + let t_b7_4 = vmulq_f32(self.twiddle6im, x4m13); + let t_b7_5 = vmulq_f32(self.twiddle1im, x5m12); + let t_b7_6 = vmulq_f32(self.twiddle8im, x6m11); + let t_b7_7 = vmulq_f32(self.twiddle2im, x7m10); + let t_b7_8 = vmulq_f32(self.twiddle5im, x8m9); + let t_b8_1 = vmulq_f32(self.twiddle8im, x1m16); + let t_b8_2 = vmulq_f32(self.twiddle1im, x2m15); + let t_b8_3 = vmulq_f32(self.twiddle7im, x3m14); + let t_b8_4 = vmulq_f32(self.twiddle2im, x4m13); + let t_b8_5 = vmulq_f32(self.twiddle6im, x5m12); + let t_b8_6 = vmulq_f32(self.twiddle3im, x6m11); + let t_b8_7 = vmulq_f32(self.twiddle5im, x7m10); + let t_b8_8 = vmulq_f32(self.twiddle4im, x8m9); + + let x0 = values[0]; + let t_a1 = calc_f32!(x0 + t_a1_1 + t_a1_2 + t_a1_3 + t_a1_4 + t_a1_5 + t_a1_6 + t_a1_7 + t_a1_8); + let t_a2 = calc_f32!(x0 + t_a2_1 + t_a2_2 + t_a2_3 + t_a2_4 + t_a2_5 + t_a2_6 + t_a2_7 + t_a2_8); + let t_a3 = calc_f32!(x0 + t_a3_1 + t_a3_2 + t_a3_3 + t_a3_4 + t_a3_5 + t_a3_6 + t_a3_7 + t_a3_8); + let t_a4 = calc_f32!(x0 + t_a4_1 + t_a4_2 + t_a4_3 + t_a4_4 + t_a4_5 + t_a4_6 + t_a4_7 + t_a4_8); + let t_a5 = calc_f32!(x0 + t_a5_1 + t_a5_2 + t_a5_3 + t_a5_4 + t_a5_5 + t_a5_6 + t_a5_7 + t_a5_8); + let t_a6 = calc_f32!(x0 + t_a6_1 + t_a6_2 + t_a6_3 + t_a6_4 + t_a6_5 + t_a6_6 + t_a6_7 + t_a6_8); + let t_a7 = calc_f32!(x0 + t_a7_1 + t_a7_2 + t_a7_3 + t_a7_4 + t_a7_5 + t_a7_6 + t_a7_7 + t_a7_8); + let t_a8 = calc_f32!(x0 + t_a8_1 + t_a8_2 + t_a8_3 + t_a8_4 + t_a8_5 + t_a8_6 + t_a8_7 + t_a8_8); + + let t_b1 = calc_f32!(t_b1_1 + t_b1_2 + t_b1_3 + t_b1_4 + t_b1_5 + t_b1_6 + t_b1_7 + t_b1_8); + let t_b2 = calc_f32!(t_b2_1 + t_b2_2 + t_b2_3 + t_b2_4 - t_b2_5 - t_b2_6 - t_b2_7 - t_b2_8); + let t_b3 = calc_f32!(t_b3_1 + t_b3_2 - t_b3_3 - t_b3_4 - t_b3_5 + t_b3_6 + t_b3_7 + t_b3_8); + let t_b4 = calc_f32!(t_b4_1 + t_b4_2 - t_b4_3 - t_b4_4 + t_b4_5 + t_b4_6 - t_b4_7 - t_b4_8); + let t_b5 = calc_f32!(t_b5_1 - t_b5_2 - t_b5_3 + t_b5_4 + t_b5_5 - t_b5_6 + t_b5_7 + t_b5_8); + let t_b6 = calc_f32!(t_b6_1 - t_b6_2 + t_b6_3 + t_b6_4 - t_b6_5 + t_b6_6 + t_b6_7 - t_b6_8); + let t_b7 = calc_f32!(t_b7_1 - t_b7_2 + t_b7_3 - t_b7_4 + t_b7_5 + t_b7_6 - t_b7_7 + t_b7_8); + let t_b8 = calc_f32!(t_b8_1 - t_b8_2 + t_b8_3 - t_b8_4 + t_b8_5 - t_b8_6 + t_b8_7 - t_b8_8); + + let t_b1_rot = self.rotate.rotate_both(t_b1); + let t_b2_rot = self.rotate.rotate_both(t_b2); + let t_b3_rot = self.rotate.rotate_both(t_b3); + let t_b4_rot = self.rotate.rotate_both(t_b4); + let t_b5_rot = self.rotate.rotate_both(t_b5); + let t_b6_rot = self.rotate.rotate_both(t_b6); + let t_b7_rot = self.rotate.rotate_both(t_b7); + let t_b8_rot = self.rotate.rotate_both(t_b8); + + let y0 = calc_f32!(x0 + x1p16 + x2p15 + x3p14 + x4p13 + x5p12 + x6p11 + x7p10 + x8p9); + let [y1, y16] = parallel_fft2_interleaved_f32(t_a1, t_b1_rot); + let [y2, y15] = parallel_fft2_interleaved_f32(t_a2, t_b2_rot); + let [y3, y14] = parallel_fft2_interleaved_f32(t_a3, t_b3_rot); + let [y4, y13] = parallel_fft2_interleaved_f32(t_a4, t_b4_rot); + let [y5, y12] = 
parallel_fft2_interleaved_f32(t_a5, t_b5_rot); + let [y6, y11] = parallel_fft2_interleaved_f32(t_a6, t_b6_rot); + let [y7, y10] = parallel_fft2_interleaved_f32(t_a7, t_b7_rot); + let [y8, y9] = parallel_fft2_interleaved_f32(t_a8, t_b8_rot); + [y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15, y16] + } +} + +// _ _____ __ _ _ _ _ _ +// / |___ | / /_ | || | | |__ (_) |_ +// | | / / _____ | '_ \| || |_| '_ \| | __| +// | | / / |_____| | (_) |__ _| |_) | | |_ +// |_|/_/ \___/ |_| |_.__/|_|\__| +// + +pub struct NeonF64Butterfly17 { + direction: FftDirection, + _phantom: std::marker::PhantomData, + rotate: Rotate90F64, + twiddle1re: float64x2_t, + twiddle1im: float64x2_t, + twiddle2re: float64x2_t, + twiddle2im: float64x2_t, + twiddle3re: float64x2_t, + twiddle3im: float64x2_t, + twiddle4re: float64x2_t, + twiddle4im: float64x2_t, + twiddle5re: float64x2_t, + twiddle5im: float64x2_t, + twiddle6re: float64x2_t, + twiddle6im: float64x2_t, + twiddle7re: float64x2_t, + twiddle7im: float64x2_t, + twiddle8re: float64x2_t, + twiddle8im: float64x2_t, +} + +boilerplate_fft_neon_f64_butterfly!(NeonF64Butterfly17, 17, |this: &NeonF64Butterfly17<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF64Butterfly17, 17, |this: &NeonF64Butterfly17<_>| this + .direction); +impl NeonF64Butterfly17 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f64::(); + let rotate = Rotate90F64::new(true); + let tw1: Complex = twiddles::compute_twiddle(1, 17, direction); + let tw2: Complex = twiddles::compute_twiddle(2, 17, direction); + let tw3: Complex = twiddles::compute_twiddle(3, 17, direction); + let tw4: Complex = twiddles::compute_twiddle(4, 17, direction); + let tw5: Complex = twiddles::compute_twiddle(5, 17, direction); + let tw6: Complex = twiddles::compute_twiddle(6, 17, direction); + let tw7: Complex = twiddles::compute_twiddle(7, 17, direction); + let tw8: Complex = twiddles::compute_twiddle(8, 17, direction); + let twiddle1re = unsafe { vmovq_n_f64(tw1.re) }; + let twiddle1im = unsafe { vmovq_n_f64(tw1.im) }; + let twiddle2re = unsafe { vmovq_n_f64(tw2.re) }; + let twiddle2im = unsafe { vmovq_n_f64(tw2.im) }; + let twiddle3re = unsafe { vmovq_n_f64(tw3.re) }; + let twiddle3im = unsafe { vmovq_n_f64(tw3.im) }; + let twiddle4re = unsafe { vmovq_n_f64(tw4.re) }; + let twiddle4im = unsafe { vmovq_n_f64(tw4.im) }; + let twiddle5re = unsafe { vmovq_n_f64(tw5.re) }; + let twiddle5im = unsafe { vmovq_n_f64(tw5.im) }; + let twiddle6re = unsafe { vmovq_n_f64(tw6.re) }; + let twiddle6im = unsafe { vmovq_n_f64(tw6.im) }; + let twiddle7re = unsafe { vmovq_n_f64(tw7.re) }; + let twiddle7im = unsafe { vmovq_n_f64(tw7.im) }; + let twiddle8re = unsafe { vmovq_n_f64(tw8.re) }; + let twiddle8im = unsafe { vmovq_n_f64(tw8.im) }; + + Self { + direction, + _phantom: std::marker::PhantomData, + rotate, + twiddle1re, + twiddle1im, + twiddle2re, + twiddle2im, + twiddle3re, + twiddle3im, + twiddle4re, + twiddle4im, + twiddle5re, + twiddle5im, + twiddle6re, + twiddle6im, + twiddle7re, + twiddle7im, + twiddle8re, + twiddle8im, + } + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let values = read_complex_to_array!(input, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); + + let out = self.perform_fft_direct(values); + + write_complex_to_array!(out, output, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); + } + + #[inline(always)] + pub(crate) unsafe fn 
perform_fft_direct(&self, values: [float64x2_t; 17]) -> [float64x2_t; 17] { + let [x1p16, x1m16] = solo_fft2_f64(values[1], values[16]); + let [x2p15, x2m15] = solo_fft2_f64(values[2], values[15]); + let [x3p14, x3m14] = solo_fft2_f64(values[3], values[14]); + let [x4p13, x4m13] = solo_fft2_f64(values[4], values[13]); + let [x5p12, x5m12] = solo_fft2_f64(values[5], values[12]); + let [x6p11, x6m11] = solo_fft2_f64(values[6], values[11]); + let [x7p10, x7m10] = solo_fft2_f64(values[7], values[10]); + let [x8p9, x8m9] = solo_fft2_f64(values[8], values[9]); + + let t_a1_1 = vmulq_f64(self.twiddle1re, x1p16); + let t_a1_2 = vmulq_f64(self.twiddle2re, x2p15); + let t_a1_3 = vmulq_f64(self.twiddle3re, x3p14); + let t_a1_4 = vmulq_f64(self.twiddle4re, x4p13); + let t_a1_5 = vmulq_f64(self.twiddle5re, x5p12); + let t_a1_6 = vmulq_f64(self.twiddle6re, x6p11); + let t_a1_7 = vmulq_f64(self.twiddle7re, x7p10); + let t_a1_8 = vmulq_f64(self.twiddle8re, x8p9); + let t_a2_1 = vmulq_f64(self.twiddle2re, x1p16); + let t_a2_2 = vmulq_f64(self.twiddle4re, x2p15); + let t_a2_3 = vmulq_f64(self.twiddle6re, x3p14); + let t_a2_4 = vmulq_f64(self.twiddle8re, x4p13); + let t_a2_5 = vmulq_f64(self.twiddle7re, x5p12); + let t_a2_6 = vmulq_f64(self.twiddle5re, x6p11); + let t_a2_7 = vmulq_f64(self.twiddle3re, x7p10); + let t_a2_8 = vmulq_f64(self.twiddle1re, x8p9); + let t_a3_1 = vmulq_f64(self.twiddle3re, x1p16); + let t_a3_2 = vmulq_f64(self.twiddle6re, x2p15); + let t_a3_3 = vmulq_f64(self.twiddle8re, x3p14); + let t_a3_4 = vmulq_f64(self.twiddle5re, x4p13); + let t_a3_5 = vmulq_f64(self.twiddle2re, x5p12); + let t_a3_6 = vmulq_f64(self.twiddle1re, x6p11); + let t_a3_7 = vmulq_f64(self.twiddle4re, x7p10); + let t_a3_8 = vmulq_f64(self.twiddle7re, x8p9); + let t_a4_1 = vmulq_f64(self.twiddle4re, x1p16); + let t_a4_2 = vmulq_f64(self.twiddle8re, x2p15); + let t_a4_3 = vmulq_f64(self.twiddle5re, x3p14); + let t_a4_4 = vmulq_f64(self.twiddle1re, x4p13); + let t_a4_5 = vmulq_f64(self.twiddle3re, x5p12); + let t_a4_6 = vmulq_f64(self.twiddle7re, x6p11); + let t_a4_7 = vmulq_f64(self.twiddle6re, x7p10); + let t_a4_8 = vmulq_f64(self.twiddle2re, x8p9); + let t_a5_1 = vmulq_f64(self.twiddle5re, x1p16); + let t_a5_2 = vmulq_f64(self.twiddle7re, x2p15); + let t_a5_3 = vmulq_f64(self.twiddle2re, x3p14); + let t_a5_4 = vmulq_f64(self.twiddle3re, x4p13); + let t_a5_5 = vmulq_f64(self.twiddle8re, x5p12); + let t_a5_6 = vmulq_f64(self.twiddle4re, x6p11); + let t_a5_7 = vmulq_f64(self.twiddle1re, x7p10); + let t_a5_8 = vmulq_f64(self.twiddle6re, x8p9); + let t_a6_1 = vmulq_f64(self.twiddle6re, x1p16); + let t_a6_2 = vmulq_f64(self.twiddle5re, x2p15); + let t_a6_3 = vmulq_f64(self.twiddle1re, x3p14); + let t_a6_4 = vmulq_f64(self.twiddle7re, x4p13); + let t_a6_5 = vmulq_f64(self.twiddle4re, x5p12); + let t_a6_6 = vmulq_f64(self.twiddle2re, x6p11); + let t_a6_7 = vmulq_f64(self.twiddle8re, x7p10); + let t_a6_8 = vmulq_f64(self.twiddle3re, x8p9); + let t_a7_1 = vmulq_f64(self.twiddle7re, x1p16); + let t_a7_2 = vmulq_f64(self.twiddle3re, x2p15); + let t_a7_3 = vmulq_f64(self.twiddle4re, x3p14); + let t_a7_4 = vmulq_f64(self.twiddle6re, x4p13); + let t_a7_5 = vmulq_f64(self.twiddle1re, x5p12); + let t_a7_6 = vmulq_f64(self.twiddle8re, x6p11); + let t_a7_7 = vmulq_f64(self.twiddle2re, x7p10); + let t_a7_8 = vmulq_f64(self.twiddle5re, x8p9); + let t_a8_1 = vmulq_f64(self.twiddle8re, x1p16); + let t_a8_2 = vmulq_f64(self.twiddle1re, x2p15); + let t_a8_3 = vmulq_f64(self.twiddle7re, x3p14); + let t_a8_4 = vmulq_f64(self.twiddle2re, 
x4p13); + let t_a8_5 = vmulq_f64(self.twiddle6re, x5p12); + let t_a8_6 = vmulq_f64(self.twiddle3re, x6p11); + let t_a8_7 = vmulq_f64(self.twiddle5re, x7p10); + let t_a8_8 = vmulq_f64(self.twiddle4re, x8p9); + + let t_b1_1 = vmulq_f64(self.twiddle1im, x1m16); + let t_b1_2 = vmulq_f64(self.twiddle2im, x2m15); + let t_b1_3 = vmulq_f64(self.twiddle3im, x3m14); + let t_b1_4 = vmulq_f64(self.twiddle4im, x4m13); + let t_b1_5 = vmulq_f64(self.twiddle5im, x5m12); + let t_b1_6 = vmulq_f64(self.twiddle6im, x6m11); + let t_b1_7 = vmulq_f64(self.twiddle7im, x7m10); + let t_b1_8 = vmulq_f64(self.twiddle8im, x8m9); + let t_b2_1 = vmulq_f64(self.twiddle2im, x1m16); + let t_b2_2 = vmulq_f64(self.twiddle4im, x2m15); + let t_b2_3 = vmulq_f64(self.twiddle6im, x3m14); + let t_b2_4 = vmulq_f64(self.twiddle8im, x4m13); + let t_b2_5 = vmulq_f64(self.twiddle7im, x5m12); + let t_b2_6 = vmulq_f64(self.twiddle5im, x6m11); + let t_b2_7 = vmulq_f64(self.twiddle3im, x7m10); + let t_b2_8 = vmulq_f64(self.twiddle1im, x8m9); + let t_b3_1 = vmulq_f64(self.twiddle3im, x1m16); + let t_b3_2 = vmulq_f64(self.twiddle6im, x2m15); + let t_b3_3 = vmulq_f64(self.twiddle8im, x3m14); + let t_b3_4 = vmulq_f64(self.twiddle5im, x4m13); + let t_b3_5 = vmulq_f64(self.twiddle2im, x5m12); + let t_b3_6 = vmulq_f64(self.twiddle1im, x6m11); + let t_b3_7 = vmulq_f64(self.twiddle4im, x7m10); + let t_b3_8 = vmulq_f64(self.twiddle7im, x8m9); + let t_b4_1 = vmulq_f64(self.twiddle4im, x1m16); + let t_b4_2 = vmulq_f64(self.twiddle8im, x2m15); + let t_b4_3 = vmulq_f64(self.twiddle5im, x3m14); + let t_b4_4 = vmulq_f64(self.twiddle1im, x4m13); + let t_b4_5 = vmulq_f64(self.twiddle3im, x5m12); + let t_b4_6 = vmulq_f64(self.twiddle7im, x6m11); + let t_b4_7 = vmulq_f64(self.twiddle6im, x7m10); + let t_b4_8 = vmulq_f64(self.twiddle2im, x8m9); + let t_b5_1 = vmulq_f64(self.twiddle5im, x1m16); + let t_b5_2 = vmulq_f64(self.twiddle7im, x2m15); + let t_b5_3 = vmulq_f64(self.twiddle2im, x3m14); + let t_b5_4 = vmulq_f64(self.twiddle3im, x4m13); + let t_b5_5 = vmulq_f64(self.twiddle8im, x5m12); + let t_b5_6 = vmulq_f64(self.twiddle4im, x6m11); + let t_b5_7 = vmulq_f64(self.twiddle1im, x7m10); + let t_b5_8 = vmulq_f64(self.twiddle6im, x8m9); + let t_b6_1 = vmulq_f64(self.twiddle6im, x1m16); + let t_b6_2 = vmulq_f64(self.twiddle5im, x2m15); + let t_b6_3 = vmulq_f64(self.twiddle1im, x3m14); + let t_b6_4 = vmulq_f64(self.twiddle7im, x4m13); + let t_b6_5 = vmulq_f64(self.twiddle4im, x5m12); + let t_b6_6 = vmulq_f64(self.twiddle2im, x6m11); + let t_b6_7 = vmulq_f64(self.twiddle8im, x7m10); + let t_b6_8 = vmulq_f64(self.twiddle3im, x8m9); + let t_b7_1 = vmulq_f64(self.twiddle7im, x1m16); + let t_b7_2 = vmulq_f64(self.twiddle3im, x2m15); + let t_b7_3 = vmulq_f64(self.twiddle4im, x3m14); + let t_b7_4 = vmulq_f64(self.twiddle6im, x4m13); + let t_b7_5 = vmulq_f64(self.twiddle1im, x5m12); + let t_b7_6 = vmulq_f64(self.twiddle8im, x6m11); + let t_b7_7 = vmulq_f64(self.twiddle2im, x7m10); + let t_b7_8 = vmulq_f64(self.twiddle5im, x8m9); + let t_b8_1 = vmulq_f64(self.twiddle8im, x1m16); + let t_b8_2 = vmulq_f64(self.twiddle1im, x2m15); + let t_b8_3 = vmulq_f64(self.twiddle7im, x3m14); + let t_b8_4 = vmulq_f64(self.twiddle2im, x4m13); + let t_b8_5 = vmulq_f64(self.twiddle6im, x5m12); + let t_b8_6 = vmulq_f64(self.twiddle3im, x6m11); + let t_b8_7 = vmulq_f64(self.twiddle5im, x7m10); + let t_b8_8 = vmulq_f64(self.twiddle4im, x8m9); + + let x0 = values[0]; + let t_a1 = calc_f64!(x0 + t_a1_1 + t_a1_2 + t_a1_3 + t_a1_4 + t_a1_5 + t_a1_6 + t_a1_7 + t_a1_8); + let t_a2 = calc_f64!(x0 + 
t_a2_1 + t_a2_2 + t_a2_3 + t_a2_4 + t_a2_5 + t_a2_6 + t_a2_7 + t_a2_8); + let t_a3 = calc_f64!(x0 + t_a3_1 + t_a3_2 + t_a3_3 + t_a3_4 + t_a3_5 + t_a3_6 + t_a3_7 + t_a3_8); + let t_a4 = calc_f64!(x0 + t_a4_1 + t_a4_2 + t_a4_3 + t_a4_4 + t_a4_5 + t_a4_6 + t_a4_7 + t_a4_8); + let t_a5 = calc_f64!(x0 + t_a5_1 + t_a5_2 + t_a5_3 + t_a5_4 + t_a5_5 + t_a5_6 + t_a5_7 + t_a5_8); + let t_a6 = calc_f64!(x0 + t_a6_1 + t_a6_2 + t_a6_3 + t_a6_4 + t_a6_5 + t_a6_6 + t_a6_7 + t_a6_8); + let t_a7 = calc_f64!(x0 + t_a7_1 + t_a7_2 + t_a7_3 + t_a7_4 + t_a7_5 + t_a7_6 + t_a7_7 + t_a7_8); + let t_a8 = calc_f64!(x0 + t_a8_1 + t_a8_2 + t_a8_3 + t_a8_4 + t_a8_5 + t_a8_6 + t_a8_7 + t_a8_8); + + let t_b1 = calc_f64!(t_b1_1 + t_b1_2 + t_b1_3 + t_b1_4 + t_b1_5 + t_b1_6 + t_b1_7 + t_b1_8); + let t_b2 = calc_f64!(t_b2_1 + t_b2_2 + t_b2_3 + t_b2_4 - t_b2_5 - t_b2_6 - t_b2_7 - t_b2_8); + let t_b3 = calc_f64!(t_b3_1 + t_b3_2 - t_b3_3 - t_b3_4 - t_b3_5 + t_b3_6 + t_b3_7 + t_b3_8); + let t_b4 = calc_f64!(t_b4_1 + t_b4_2 - t_b4_3 - t_b4_4 + t_b4_5 + t_b4_6 - t_b4_7 - t_b4_8); + let t_b5 = calc_f64!(t_b5_1 - t_b5_2 - t_b5_3 + t_b5_4 + t_b5_5 - t_b5_6 + t_b5_7 + t_b5_8); + let t_b6 = calc_f64!(t_b6_1 - t_b6_2 + t_b6_3 + t_b6_4 - t_b6_5 + t_b6_6 + t_b6_7 - t_b6_8); + let t_b7 = calc_f64!(t_b7_1 - t_b7_2 + t_b7_3 - t_b7_4 + t_b7_5 + t_b7_6 - t_b7_7 + t_b7_8); + let t_b8 = calc_f64!(t_b8_1 - t_b8_2 + t_b8_3 - t_b8_4 + t_b8_5 - t_b8_6 + t_b8_7 - t_b8_8); + + let t_b1_rot = self.rotate.rotate(t_b1); + let t_b2_rot = self.rotate.rotate(t_b2); + let t_b3_rot = self.rotate.rotate(t_b3); + let t_b4_rot = self.rotate.rotate(t_b4); + let t_b5_rot = self.rotate.rotate(t_b5); + let t_b6_rot = self.rotate.rotate(t_b6); + let t_b7_rot = self.rotate.rotate(t_b7); + let t_b8_rot = self.rotate.rotate(t_b8); + + let y0 = calc_f64!(x0 + x1p16 + x2p15 + x3p14 + x4p13 + x5p12 + x6p11 + x7p10 + x8p9); + let [y1, y16] = solo_fft2_f64(t_a1, t_b1_rot); + let [y2, y15] = solo_fft2_f64(t_a2, t_b2_rot); + let [y3, y14] = solo_fft2_f64(t_a3, t_b3_rot); + let [y4, y13] = solo_fft2_f64(t_a4, t_b4_rot); + let [y5, y12] = solo_fft2_f64(t_a5, t_b5_rot); + let [y6, y11] = solo_fft2_f64(t_a6, t_b6_rot); + let [y7, y10] = solo_fft2_f64(t_a7, t_b7_rot); + let [y8, y9] = solo_fft2_f64(t_a8, t_b8_rot); + [y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15, y16] + } +} + +// _ ___ _________ _ _ _ +// / |/ _ \ |___ /___ \| |__ (_) |_ +// | | (_) | _____ |_ \ __) | '_ \| | __| +// | |\__, | |_____| ___) / __/| |_) | | |_ +// |_| /_/ |____/_____|_.__/|_|\__| +// +pub struct NeonF32Butterfly19 { + direction: FftDirection, + _phantom: std::marker::PhantomData, + rotate: Rotate90F32, + twiddle1re: float32x4_t, + twiddle1im: float32x4_t, + twiddle2re: float32x4_t, + twiddle2im: float32x4_t, + twiddle3re: float32x4_t, + twiddle3im: float32x4_t, + twiddle4re: float32x4_t, + twiddle4im: float32x4_t, + twiddle5re: float32x4_t, + twiddle5im: float32x4_t, + twiddle6re: float32x4_t, + twiddle6im: float32x4_t, + twiddle7re: float32x4_t, + twiddle7im: float32x4_t, + twiddle8re: float32x4_t, + twiddle8im: float32x4_t, + twiddle9re: float32x4_t, + twiddle9im: float32x4_t, +} + +boilerplate_fft_neon_f32_butterfly!(NeonF32Butterfly19, 19, |this: &NeonF32Butterfly19<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF32Butterfly19, 19, |this: &NeonF32Butterfly19<_>| this + .direction); +impl NeonF32Butterfly19 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f32::(); + let rotate = Rotate90F32::new(true); + let tw1: Complex = 
twiddles::compute_twiddle(1, 19, direction); + let tw2: Complex = twiddles::compute_twiddle(2, 19, direction); + let tw3: Complex = twiddles::compute_twiddle(3, 19, direction); + let tw4: Complex = twiddles::compute_twiddle(4, 19, direction); + let tw5: Complex = twiddles::compute_twiddle(5, 19, direction); + let tw6: Complex = twiddles::compute_twiddle(6, 19, direction); + let tw7: Complex = twiddles::compute_twiddle(7, 19, direction); + let tw8: Complex = twiddles::compute_twiddle(8, 19, direction); + let tw9: Complex = twiddles::compute_twiddle(9, 19, direction); + let twiddle1re = unsafe { vmovq_n_f32(tw1.re) }; + let twiddle1im = unsafe { vmovq_n_f32(tw1.im) }; + let twiddle2re = unsafe { vmovq_n_f32(tw2.re) }; + let twiddle2im = unsafe { vmovq_n_f32(tw2.im) }; + let twiddle3re = unsafe { vmovq_n_f32(tw3.re) }; + let twiddle3im = unsafe { vmovq_n_f32(tw3.im) }; + let twiddle4re = unsafe { vmovq_n_f32(tw4.re) }; + let twiddle4im = unsafe { vmovq_n_f32(tw4.im) }; + let twiddle5re = unsafe { vmovq_n_f32(tw5.re) }; + let twiddle5im = unsafe { vmovq_n_f32(tw5.im) }; + let twiddle6re = unsafe { vmovq_n_f32(tw6.re) }; + let twiddle6im = unsafe { vmovq_n_f32(tw6.im) }; + let twiddle7re = unsafe { vmovq_n_f32(tw7.re) }; + let twiddle7im = unsafe { vmovq_n_f32(tw7.im) }; + let twiddle8re = unsafe { vmovq_n_f32(tw8.re) }; + let twiddle8im = unsafe { vmovq_n_f32(tw8.im) }; + let twiddle9re = unsafe { vmovq_n_f32(tw9.re) }; + let twiddle9im = unsafe { vmovq_n_f32(tw9.im) }; + + Self { + direction, + _phantom: std::marker::PhantomData, + rotate, + twiddle1re, + twiddle1im, + twiddle2re, + twiddle2im, + twiddle3re, + twiddle3im, + twiddle4re, + twiddle4im, + twiddle5re, + twiddle5im, + twiddle6re, + twiddle6im, + twiddle7re, + twiddle7im, + twiddle8re, + twiddle8im, + twiddle9re, + twiddle9im, + } + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let values = read_partial1_complex_to_array!(input, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}); + + let out = self.perform_parallel_fft_direct(values); + + write_partial_lo_complex_to_array!(out, output, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let input_packed = read_complex_to_array!(input, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36}); + + let values = [ + extract_lo_hi_f32(input_packed[0], input_packed[9]), + extract_hi_lo_f32(input_packed[0], input_packed[10]), + extract_lo_hi_f32(input_packed[1], input_packed[10]), + extract_hi_lo_f32(input_packed[1], input_packed[11]), + extract_lo_hi_f32(input_packed[2], input_packed[11]), + extract_hi_lo_f32(input_packed[2], input_packed[12]), + extract_lo_hi_f32(input_packed[3], input_packed[12]), + extract_hi_lo_f32(input_packed[3], input_packed[13]), + extract_lo_hi_f32(input_packed[4], input_packed[13]), + extract_hi_lo_f32(input_packed[4], input_packed[14]), + extract_lo_hi_f32(input_packed[5], input_packed[14]), + extract_hi_lo_f32(input_packed[5], input_packed[15]), + extract_lo_hi_f32(input_packed[6], input_packed[15]), + extract_hi_lo_f32(input_packed[6], input_packed[16]), + extract_lo_hi_f32(input_packed[7], input_packed[16]), + extract_hi_lo_f32(input_packed[7], input_packed[17]), + extract_lo_hi_f32(input_packed[8], input_packed[17]), + extract_hi_lo_f32(input_packed[8], input_packed[18]), + 
extract_lo_hi_f32(input_packed[9], input_packed[18]), + ]; + + let out = self.perform_parallel_fft_direct(values); + + let out_packed = [ + extract_lo_lo_f32(out[0], out[1]), + extract_lo_lo_f32(out[2], out[3]), + extract_lo_lo_f32(out[4], out[5]), + extract_lo_lo_f32(out[6], out[7]), + extract_lo_lo_f32(out[8], out[9]), + extract_lo_lo_f32(out[10], out[11]), + extract_lo_lo_f32(out[12], out[13]), + extract_lo_lo_f32(out[14], out[15]), + extract_lo_lo_f32(out[16], out[17]), + extract_lo_hi_f32(out[18], out[0]), + extract_hi_hi_f32(out[1], out[2]), + extract_hi_hi_f32(out[3], out[4]), + extract_hi_hi_f32(out[5], out[6]), + extract_hi_hi_f32(out[7], out[8]), + extract_hi_hi_f32(out[9], out[10]), + extract_hi_hi_f32(out[11], out[12]), + extract_hi_hi_f32(out[13], out[14]), + extract_hi_hi_f32(out[15], out[16]), + extract_hi_hi_f32(out[17], out[18]), + ]; + + write_complex_to_array_strided!(out_packed, output, 2, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [float32x4_t; 19]) -> [float32x4_t; 19] { + let [x1p18, x1m18] = parallel_fft2_interleaved_f32(values[1], values[18]); + let [x2p17, x2m17] = parallel_fft2_interleaved_f32(values[2], values[17]); + let [x3p16, x3m16] = parallel_fft2_interleaved_f32(values[3], values[16]); + let [x4p15, x4m15] = parallel_fft2_interleaved_f32(values[4], values[15]); + let [x5p14, x5m14] = parallel_fft2_interleaved_f32(values[5], values[14]); + let [x6p13, x6m13] = parallel_fft2_interleaved_f32(values[6], values[13]); + let [x7p12, x7m12] = parallel_fft2_interleaved_f32(values[7], values[12]); + let [x8p11, x8m11] = parallel_fft2_interleaved_f32(values[8], values[11]); + let [x9p10, x9m10] = parallel_fft2_interleaved_f32(values[9], values[10]); + + let t_a1_1 = vmulq_f32(self.twiddle1re, x1p18); + let t_a1_2 = vmulq_f32(self.twiddle2re, x2p17); + let t_a1_3 = vmulq_f32(self.twiddle3re, x3p16); + let t_a1_4 = vmulq_f32(self.twiddle4re, x4p15); + let t_a1_5 = vmulq_f32(self.twiddle5re, x5p14); + let t_a1_6 = vmulq_f32(self.twiddle6re, x6p13); + let t_a1_7 = vmulq_f32(self.twiddle7re, x7p12); + let t_a1_8 = vmulq_f32(self.twiddle8re, x8p11); + let t_a1_9 = vmulq_f32(self.twiddle9re, x9p10); + let t_a2_1 = vmulq_f32(self.twiddle2re, x1p18); + let t_a2_2 = vmulq_f32(self.twiddle4re, x2p17); + let t_a2_3 = vmulq_f32(self.twiddle6re, x3p16); + let t_a2_4 = vmulq_f32(self.twiddle8re, x4p15); + let t_a2_5 = vmulq_f32(self.twiddle9re, x5p14); + let t_a2_6 = vmulq_f32(self.twiddle7re, x6p13); + let t_a2_7 = vmulq_f32(self.twiddle5re, x7p12); + let t_a2_8 = vmulq_f32(self.twiddle3re, x8p11); + let t_a2_9 = vmulq_f32(self.twiddle1re, x9p10); + let t_a3_1 = vmulq_f32(self.twiddle3re, x1p18); + let t_a3_2 = vmulq_f32(self.twiddle6re, x2p17); + let t_a3_3 = vmulq_f32(self.twiddle9re, x3p16); + let t_a3_4 = vmulq_f32(self.twiddle7re, x4p15); + let t_a3_5 = vmulq_f32(self.twiddle4re, x5p14); + let t_a3_6 = vmulq_f32(self.twiddle1re, x6p13); + let t_a3_7 = vmulq_f32(self.twiddle2re, x7p12); + let t_a3_8 = vmulq_f32(self.twiddle5re, x8p11); + let t_a3_9 = vmulq_f32(self.twiddle8re, x9p10); + let t_a4_1 = vmulq_f32(self.twiddle4re, x1p18); + let t_a4_2 = vmulq_f32(self.twiddle8re, x2p17); + let t_a4_3 = vmulq_f32(self.twiddle7re, x3p16); + let t_a4_4 = vmulq_f32(self.twiddle3re, x4p15); + let t_a4_5 = vmulq_f32(self.twiddle1re, x5p14); + let t_a4_6 = vmulq_f32(self.twiddle5re, x6p13); + let t_a4_7 = vmulq_f32(self.twiddle9re, x7p12); + let t_a4_8 = 
vmulq_f32(self.twiddle6re, x8p11); + let t_a4_9 = vmulq_f32(self.twiddle2re, x9p10); + let t_a5_1 = vmulq_f32(self.twiddle5re, x1p18); + let t_a5_2 = vmulq_f32(self.twiddle9re, x2p17); + let t_a5_3 = vmulq_f32(self.twiddle4re, x3p16); + let t_a5_4 = vmulq_f32(self.twiddle1re, x4p15); + let t_a5_5 = vmulq_f32(self.twiddle6re, x5p14); + let t_a5_6 = vmulq_f32(self.twiddle8re, x6p13); + let t_a5_7 = vmulq_f32(self.twiddle3re, x7p12); + let t_a5_8 = vmulq_f32(self.twiddle2re, x8p11); + let t_a5_9 = vmulq_f32(self.twiddle7re, x9p10); + let t_a6_1 = vmulq_f32(self.twiddle6re, x1p18); + let t_a6_2 = vmulq_f32(self.twiddle7re, x2p17); + let t_a6_3 = vmulq_f32(self.twiddle1re, x3p16); + let t_a6_4 = vmulq_f32(self.twiddle5re, x4p15); + let t_a6_5 = vmulq_f32(self.twiddle8re, x5p14); + let t_a6_6 = vmulq_f32(self.twiddle2re, x6p13); + let t_a6_7 = vmulq_f32(self.twiddle4re, x7p12); + let t_a6_8 = vmulq_f32(self.twiddle9re, x8p11); + let t_a6_9 = vmulq_f32(self.twiddle3re, x9p10); + let t_a7_1 = vmulq_f32(self.twiddle7re, x1p18); + let t_a7_2 = vmulq_f32(self.twiddle5re, x2p17); + let t_a7_3 = vmulq_f32(self.twiddle2re, x3p16); + let t_a7_4 = vmulq_f32(self.twiddle9re, x4p15); + let t_a7_5 = vmulq_f32(self.twiddle3re, x5p14); + let t_a7_6 = vmulq_f32(self.twiddle4re, x6p13); + let t_a7_7 = vmulq_f32(self.twiddle8re, x7p12); + let t_a7_8 = vmulq_f32(self.twiddle1re, x8p11); + let t_a7_9 = vmulq_f32(self.twiddle6re, x9p10); + let t_a8_1 = vmulq_f32(self.twiddle8re, x1p18); + let t_a8_2 = vmulq_f32(self.twiddle3re, x2p17); + let t_a8_3 = vmulq_f32(self.twiddle5re, x3p16); + let t_a8_4 = vmulq_f32(self.twiddle6re, x4p15); + let t_a8_5 = vmulq_f32(self.twiddle2re, x5p14); + let t_a8_6 = vmulq_f32(self.twiddle9re, x6p13); + let t_a8_7 = vmulq_f32(self.twiddle1re, x7p12); + let t_a8_8 = vmulq_f32(self.twiddle7re, x8p11); + let t_a8_9 = vmulq_f32(self.twiddle4re, x9p10); + let t_a9_1 = vmulq_f32(self.twiddle9re, x1p18); + let t_a9_2 = vmulq_f32(self.twiddle1re, x2p17); + let t_a9_3 = vmulq_f32(self.twiddle8re, x3p16); + let t_a9_4 = vmulq_f32(self.twiddle2re, x4p15); + let t_a9_5 = vmulq_f32(self.twiddle7re, x5p14); + let t_a9_6 = vmulq_f32(self.twiddle3re, x6p13); + let t_a9_7 = vmulq_f32(self.twiddle6re, x7p12); + let t_a9_8 = vmulq_f32(self.twiddle4re, x8p11); + let t_a9_9 = vmulq_f32(self.twiddle5re, x9p10); + + let t_b1_1 = vmulq_f32(self.twiddle1im, x1m18); + let t_b1_2 = vmulq_f32(self.twiddle2im, x2m17); + let t_b1_3 = vmulq_f32(self.twiddle3im, x3m16); + let t_b1_4 = vmulq_f32(self.twiddle4im, x4m15); + let t_b1_5 = vmulq_f32(self.twiddle5im, x5m14); + let t_b1_6 = vmulq_f32(self.twiddle6im, x6m13); + let t_b1_7 = vmulq_f32(self.twiddle7im, x7m12); + let t_b1_8 = vmulq_f32(self.twiddle8im, x8m11); + let t_b1_9 = vmulq_f32(self.twiddle9im, x9m10); + let t_b2_1 = vmulq_f32(self.twiddle2im, x1m18); + let t_b2_2 = vmulq_f32(self.twiddle4im, x2m17); + let t_b2_3 = vmulq_f32(self.twiddle6im, x3m16); + let t_b2_4 = vmulq_f32(self.twiddle8im, x4m15); + let t_b2_5 = vmulq_f32(self.twiddle9im, x5m14); + let t_b2_6 = vmulq_f32(self.twiddle7im, x6m13); + let t_b2_7 = vmulq_f32(self.twiddle5im, x7m12); + let t_b2_8 = vmulq_f32(self.twiddle3im, x8m11); + let t_b2_9 = vmulq_f32(self.twiddle1im, x9m10); + let t_b3_1 = vmulq_f32(self.twiddle3im, x1m18); + let t_b3_2 = vmulq_f32(self.twiddle6im, x2m17); + let t_b3_3 = vmulq_f32(self.twiddle9im, x3m16); + let t_b3_4 = vmulq_f32(self.twiddle7im, x4m15); + let t_b3_5 = vmulq_f32(self.twiddle4im, x5m14); + let t_b3_6 = vmulq_f32(self.twiddle1im, x6m13); + let t_b3_7 = 
vmulq_f32(self.twiddle2im, x7m12); + let t_b3_8 = vmulq_f32(self.twiddle5im, x8m11); + let t_b3_9 = vmulq_f32(self.twiddle8im, x9m10); + let t_b4_1 = vmulq_f32(self.twiddle4im, x1m18); + let t_b4_2 = vmulq_f32(self.twiddle8im, x2m17); + let t_b4_3 = vmulq_f32(self.twiddle7im, x3m16); + let t_b4_4 = vmulq_f32(self.twiddle3im, x4m15); + let t_b4_5 = vmulq_f32(self.twiddle1im, x5m14); + let t_b4_6 = vmulq_f32(self.twiddle5im, x6m13); + let t_b4_7 = vmulq_f32(self.twiddle9im, x7m12); + let t_b4_8 = vmulq_f32(self.twiddle6im, x8m11); + let t_b4_9 = vmulq_f32(self.twiddle2im, x9m10); + let t_b5_1 = vmulq_f32(self.twiddle5im, x1m18); + let t_b5_2 = vmulq_f32(self.twiddle9im, x2m17); + let t_b5_3 = vmulq_f32(self.twiddle4im, x3m16); + let t_b5_4 = vmulq_f32(self.twiddle1im, x4m15); + let t_b5_5 = vmulq_f32(self.twiddle6im, x5m14); + let t_b5_6 = vmulq_f32(self.twiddle8im, x6m13); + let t_b5_7 = vmulq_f32(self.twiddle3im, x7m12); + let t_b5_8 = vmulq_f32(self.twiddle2im, x8m11); + let t_b5_9 = vmulq_f32(self.twiddle7im, x9m10); + let t_b6_1 = vmulq_f32(self.twiddle6im, x1m18); + let t_b6_2 = vmulq_f32(self.twiddle7im, x2m17); + let t_b6_3 = vmulq_f32(self.twiddle1im, x3m16); + let t_b6_4 = vmulq_f32(self.twiddle5im, x4m15); + let t_b6_5 = vmulq_f32(self.twiddle8im, x5m14); + let t_b6_6 = vmulq_f32(self.twiddle2im, x6m13); + let t_b6_7 = vmulq_f32(self.twiddle4im, x7m12); + let t_b6_8 = vmulq_f32(self.twiddle9im, x8m11); + let t_b6_9 = vmulq_f32(self.twiddle3im, x9m10); + let t_b7_1 = vmulq_f32(self.twiddle7im, x1m18); + let t_b7_2 = vmulq_f32(self.twiddle5im, x2m17); + let t_b7_3 = vmulq_f32(self.twiddle2im, x3m16); + let t_b7_4 = vmulq_f32(self.twiddle9im, x4m15); + let t_b7_5 = vmulq_f32(self.twiddle3im, x5m14); + let t_b7_6 = vmulq_f32(self.twiddle4im, x6m13); + let t_b7_7 = vmulq_f32(self.twiddle8im, x7m12); + let t_b7_8 = vmulq_f32(self.twiddle1im, x8m11); + let t_b7_9 = vmulq_f32(self.twiddle6im, x9m10); + let t_b8_1 = vmulq_f32(self.twiddle8im, x1m18); + let t_b8_2 = vmulq_f32(self.twiddle3im, x2m17); + let t_b8_3 = vmulq_f32(self.twiddle5im, x3m16); + let t_b8_4 = vmulq_f32(self.twiddle6im, x4m15); + let t_b8_5 = vmulq_f32(self.twiddle2im, x5m14); + let t_b8_6 = vmulq_f32(self.twiddle9im, x6m13); + let t_b8_7 = vmulq_f32(self.twiddle1im, x7m12); + let t_b8_8 = vmulq_f32(self.twiddle7im, x8m11); + let t_b8_9 = vmulq_f32(self.twiddle4im, x9m10); + let t_b9_1 = vmulq_f32(self.twiddle9im, x1m18); + let t_b9_2 = vmulq_f32(self.twiddle1im, x2m17); + let t_b9_3 = vmulq_f32(self.twiddle8im, x3m16); + let t_b9_4 = vmulq_f32(self.twiddle2im, x4m15); + let t_b9_5 = vmulq_f32(self.twiddle7im, x5m14); + let t_b9_6 = vmulq_f32(self.twiddle3im, x6m13); + let t_b9_7 = vmulq_f32(self.twiddle6im, x7m12); + let t_b9_8 = vmulq_f32(self.twiddle4im, x8m11); + let t_b9_9 = vmulq_f32(self.twiddle5im, x9m10); + + let x0 = values[0]; + let t_a1 = calc_f32!(x0 + t_a1_1 + t_a1_2 + t_a1_3 + t_a1_4 + t_a1_5 + t_a1_6 + t_a1_7 + t_a1_8 + t_a1_9); + let t_a2 = calc_f32!(x0 + t_a2_1 + t_a2_2 + t_a2_3 + t_a2_4 + t_a2_5 + t_a2_6 + t_a2_7 + t_a2_8 + t_a2_9); + let t_a3 = calc_f32!(x0 + t_a3_1 + t_a3_2 + t_a3_3 + t_a3_4 + t_a3_5 + t_a3_6 + t_a3_7 + t_a3_8 + t_a3_9); + let t_a4 = calc_f32!(x0 + t_a4_1 + t_a4_2 + t_a4_3 + t_a4_4 + t_a4_5 + t_a4_6 + t_a4_7 + t_a4_8 + t_a4_9); + let t_a5 = calc_f32!(x0 + t_a5_1 + t_a5_2 + t_a5_3 + t_a5_4 + t_a5_5 + t_a5_6 + t_a5_7 + t_a5_8 + t_a5_9); + let t_a6 = calc_f32!(x0 + t_a6_1 + t_a6_2 + t_a6_3 + t_a6_4 + t_a6_5 + t_a6_6 + t_a6_7 + t_a6_8 + t_a6_9); + let t_a7 = calc_f32!(x0 + t_a7_1 + 
t_a7_2 + t_a7_3 + t_a7_4 + t_a7_5 + t_a7_6 + t_a7_7 + t_a7_8 + t_a7_9); + let t_a8 = calc_f32!(x0 + t_a8_1 + t_a8_2 + t_a8_3 + t_a8_4 + t_a8_5 + t_a8_6 + t_a8_7 + t_a8_8 + t_a8_9); + let t_a9 = calc_f32!(x0 + t_a9_1 + t_a9_2 + t_a9_3 + t_a9_4 + t_a9_5 + t_a9_6 + t_a9_7 + t_a9_8 + t_a9_9); + + let t_b1 = calc_f32!(t_b1_1 + t_b1_2 + t_b1_3 + t_b1_4 + t_b1_5 + t_b1_6 + t_b1_7 + t_b1_8 + t_b1_9); + let t_b2 = calc_f32!(t_b2_1 + t_b2_2 + t_b2_3 + t_b2_4 - t_b2_5 - t_b2_6 - t_b2_7 - t_b2_8 - t_b2_9); + let t_b3 = calc_f32!(t_b3_1 + t_b3_2 + t_b3_3 - t_b3_4 - t_b3_5 - t_b3_6 + t_b3_7 + t_b3_8 + t_b3_9); + let t_b4 = calc_f32!(t_b4_1 + t_b4_2 - t_b4_3 - t_b4_4 + t_b4_5 + t_b4_6 + t_b4_7 - t_b4_8 - t_b4_9); + let t_b5 = calc_f32!(t_b5_1 - t_b5_2 - t_b5_3 + t_b5_4 + t_b5_5 - t_b5_6 - t_b5_7 + t_b5_8 + t_b5_9); + let t_b6 = calc_f32!(t_b6_1 - t_b6_2 - t_b6_3 + t_b6_4 - t_b6_5 - t_b6_6 + t_b6_7 - t_b6_8 - t_b6_9); + let t_b7 = calc_f32!(t_b7_1 - t_b7_2 + t_b7_3 + t_b7_4 - t_b7_5 + t_b7_6 - t_b7_7 - t_b7_8 + t_b7_9); + let t_b8 = calc_f32!(t_b8_1 - t_b8_2 + t_b8_3 - t_b8_4 + t_b8_5 - t_b8_6 - t_b8_7 + t_b8_8 - t_b8_9); + let t_b9 = calc_f32!(t_b9_1 - t_b9_2 + t_b9_3 - t_b9_4 + t_b9_5 - t_b9_6 + t_b9_7 - t_b9_8 + t_b9_9); + + let t_b1_rot = self.rotate.rotate_both(t_b1); + let t_b2_rot = self.rotate.rotate_both(t_b2); + let t_b3_rot = self.rotate.rotate_both(t_b3); + let t_b4_rot = self.rotate.rotate_both(t_b4); + let t_b5_rot = self.rotate.rotate_both(t_b5); + let t_b6_rot = self.rotate.rotate_both(t_b6); + let t_b7_rot = self.rotate.rotate_both(t_b7); + let t_b8_rot = self.rotate.rotate_both(t_b8); + let t_b9_rot = self.rotate.rotate_both(t_b9); + + let y0 = calc_f32!(x0 + x1p18 + x2p17 + x3p16 + x4p15 + x5p14 + x6p13 + x7p12 + x8p11 + x9p10); + let [y1, y18] = parallel_fft2_interleaved_f32(t_a1, t_b1_rot); + let [y2, y17] = parallel_fft2_interleaved_f32(t_a2, t_b2_rot); + let [y3, y16] = parallel_fft2_interleaved_f32(t_a3, t_b3_rot); + let [y4, y15] = parallel_fft2_interleaved_f32(t_a4, t_b4_rot); + let [y5, y14] = parallel_fft2_interleaved_f32(t_a5, t_b5_rot); + let [y6, y13] = parallel_fft2_interleaved_f32(t_a6, t_b6_rot); + let [y7, y12] = parallel_fft2_interleaved_f32(t_a7, t_b7_rot); + let [y8, y11] = parallel_fft2_interleaved_f32(t_a8, t_b8_rot); + let [y9, y10] = parallel_fft2_interleaved_f32(t_a9, t_b9_rot); + [y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15, y16, y17, y18] + } +} + +// _ ___ __ _ _ _ _ _ +// / |/ _ \ / /_ | || | | |__ (_) |_ +// | | (_) | _____ | '_ \| || |_| '_ \| | __| +// | |\__, | |_____| | (_) |__ _| |_) | | |_ +// |_| /_/ \___/ |_| |_.__/|_|\__| +// + +pub struct NeonF64Butterfly19 { + direction: FftDirection, + _phantom: std::marker::PhantomData, + rotate: Rotate90F64, + twiddle1re: float64x2_t, + twiddle1im: float64x2_t, + twiddle2re: float64x2_t, + twiddle2im: float64x2_t, + twiddle3re: float64x2_t, + twiddle3im: float64x2_t, + twiddle4re: float64x2_t, + twiddle4im: float64x2_t, + twiddle5re: float64x2_t, + twiddle5im: float64x2_t, + twiddle6re: float64x2_t, + twiddle6im: float64x2_t, + twiddle7re: float64x2_t, + twiddle7im: float64x2_t, + twiddle8re: float64x2_t, + twiddle8im: float64x2_t, + twiddle9re: float64x2_t, + twiddle9im: float64x2_t, +} + +boilerplate_fft_neon_f64_butterfly!(NeonF64Butterfly19, 19, |this: &NeonF64Butterfly19<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF64Butterfly19, 19, |this: &NeonF64Butterfly19<_>| this + .direction); +impl NeonF64Butterfly19 { + #[inline(always)] + pub fn new(direction: 
FftDirection) -> Self { + assert_f64::(); + let rotate = Rotate90F64::new(true); + let tw1: Complex = twiddles::compute_twiddle(1, 19, direction); + let tw2: Complex = twiddles::compute_twiddle(2, 19, direction); + let tw3: Complex = twiddles::compute_twiddle(3, 19, direction); + let tw4: Complex = twiddles::compute_twiddle(4, 19, direction); + let tw5: Complex = twiddles::compute_twiddle(5, 19, direction); + let tw6: Complex = twiddles::compute_twiddle(6, 19, direction); + let tw7: Complex = twiddles::compute_twiddle(7, 19, direction); + let tw8: Complex = twiddles::compute_twiddle(8, 19, direction); + let tw9: Complex = twiddles::compute_twiddle(9, 19, direction); + let twiddle1re = unsafe { vmovq_n_f64(tw1.re) }; + let twiddle1im = unsafe { vmovq_n_f64(tw1.im) }; + let twiddle2re = unsafe { vmovq_n_f64(tw2.re) }; + let twiddle2im = unsafe { vmovq_n_f64(tw2.im) }; + let twiddle3re = unsafe { vmovq_n_f64(tw3.re) }; + let twiddle3im = unsafe { vmovq_n_f64(tw3.im) }; + let twiddle4re = unsafe { vmovq_n_f64(tw4.re) }; + let twiddle4im = unsafe { vmovq_n_f64(tw4.im) }; + let twiddle5re = unsafe { vmovq_n_f64(tw5.re) }; + let twiddle5im = unsafe { vmovq_n_f64(tw5.im) }; + let twiddle6re = unsafe { vmovq_n_f64(tw6.re) }; + let twiddle6im = unsafe { vmovq_n_f64(tw6.im) }; + let twiddle7re = unsafe { vmovq_n_f64(tw7.re) }; + let twiddle7im = unsafe { vmovq_n_f64(tw7.im) }; + let twiddle8re = unsafe { vmovq_n_f64(tw8.re) }; + let twiddle8im = unsafe { vmovq_n_f64(tw8.im) }; + let twiddle9re = unsafe { vmovq_n_f64(tw9.re) }; + let twiddle9im = unsafe { vmovq_n_f64(tw9.im) }; + + Self { + direction, + _phantom: std::marker::PhantomData, + rotate, + twiddle1re, + twiddle1im, + twiddle2re, + twiddle2im, + twiddle3re, + twiddle3im, + twiddle4re, + twiddle4im, + twiddle5re, + twiddle5im, + twiddle6re, + twiddle6im, + twiddle7re, + twiddle7im, + twiddle8re, + twiddle8im, + twiddle9re, + twiddle9im, + } + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let values = read_complex_to_array!(input, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}); + + let out = self.perform_fft_direct(values); + + write_complex_to_array!(out, output, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_direct(&self, values: [float64x2_t; 19]) -> [float64x2_t; 19] { + let [x1p18, x1m18] = solo_fft2_f64(values[1], values[18]); + let [x2p17, x2m17] = solo_fft2_f64(values[2], values[17]); + let [x3p16, x3m16] = solo_fft2_f64(values[3], values[16]); + let [x4p15, x4m15] = solo_fft2_f64(values[4], values[15]); + let [x5p14, x5m14] = solo_fft2_f64(values[5], values[14]); + let [x6p13, x6m13] = solo_fft2_f64(values[6], values[13]); + let [x7p12, x7m12] = solo_fft2_f64(values[7], values[12]); + let [x8p11, x8m11] = solo_fft2_f64(values[8], values[11]); + let [x9p10, x9m10] = solo_fft2_f64(values[9], values[10]); + + let t_a1_1 = vmulq_f64(self.twiddle1re, x1p18); + let t_a1_2 = vmulq_f64(self.twiddle2re, x2p17); + let t_a1_3 = vmulq_f64(self.twiddle3re, x3p16); + let t_a1_4 = vmulq_f64(self.twiddle4re, x4p15); + let t_a1_5 = vmulq_f64(self.twiddle5re, x5p14); + let t_a1_6 = vmulq_f64(self.twiddle6re, x6p13); + let t_a1_7 = vmulq_f64(self.twiddle7re, x7p12); + let t_a1_8 = vmulq_f64(self.twiddle8re, x8p11); + let t_a1_9 = vmulq_f64(self.twiddle9re, x9p10); + let t_a2_1 = vmulq_f64(self.twiddle2re, x1p18); + let t_a2_2 = vmulq_f64(self.twiddle4re, x2p17); + 
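// Each t_aK_J term scales the symmetric pair sum (values[j] + values[19 - j]) by the real part of the twiddle with index (K * j) mod 19, folded into 1..=9; the t_aK sums built from these follow the same pattern as the 13- and 17-point butterflies above. +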
let t_a2_3 = vmulq_f64(self.twiddle6re, x3p16); + let t_a2_4 = vmulq_f64(self.twiddle8re, x4p15); + let t_a2_5 = vmulq_f64(self.twiddle9re, x5p14); + let t_a2_6 = vmulq_f64(self.twiddle7re, x6p13); + let t_a2_7 = vmulq_f64(self.twiddle5re, x7p12); + let t_a2_8 = vmulq_f64(self.twiddle3re, x8p11); + let t_a2_9 = vmulq_f64(self.twiddle1re, x9p10); + let t_a3_1 = vmulq_f64(self.twiddle3re, x1p18); + let t_a3_2 = vmulq_f64(self.twiddle6re, x2p17); + let t_a3_3 = vmulq_f64(self.twiddle9re, x3p16); + let t_a3_4 = vmulq_f64(self.twiddle7re, x4p15); + let t_a3_5 = vmulq_f64(self.twiddle4re, x5p14); + let t_a3_6 = vmulq_f64(self.twiddle1re, x6p13); + let t_a3_7 = vmulq_f64(self.twiddle2re, x7p12); + let t_a3_8 = vmulq_f64(self.twiddle5re, x8p11); + let t_a3_9 = vmulq_f64(self.twiddle8re, x9p10); + let t_a4_1 = vmulq_f64(self.twiddle4re, x1p18); + let t_a4_2 = vmulq_f64(self.twiddle8re, x2p17); + let t_a4_3 = vmulq_f64(self.twiddle7re, x3p16); + let t_a4_4 = vmulq_f64(self.twiddle3re, x4p15); + let t_a4_5 = vmulq_f64(self.twiddle1re, x5p14); + let t_a4_6 = vmulq_f64(self.twiddle5re, x6p13); + let t_a4_7 = vmulq_f64(self.twiddle9re, x7p12); + let t_a4_8 = vmulq_f64(self.twiddle6re, x8p11); + let t_a4_9 = vmulq_f64(self.twiddle2re, x9p10); + let t_a5_1 = vmulq_f64(self.twiddle5re, x1p18); + let t_a5_2 = vmulq_f64(self.twiddle9re, x2p17); + let t_a5_3 = vmulq_f64(self.twiddle4re, x3p16); + let t_a5_4 = vmulq_f64(self.twiddle1re, x4p15); + let t_a5_5 = vmulq_f64(self.twiddle6re, x5p14); + let t_a5_6 = vmulq_f64(self.twiddle8re, x6p13); + let t_a5_7 = vmulq_f64(self.twiddle3re, x7p12); + let t_a5_8 = vmulq_f64(self.twiddle2re, x8p11); + let t_a5_9 = vmulq_f64(self.twiddle7re, x9p10); + let t_a6_1 = vmulq_f64(self.twiddle6re, x1p18); + let t_a6_2 = vmulq_f64(self.twiddle7re, x2p17); + let t_a6_3 = vmulq_f64(self.twiddle1re, x3p16); + let t_a6_4 = vmulq_f64(self.twiddle5re, x4p15); + let t_a6_5 = vmulq_f64(self.twiddle8re, x5p14); + let t_a6_6 = vmulq_f64(self.twiddle2re, x6p13); + let t_a6_7 = vmulq_f64(self.twiddle4re, x7p12); + let t_a6_8 = vmulq_f64(self.twiddle9re, x8p11); + let t_a6_9 = vmulq_f64(self.twiddle3re, x9p10); + let t_a7_1 = vmulq_f64(self.twiddle7re, x1p18); + let t_a7_2 = vmulq_f64(self.twiddle5re, x2p17); + let t_a7_3 = vmulq_f64(self.twiddle2re, x3p16); + let t_a7_4 = vmulq_f64(self.twiddle9re, x4p15); + let t_a7_5 = vmulq_f64(self.twiddle3re, x5p14); + let t_a7_6 = vmulq_f64(self.twiddle4re, x6p13); + let t_a7_7 = vmulq_f64(self.twiddle8re, x7p12); + let t_a7_8 = vmulq_f64(self.twiddle1re, x8p11); + let t_a7_9 = vmulq_f64(self.twiddle6re, x9p10); + let t_a8_1 = vmulq_f64(self.twiddle8re, x1p18); + let t_a8_2 = vmulq_f64(self.twiddle3re, x2p17); + let t_a8_3 = vmulq_f64(self.twiddle5re, x3p16); + let t_a8_4 = vmulq_f64(self.twiddle6re, x4p15); + let t_a8_5 = vmulq_f64(self.twiddle2re, x5p14); + let t_a8_6 = vmulq_f64(self.twiddle9re, x6p13); + let t_a8_7 = vmulq_f64(self.twiddle1re, x7p12); + let t_a8_8 = vmulq_f64(self.twiddle7re, x8p11); + let t_a8_9 = vmulq_f64(self.twiddle4re, x9p10); + let t_a9_1 = vmulq_f64(self.twiddle9re, x1p18); + let t_a9_2 = vmulq_f64(self.twiddle1re, x2p17); + let t_a9_3 = vmulq_f64(self.twiddle8re, x3p16); + let t_a9_4 = vmulq_f64(self.twiddle2re, x4p15); + let t_a9_5 = vmulq_f64(self.twiddle7re, x5p14); + let t_a9_6 = vmulq_f64(self.twiddle3re, x6p13); + let t_a9_7 = vmulq_f64(self.twiddle6re, x7p12); + let t_a9_8 = vmulq_f64(self.twiddle4re, x8p11); + let t_a9_9 = vmulq_f64(self.twiddle5re, x9p10); + + let t_b1_1 = vmulq_f64(self.twiddle1im, x1m18); + 
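// The t_bK_J terms are the counterpart using the twiddle imaginary parts and the pair differences (values[j] - values[19 - j]); they are summed with k-dependent sign patterns, rotated by 90 degrees (multiplied by +/-i via self.rotate), and combined with the t_aK terms through solo_fft2_f64, as in the butterflies above. +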
let t_b1_2 = vmulq_f64(self.twiddle2im, x2m17); + let t_b1_3 = vmulq_f64(self.twiddle3im, x3m16); + let t_b1_4 = vmulq_f64(self.twiddle4im, x4m15); + let t_b1_5 = vmulq_f64(self.twiddle5im, x5m14); + let t_b1_6 = vmulq_f64(self.twiddle6im, x6m13); + let t_b1_7 = vmulq_f64(self.twiddle7im, x7m12); + let t_b1_8 = vmulq_f64(self.twiddle8im, x8m11); + let t_b1_9 = vmulq_f64(self.twiddle9im, x9m10); + let t_b2_1 = vmulq_f64(self.twiddle2im, x1m18); + let t_b2_2 = vmulq_f64(self.twiddle4im, x2m17); + let t_b2_3 = vmulq_f64(self.twiddle6im, x3m16); + let t_b2_4 = vmulq_f64(self.twiddle8im, x4m15); + let t_b2_5 = vmulq_f64(self.twiddle9im, x5m14); + let t_b2_6 = vmulq_f64(self.twiddle7im, x6m13); + let t_b2_7 = vmulq_f64(self.twiddle5im, x7m12); + let t_b2_8 = vmulq_f64(self.twiddle3im, x8m11); + let t_b2_9 = vmulq_f64(self.twiddle1im, x9m10); + let t_b3_1 = vmulq_f64(self.twiddle3im, x1m18); + let t_b3_2 = vmulq_f64(self.twiddle6im, x2m17); + let t_b3_3 = vmulq_f64(self.twiddle9im, x3m16); + let t_b3_4 = vmulq_f64(self.twiddle7im, x4m15); + let t_b3_5 = vmulq_f64(self.twiddle4im, x5m14); + let t_b3_6 = vmulq_f64(self.twiddle1im, x6m13); + let t_b3_7 = vmulq_f64(self.twiddle2im, x7m12); + let t_b3_8 = vmulq_f64(self.twiddle5im, x8m11); + let t_b3_9 = vmulq_f64(self.twiddle8im, x9m10); + let t_b4_1 = vmulq_f64(self.twiddle4im, x1m18); + let t_b4_2 = vmulq_f64(self.twiddle8im, x2m17); + let t_b4_3 = vmulq_f64(self.twiddle7im, x3m16); + let t_b4_4 = vmulq_f64(self.twiddle3im, x4m15); + let t_b4_5 = vmulq_f64(self.twiddle1im, x5m14); + let t_b4_6 = vmulq_f64(self.twiddle5im, x6m13); + let t_b4_7 = vmulq_f64(self.twiddle9im, x7m12); + let t_b4_8 = vmulq_f64(self.twiddle6im, x8m11); + let t_b4_9 = vmulq_f64(self.twiddle2im, x9m10); + let t_b5_1 = vmulq_f64(self.twiddle5im, x1m18); + let t_b5_2 = vmulq_f64(self.twiddle9im, x2m17); + let t_b5_3 = vmulq_f64(self.twiddle4im, x3m16); + let t_b5_4 = vmulq_f64(self.twiddle1im, x4m15); + let t_b5_5 = vmulq_f64(self.twiddle6im, x5m14); + let t_b5_6 = vmulq_f64(self.twiddle8im, x6m13); + let t_b5_7 = vmulq_f64(self.twiddle3im, x7m12); + let t_b5_8 = vmulq_f64(self.twiddle2im, x8m11); + let t_b5_9 = vmulq_f64(self.twiddle7im, x9m10); + let t_b6_1 = vmulq_f64(self.twiddle6im, x1m18); + let t_b6_2 = vmulq_f64(self.twiddle7im, x2m17); + let t_b6_3 = vmulq_f64(self.twiddle1im, x3m16); + let t_b6_4 = vmulq_f64(self.twiddle5im, x4m15); + let t_b6_5 = vmulq_f64(self.twiddle8im, x5m14); + let t_b6_6 = vmulq_f64(self.twiddle2im, x6m13); + let t_b6_7 = vmulq_f64(self.twiddle4im, x7m12); + let t_b6_8 = vmulq_f64(self.twiddle9im, x8m11); + let t_b6_9 = vmulq_f64(self.twiddle3im, x9m10); + let t_b7_1 = vmulq_f64(self.twiddle7im, x1m18); + let t_b7_2 = vmulq_f64(self.twiddle5im, x2m17); + let t_b7_3 = vmulq_f64(self.twiddle2im, x3m16); + let t_b7_4 = vmulq_f64(self.twiddle9im, x4m15); + let t_b7_5 = vmulq_f64(self.twiddle3im, x5m14); + let t_b7_6 = vmulq_f64(self.twiddle4im, x6m13); + let t_b7_7 = vmulq_f64(self.twiddle8im, x7m12); + let t_b7_8 = vmulq_f64(self.twiddle1im, x8m11); + let t_b7_9 = vmulq_f64(self.twiddle6im, x9m10); + let t_b8_1 = vmulq_f64(self.twiddle8im, x1m18); + let t_b8_2 = vmulq_f64(self.twiddle3im, x2m17); + let t_b8_3 = vmulq_f64(self.twiddle5im, x3m16); + let t_b8_4 = vmulq_f64(self.twiddle6im, x4m15); + let t_b8_5 = vmulq_f64(self.twiddle2im, x5m14); + let t_b8_6 = vmulq_f64(self.twiddle9im, x6m13); + let t_b8_7 = vmulq_f64(self.twiddle1im, x7m12); + let t_b8_8 = vmulq_f64(self.twiddle7im, x8m11); + let t_b8_9 = vmulq_f64(self.twiddle4im, x9m10); + let 
t_b9_1 = vmulq_f64(self.twiddle9im, x1m18); + let t_b9_2 = vmulq_f64(self.twiddle1im, x2m17); + let t_b9_3 = vmulq_f64(self.twiddle8im, x3m16); + let t_b9_4 = vmulq_f64(self.twiddle2im, x4m15); + let t_b9_5 = vmulq_f64(self.twiddle7im, x5m14); + let t_b9_6 = vmulq_f64(self.twiddle3im, x6m13); + let t_b9_7 = vmulq_f64(self.twiddle6im, x7m12); + let t_b9_8 = vmulq_f64(self.twiddle4im, x8m11); + let t_b9_9 = vmulq_f64(self.twiddle5im, x9m10); + + let x0 = values[0]; + let t_a1 = calc_f64!(x0 + t_a1_1 + t_a1_2 + t_a1_3 + t_a1_4 + t_a1_5 + t_a1_6 + t_a1_7 + t_a1_8 + t_a1_9); + let t_a2 = calc_f64!(x0 + t_a2_1 + t_a2_2 + t_a2_3 + t_a2_4 + t_a2_5 + t_a2_6 + t_a2_7 + t_a2_8 + t_a2_9); + let t_a3 = calc_f64!(x0 + t_a3_1 + t_a3_2 + t_a3_3 + t_a3_4 + t_a3_5 + t_a3_6 + t_a3_7 + t_a3_8 + t_a3_9); + let t_a4 = calc_f64!(x0 + t_a4_1 + t_a4_2 + t_a4_3 + t_a4_4 + t_a4_5 + t_a4_6 + t_a4_7 + t_a4_8 + t_a4_9); + let t_a5 = calc_f64!(x0 + t_a5_1 + t_a5_2 + t_a5_3 + t_a5_4 + t_a5_5 + t_a5_6 + t_a5_7 + t_a5_8 + t_a5_9); + let t_a6 = calc_f64!(x0 + t_a6_1 + t_a6_2 + t_a6_3 + t_a6_4 + t_a6_5 + t_a6_6 + t_a6_7 + t_a6_8 + t_a6_9); + let t_a7 = calc_f64!(x0 + t_a7_1 + t_a7_2 + t_a7_3 + t_a7_4 + t_a7_5 + t_a7_6 + t_a7_7 + t_a7_8 + t_a7_9); + let t_a8 = calc_f64!(x0 + t_a8_1 + t_a8_2 + t_a8_3 + t_a8_4 + t_a8_5 + t_a8_6 + t_a8_7 + t_a8_8 + t_a8_9); + let t_a9 = calc_f64!(x0 + t_a9_1 + t_a9_2 + t_a9_3 + t_a9_4 + t_a9_5 + t_a9_6 + t_a9_7 + t_a9_8 + t_a9_9); + + let t_b1 = calc_f64!(t_b1_1 + t_b1_2 + t_b1_3 + t_b1_4 + t_b1_5 + t_b1_6 + t_b1_7 + t_b1_8 + t_b1_9); + let t_b2 = calc_f64!(t_b2_1 + t_b2_2 + t_b2_3 + t_b2_4 - t_b2_5 - t_b2_6 - t_b2_7 - t_b2_8 - t_b2_9); + let t_b3 = calc_f64!(t_b3_1 + t_b3_2 + t_b3_3 - t_b3_4 - t_b3_5 - t_b3_6 + t_b3_7 + t_b3_8 + t_b3_9); + let t_b4 = calc_f64!(t_b4_1 + t_b4_2 - t_b4_3 - t_b4_4 + t_b4_5 + t_b4_6 + t_b4_7 - t_b4_8 - t_b4_9); + let t_b5 = calc_f64!(t_b5_1 - t_b5_2 - t_b5_3 + t_b5_4 + t_b5_5 - t_b5_6 - t_b5_7 + t_b5_8 + t_b5_9); + let t_b6 = calc_f64!(t_b6_1 - t_b6_2 - t_b6_3 + t_b6_4 - t_b6_5 - t_b6_6 + t_b6_7 - t_b6_8 - t_b6_9); + let t_b7 = calc_f64!(t_b7_1 - t_b7_2 + t_b7_3 + t_b7_4 - t_b7_5 + t_b7_6 - t_b7_7 - t_b7_8 + t_b7_9); + let t_b8 = calc_f64!(t_b8_1 - t_b8_2 + t_b8_3 - t_b8_4 + t_b8_5 - t_b8_6 - t_b8_7 + t_b8_8 - t_b8_9); + let t_b9 = calc_f64!(t_b9_1 - t_b9_2 + t_b9_3 - t_b9_4 + t_b9_5 - t_b9_6 + t_b9_7 - t_b9_8 + t_b9_9); + + let t_b1_rot = self.rotate.rotate(t_b1); + let t_b2_rot = self.rotate.rotate(t_b2); + let t_b3_rot = self.rotate.rotate(t_b3); + let t_b4_rot = self.rotate.rotate(t_b4); + let t_b5_rot = self.rotate.rotate(t_b5); + let t_b6_rot = self.rotate.rotate(t_b6); + let t_b7_rot = self.rotate.rotate(t_b7); + let t_b8_rot = self.rotate.rotate(t_b8); + let t_b9_rot = self.rotate.rotate(t_b9); + + let y0 = calc_f64!(x0 + x1p18 + x2p17 + x3p16 + x4p15 + x5p14 + x6p13 + x7p12 + x8p11 + x9p10); + let [y1, y18] = solo_fft2_f64(t_a1, t_b1_rot); + let [y2, y17] = solo_fft2_f64(t_a2, t_b2_rot); + let [y3, y16] = solo_fft2_f64(t_a3, t_b3_rot); + let [y4, y15] = solo_fft2_f64(t_a4, t_b4_rot); + let [y5, y14] = solo_fft2_f64(t_a5, t_b5_rot); + let [y6, y13] = solo_fft2_f64(t_a6, t_b6_rot); + let [y7, y12] = solo_fft2_f64(t_a7, t_b7_rot); + let [y8, y11] = solo_fft2_f64(t_a8, t_b8_rot); + let [y9, y10] = solo_fft2_f64(t_a9, t_b9_rot); + [y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15, y16, y17, y18] + } +} + +// ____ _____ _________ _ _ _ +// |___ \|___ / |___ /___ \| |__ (_) |_ +// __) | |_ \ _____ |_ \ __) | '_ \| | __| +// / __/ ___) | |_____| 
___) / __/| |_) | | |_ + // |_____|____/ |____/_____|_.__/|_|\__| + // + + pub struct NeonF32Butterfly23<T> { + direction: FftDirection, + _phantom: std::marker::PhantomData<T>, + rotate: Rotate90F32, + twiddle1re: float32x4_t, + twiddle1im: float32x4_t, + twiddle2re: float32x4_t, + twiddle2im: float32x4_t, + twiddle3re: float32x4_t, + twiddle3im: float32x4_t, + twiddle4re: float32x4_t, + twiddle4im: float32x4_t, + twiddle5re: float32x4_t, + twiddle5im: float32x4_t, + twiddle6re: float32x4_t, + twiddle6im: float32x4_t, + twiddle7re: float32x4_t, + twiddle7im: float32x4_t, + twiddle8re: float32x4_t, + twiddle8im: float32x4_t, + twiddle9re: float32x4_t, + twiddle9im: float32x4_t, + twiddle10re: float32x4_t, + twiddle10im: float32x4_t, + twiddle11re: float32x4_t, + twiddle11im: float32x4_t, +} + +boilerplate_fft_neon_f32_butterfly!(NeonF32Butterfly23, 23, |this: &NeonF32Butterfly23<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF32Butterfly23, 23, |this: &NeonF32Butterfly23<_>| this + .direction); +impl<T: FftNum> NeonF32Butterfly23<T> { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f32::<T>(); + let rotate = Rotate90F32::new(true); + let tw1: Complex<f32> = twiddles::compute_twiddle(1, 23, direction); + let tw2: Complex<f32> = twiddles::compute_twiddle(2, 23, direction); + let tw3: Complex<f32> = twiddles::compute_twiddle(3, 23, direction); + let tw4: Complex<f32> = twiddles::compute_twiddle(4, 23, direction); + let tw5: Complex<f32> = twiddles::compute_twiddle(5, 23, direction); + let tw6: Complex<f32> = twiddles::compute_twiddle(6, 23, direction); + let tw7: Complex<f32> = twiddles::compute_twiddle(7, 23, direction); + let tw8: Complex<f32> = twiddles::compute_twiddle(8, 23, direction); + let tw9: Complex<f32> = twiddles::compute_twiddle(9, 23, direction); + let tw10: Complex<f32> = twiddles::compute_twiddle(10, 23, direction); + let tw11: Complex<f32> = twiddles::compute_twiddle(11, 23, direction); + let twiddle1re = unsafe { vmovq_n_f32(tw1.re) }; + let twiddle1im = unsafe { vmovq_n_f32(tw1.im) }; + let twiddle2re = unsafe { vmovq_n_f32(tw2.re) }; + let twiddle2im = unsafe { vmovq_n_f32(tw2.im) }; + let twiddle3re = unsafe { vmovq_n_f32(tw3.re) }; + let twiddle3im = unsafe { vmovq_n_f32(tw3.im) }; + let twiddle4re = unsafe { vmovq_n_f32(tw4.re) }; + let twiddle4im = unsafe { vmovq_n_f32(tw4.im) }; + let twiddle5re = unsafe { vmovq_n_f32(tw5.re) }; + let twiddle5im = unsafe { vmovq_n_f32(tw5.im) }; + let twiddle6re = unsafe { vmovq_n_f32(tw6.re) }; + let twiddle6im = unsafe { vmovq_n_f32(tw6.im) }; + let twiddle7re = unsafe { vmovq_n_f32(tw7.re) }; + let twiddle7im = unsafe { vmovq_n_f32(tw7.im) }; + let twiddle8re = unsafe { vmovq_n_f32(tw8.re) }; + let twiddle8im = unsafe { vmovq_n_f32(tw8.im) }; + let twiddle9re = unsafe { vmovq_n_f32(tw9.re) }; + let twiddle9im = unsafe { vmovq_n_f32(tw9.im) }; + let twiddle10re = unsafe { vmovq_n_f32(tw10.re) }; + let twiddle10im = unsafe { vmovq_n_f32(tw10.im) }; + let twiddle11re = unsafe { vmovq_n_f32(tw11.re) }; + let twiddle11im = unsafe { vmovq_n_f32(tw11.im) }; + + Self { + direction, + _phantom: std::marker::PhantomData, + rotate, + twiddle1re, + twiddle1im, + twiddle2re, + twiddle2im, + twiddle3re, + twiddle3im, + twiddle4re, + twiddle4im, + twiddle5re, + twiddle5im, + twiddle6re, + twiddle6im, + twiddle7re, + twiddle7im, + twiddle8re, + twiddle8im, + twiddle9re, + twiddle9im, + twiddle10re, + twiddle10im, + twiddle11re, + twiddle11im, + } + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_contiguous( + &self, + input: RawSlice<Complex<T>>, + output: RawSliceMut<Complex<T>>, + ) {
+ let values = read_partial1_complex_to_array!(input, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22}); + + let out = self.perform_parallel_fft_direct(values); + + write_partial_lo_complex_to_array!(out, output, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_contiguous( + &self, + input: RawSlice<Complex<T>>, + output: RawSliceMut<Complex<T>>, + ) { + let input_packed = read_complex_to_array!(input, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44}); + + let values = [ + extract_lo_hi_f32(input_packed[0], input_packed[11]), + extract_hi_lo_f32(input_packed[0], input_packed[12]), + extract_lo_hi_f32(input_packed[1], input_packed[12]), + extract_hi_lo_f32(input_packed[1], input_packed[13]), + extract_lo_hi_f32(input_packed[2], input_packed[13]), + extract_hi_lo_f32(input_packed[2], input_packed[14]), + extract_lo_hi_f32(input_packed[3], input_packed[14]), + extract_hi_lo_f32(input_packed[3], input_packed[15]), + extract_lo_hi_f32(input_packed[4], input_packed[15]), + extract_hi_lo_f32(input_packed[4], input_packed[16]), + extract_lo_hi_f32(input_packed[5], input_packed[16]), + extract_hi_lo_f32(input_packed[5], input_packed[17]), + extract_lo_hi_f32(input_packed[6], input_packed[17]), + extract_hi_lo_f32(input_packed[6], input_packed[18]), + extract_lo_hi_f32(input_packed[7], input_packed[18]), + extract_hi_lo_f32(input_packed[7], input_packed[19]), + extract_lo_hi_f32(input_packed[8], input_packed[19]), + extract_hi_lo_f32(input_packed[8], input_packed[20]), + extract_lo_hi_f32(input_packed[9], input_packed[20]), + extract_hi_lo_f32(input_packed[9], input_packed[21]), + extract_lo_hi_f32(input_packed[10], input_packed[21]), + extract_hi_lo_f32(input_packed[10], input_packed[22]), + extract_lo_hi_f32(input_packed[11], input_packed[22]), + ]; + + let out = self.perform_parallel_fft_direct(values); + + let out_packed = [ + extract_lo_lo_f32(out[0], out[1]), + extract_lo_lo_f32(out[2], out[3]), + extract_lo_lo_f32(out[4], out[5]), + extract_lo_lo_f32(out[6], out[7]), + extract_lo_lo_f32(out[8], out[9]), + extract_lo_lo_f32(out[10], out[11]), + extract_lo_lo_f32(out[12], out[13]), + extract_lo_lo_f32(out[14], out[15]), + extract_lo_lo_f32(out[16], out[17]), + extract_lo_lo_f32(out[18], out[19]), + extract_lo_lo_f32(out[20], out[21]), + extract_lo_hi_f32(out[22], out[0]), + extract_hi_hi_f32(out[1], out[2]), + extract_hi_hi_f32(out[3], out[4]), + extract_hi_hi_f32(out[5], out[6]), + extract_hi_hi_f32(out[7], out[8]), + extract_hi_hi_f32(out[9], out[10]), + extract_hi_hi_f32(out[11], out[12]), + extract_hi_hi_f32(out[13], out[14]), + extract_hi_hi_f32(out[15], out[16]), + extract_hi_hi_f32(out[17], out[18]), + extract_hi_hi_f32(out[19], out[20]), + extract_hi_hi_f32(out[21], out[22]), + ]; + + write_complex_to_array_strided!(out_packed, output, 2, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [float32x4_t; 23]) -> [float32x4_t; 23] { + let [x1p22, x1m22] = parallel_fft2_interleaved_f32(values[1], values[22]); + let [x2p21, x2m21] = parallel_fft2_interleaved_f32(values[2], values[21]); + let [x3p20, x3m20] = parallel_fft2_interleaved_f32(values[3], values[20]); + let [x4p19, x4m19] = parallel_fft2_interleaved_f32(values[4], values[19]); + let [x5p18, x5m18] = parallel_fft2_interleaved_f32(values[5], values[18]); +
let [x6p17, x6m17] = parallel_fft2_interleaved_f32(values[6], values[17]); + let [x7p16, x7m16] = parallel_fft2_interleaved_f32(values[7], values[16]); + let [x8p15, x8m15] = parallel_fft2_interleaved_f32(values[8], values[15]); + let [x9p14, x9m14] = parallel_fft2_interleaved_f32(values[9], values[14]); + let [x10p13, x10m13] = parallel_fft2_interleaved_f32(values[10], values[13]); + let [x11p12, x11m12] = parallel_fft2_interleaved_f32(values[11], values[12]); + + let t_a1_1 = vmulq_f32(self.twiddle1re, x1p22); + let t_a1_2 = vmulq_f32(self.twiddle2re, x2p21); + let t_a1_3 = vmulq_f32(self.twiddle3re, x3p20); + let t_a1_4 = vmulq_f32(self.twiddle4re, x4p19); + let t_a1_5 = vmulq_f32(self.twiddle5re, x5p18); + let t_a1_6 = vmulq_f32(self.twiddle6re, x6p17); + let t_a1_7 = vmulq_f32(self.twiddle7re, x7p16); + let t_a1_8 = vmulq_f32(self.twiddle8re, x8p15); + let t_a1_9 = vmulq_f32(self.twiddle9re, x9p14); + let t_a1_10 = vmulq_f32(self.twiddle10re, x10p13); + let t_a1_11 = vmulq_f32(self.twiddle11re, x11p12); + let t_a2_1 = vmulq_f32(self.twiddle2re, x1p22); + let t_a2_2 = vmulq_f32(self.twiddle4re, x2p21); + let t_a2_3 = vmulq_f32(self.twiddle6re, x3p20); + let t_a2_4 = vmulq_f32(self.twiddle8re, x4p19); + let t_a2_5 = vmulq_f32(self.twiddle10re, x5p18); + let t_a2_6 = vmulq_f32(self.twiddle11re, x6p17); + let t_a2_7 = vmulq_f32(self.twiddle9re, x7p16); + let t_a2_8 = vmulq_f32(self.twiddle7re, x8p15); + let t_a2_9 = vmulq_f32(self.twiddle5re, x9p14); + let t_a2_10 = vmulq_f32(self.twiddle3re, x10p13); + let t_a2_11 = vmulq_f32(self.twiddle1re, x11p12); + let t_a3_1 = vmulq_f32(self.twiddle3re, x1p22); + let t_a3_2 = vmulq_f32(self.twiddle6re, x2p21); + let t_a3_3 = vmulq_f32(self.twiddle9re, x3p20); + let t_a3_4 = vmulq_f32(self.twiddle11re, x4p19); + let t_a3_5 = vmulq_f32(self.twiddle8re, x5p18); + let t_a3_6 = vmulq_f32(self.twiddle5re, x6p17); + let t_a3_7 = vmulq_f32(self.twiddle2re, x7p16); + let t_a3_8 = vmulq_f32(self.twiddle1re, x8p15); + let t_a3_9 = vmulq_f32(self.twiddle4re, x9p14); + let t_a3_10 = vmulq_f32(self.twiddle7re, x10p13); + let t_a3_11 = vmulq_f32(self.twiddle10re, x11p12); + let t_a4_1 = vmulq_f32(self.twiddle4re, x1p22); + let t_a4_2 = vmulq_f32(self.twiddle8re, x2p21); + let t_a4_3 = vmulq_f32(self.twiddle11re, x3p20); + let t_a4_4 = vmulq_f32(self.twiddle7re, x4p19); + let t_a4_5 = vmulq_f32(self.twiddle3re, x5p18); + let t_a4_6 = vmulq_f32(self.twiddle1re, x6p17); + let t_a4_7 = vmulq_f32(self.twiddle5re, x7p16); + let t_a4_8 = vmulq_f32(self.twiddle9re, x8p15); + let t_a4_9 = vmulq_f32(self.twiddle10re, x9p14); + let t_a4_10 = vmulq_f32(self.twiddle6re, x10p13); + let t_a4_11 = vmulq_f32(self.twiddle2re, x11p12); + let t_a5_1 = vmulq_f32(self.twiddle5re, x1p22); + let t_a5_2 = vmulq_f32(self.twiddle10re, x2p21); + let t_a5_3 = vmulq_f32(self.twiddle8re, x3p20); + let t_a5_4 = vmulq_f32(self.twiddle3re, x4p19); + let t_a5_5 = vmulq_f32(self.twiddle2re, x5p18); + let t_a5_6 = vmulq_f32(self.twiddle7re, x6p17); + let t_a5_7 = vmulq_f32(self.twiddle11re, x7p16); + let t_a5_8 = vmulq_f32(self.twiddle6re, x8p15); + let t_a5_9 = vmulq_f32(self.twiddle1re, x9p14); + let t_a5_10 = vmulq_f32(self.twiddle4re, x10p13); + let t_a5_11 = vmulq_f32(self.twiddle9re, x11p12); + let t_a6_1 = vmulq_f32(self.twiddle6re, x1p22); + let t_a6_2 = vmulq_f32(self.twiddle11re, x2p21); + let t_a6_3 = vmulq_f32(self.twiddle5re, x3p20); + let t_a6_4 = vmulq_f32(self.twiddle1re, x4p19); + let t_a6_5 = vmulq_f32(self.twiddle7re, x5p18); + let t_a6_6 = vmulq_f32(self.twiddle10re, x6p17); + 
let t_a6_7 = vmulq_f32(self.twiddle4re, x7p16); + let t_a6_8 = vmulq_f32(self.twiddle2re, x8p15); + let t_a6_9 = vmulq_f32(self.twiddle8re, x9p14); + let t_a6_10 = vmulq_f32(self.twiddle9re, x10p13); + let t_a6_11 = vmulq_f32(self.twiddle3re, x11p12); + let t_a7_1 = vmulq_f32(self.twiddle7re, x1p22); + let t_a7_2 = vmulq_f32(self.twiddle9re, x2p21); + let t_a7_3 = vmulq_f32(self.twiddle2re, x3p20); + let t_a7_4 = vmulq_f32(self.twiddle5re, x4p19); + let t_a7_5 = vmulq_f32(self.twiddle11re, x5p18); + let t_a7_6 = vmulq_f32(self.twiddle4re, x6p17); + let t_a7_7 = vmulq_f32(self.twiddle3re, x7p16); + let t_a7_8 = vmulq_f32(self.twiddle10re, x8p15); + let t_a7_9 = vmulq_f32(self.twiddle6re, x9p14); + let t_a7_10 = vmulq_f32(self.twiddle1re, x10p13); + let t_a7_11 = vmulq_f32(self.twiddle8re, x11p12); + let t_a8_1 = vmulq_f32(self.twiddle8re, x1p22); + let t_a8_2 = vmulq_f32(self.twiddle7re, x2p21); + let t_a8_3 = vmulq_f32(self.twiddle1re, x3p20); + let t_a8_4 = vmulq_f32(self.twiddle9re, x4p19); + let t_a8_5 = vmulq_f32(self.twiddle6re, x5p18); + let t_a8_6 = vmulq_f32(self.twiddle2re, x6p17); + let t_a8_7 = vmulq_f32(self.twiddle10re, x7p16); + let t_a8_8 = vmulq_f32(self.twiddle5re, x8p15); + let t_a8_9 = vmulq_f32(self.twiddle3re, x9p14); + let t_a8_10 = vmulq_f32(self.twiddle11re, x10p13); + let t_a8_11 = vmulq_f32(self.twiddle4re, x11p12); + let t_a9_1 = vmulq_f32(self.twiddle9re, x1p22); + let t_a9_2 = vmulq_f32(self.twiddle5re, x2p21); + let t_a9_3 = vmulq_f32(self.twiddle4re, x3p20); + let t_a9_4 = vmulq_f32(self.twiddle10re, x4p19); + let t_a9_5 = vmulq_f32(self.twiddle1re, x5p18); + let t_a9_6 = vmulq_f32(self.twiddle8re, x6p17); + let t_a9_7 = vmulq_f32(self.twiddle6re, x7p16); + let t_a9_8 = vmulq_f32(self.twiddle3re, x8p15); + let t_a9_9 = vmulq_f32(self.twiddle11re, x9p14); + let t_a9_10 = vmulq_f32(self.twiddle2re, x10p13); + let t_a9_11 = vmulq_f32(self.twiddle7re, x11p12); + let t_a10_1 = vmulq_f32(self.twiddle10re, x1p22); + let t_a10_2 = vmulq_f32(self.twiddle3re, x2p21); + let t_a10_3 = vmulq_f32(self.twiddle7re, x3p20); + let t_a10_4 = vmulq_f32(self.twiddle6re, x4p19); + let t_a10_5 = vmulq_f32(self.twiddle4re, x5p18); + let t_a10_6 = vmulq_f32(self.twiddle9re, x6p17); + let t_a10_7 = vmulq_f32(self.twiddle1re, x7p16); + let t_a10_8 = vmulq_f32(self.twiddle11re, x8p15); + let t_a10_9 = vmulq_f32(self.twiddle2re, x9p14); + let t_a10_10 = vmulq_f32(self.twiddle8re, x10p13); + let t_a10_11 = vmulq_f32(self.twiddle5re, x11p12); + let t_a11_1 = vmulq_f32(self.twiddle11re, x1p22); + let t_a11_2 = vmulq_f32(self.twiddle1re, x2p21); + let t_a11_3 = vmulq_f32(self.twiddle10re, x3p20); + let t_a11_4 = vmulq_f32(self.twiddle2re, x4p19); + let t_a11_5 = vmulq_f32(self.twiddle9re, x5p18); + let t_a11_6 = vmulq_f32(self.twiddle3re, x6p17); + let t_a11_7 = vmulq_f32(self.twiddle8re, x7p16); + let t_a11_8 = vmulq_f32(self.twiddle4re, x8p15); + let t_a11_9 = vmulq_f32(self.twiddle7re, x9p14); + let t_a11_10 = vmulq_f32(self.twiddle5re, x10p13); + let t_a11_11 = vmulq_f32(self.twiddle6re, x11p12); + + let t_b1_1 = vmulq_f32(self.twiddle1im, x1m22); + let t_b1_2 = vmulq_f32(self.twiddle2im, x2m21); + let t_b1_3 = vmulq_f32(self.twiddle3im, x3m20); + let t_b1_4 = vmulq_f32(self.twiddle4im, x4m19); + let t_b1_5 = vmulq_f32(self.twiddle5im, x5m18); + let t_b1_6 = vmulq_f32(self.twiddle6im, x6m17); + let t_b1_7 = vmulq_f32(self.twiddle7im, x7m16); + let t_b1_8 = vmulq_f32(self.twiddle8im, x8m15); + let t_b1_9 = vmulq_f32(self.twiddle9im, x9m14); + let t_b1_10 = vmulq_f32(self.twiddle10im, 
x10m13); + let t_b1_11 = vmulq_f32(self.twiddle11im, x11m12); + let t_b2_1 = vmulq_f32(self.twiddle2im, x1m22); + let t_b2_2 = vmulq_f32(self.twiddle4im, x2m21); + let t_b2_3 = vmulq_f32(self.twiddle6im, x3m20); + let t_b2_4 = vmulq_f32(self.twiddle8im, x4m19); + let t_b2_5 = vmulq_f32(self.twiddle10im, x5m18); + let t_b2_6 = vmulq_f32(self.twiddle11im, x6m17); + let t_b2_7 = vmulq_f32(self.twiddle9im, x7m16); + let t_b2_8 = vmulq_f32(self.twiddle7im, x8m15); + let t_b2_9 = vmulq_f32(self.twiddle5im, x9m14); + let t_b2_10 = vmulq_f32(self.twiddle3im, x10m13); + let t_b2_11 = vmulq_f32(self.twiddle1im, x11m12); + let t_b3_1 = vmulq_f32(self.twiddle3im, x1m22); + let t_b3_2 = vmulq_f32(self.twiddle6im, x2m21); + let t_b3_3 = vmulq_f32(self.twiddle9im, x3m20); + let t_b3_4 = vmulq_f32(self.twiddle11im, x4m19); + let t_b3_5 = vmulq_f32(self.twiddle8im, x5m18); + let t_b3_6 = vmulq_f32(self.twiddle5im, x6m17); + let t_b3_7 = vmulq_f32(self.twiddle2im, x7m16); + let t_b3_8 = vmulq_f32(self.twiddle1im, x8m15); + let t_b3_9 = vmulq_f32(self.twiddle4im, x9m14); + let t_b3_10 = vmulq_f32(self.twiddle7im, x10m13); + let t_b3_11 = vmulq_f32(self.twiddle10im, x11m12); + let t_b4_1 = vmulq_f32(self.twiddle4im, x1m22); + let t_b4_2 = vmulq_f32(self.twiddle8im, x2m21); + let t_b4_3 = vmulq_f32(self.twiddle11im, x3m20); + let t_b4_4 = vmulq_f32(self.twiddle7im, x4m19); + let t_b4_5 = vmulq_f32(self.twiddle3im, x5m18); + let t_b4_6 = vmulq_f32(self.twiddle1im, x6m17); + let t_b4_7 = vmulq_f32(self.twiddle5im, x7m16); + let t_b4_8 = vmulq_f32(self.twiddle9im, x8m15); + let t_b4_9 = vmulq_f32(self.twiddle10im, x9m14); + let t_b4_10 = vmulq_f32(self.twiddle6im, x10m13); + let t_b4_11 = vmulq_f32(self.twiddle2im, x11m12); + let t_b5_1 = vmulq_f32(self.twiddle5im, x1m22); + let t_b5_2 = vmulq_f32(self.twiddle10im, x2m21); + let t_b5_3 = vmulq_f32(self.twiddle8im, x3m20); + let t_b5_4 = vmulq_f32(self.twiddle3im, x4m19); + let t_b5_5 = vmulq_f32(self.twiddle2im, x5m18); + let t_b5_6 = vmulq_f32(self.twiddle7im, x6m17); + let t_b5_7 = vmulq_f32(self.twiddle11im, x7m16); + let t_b5_8 = vmulq_f32(self.twiddle6im, x8m15); + let t_b5_9 = vmulq_f32(self.twiddle1im, x9m14); + let t_b5_10 = vmulq_f32(self.twiddle4im, x10m13); + let t_b5_11 = vmulq_f32(self.twiddle9im, x11m12); + let t_b6_1 = vmulq_f32(self.twiddle6im, x1m22); + let t_b6_2 = vmulq_f32(self.twiddle11im, x2m21); + let t_b6_3 = vmulq_f32(self.twiddle5im, x3m20); + let t_b6_4 = vmulq_f32(self.twiddle1im, x4m19); + let t_b6_5 = vmulq_f32(self.twiddle7im, x5m18); + let t_b6_6 = vmulq_f32(self.twiddle10im, x6m17); + let t_b6_7 = vmulq_f32(self.twiddle4im, x7m16); + let t_b6_8 = vmulq_f32(self.twiddle2im, x8m15); + let t_b6_9 = vmulq_f32(self.twiddle8im, x9m14); + let t_b6_10 = vmulq_f32(self.twiddle9im, x10m13); + let t_b6_11 = vmulq_f32(self.twiddle3im, x11m12); + let t_b7_1 = vmulq_f32(self.twiddle7im, x1m22); + let t_b7_2 = vmulq_f32(self.twiddle9im, x2m21); + let t_b7_3 = vmulq_f32(self.twiddle2im, x3m20); + let t_b7_4 = vmulq_f32(self.twiddle5im, x4m19); + let t_b7_5 = vmulq_f32(self.twiddle11im, x5m18); + let t_b7_6 = vmulq_f32(self.twiddle4im, x6m17); + let t_b7_7 = vmulq_f32(self.twiddle3im, x7m16); + let t_b7_8 = vmulq_f32(self.twiddle10im, x8m15); + let t_b7_9 = vmulq_f32(self.twiddle6im, x9m14); + let t_b7_10 = vmulq_f32(self.twiddle1im, x10m13); + let t_b7_11 = vmulq_f32(self.twiddle8im, x11m12); + let t_b8_1 = vmulq_f32(self.twiddle8im, x1m22); + let t_b8_2 = vmulq_f32(self.twiddle7im, x2m21); + let t_b8_3 = vmulq_f32(self.twiddle1im, x3m20); + let 
t_b8_4 = vmulq_f32(self.twiddle9im, x4m19); + let t_b8_5 = vmulq_f32(self.twiddle6im, x5m18); + let t_b8_6 = vmulq_f32(self.twiddle2im, x6m17); + let t_b8_7 = vmulq_f32(self.twiddle10im, x7m16); + let t_b8_8 = vmulq_f32(self.twiddle5im, x8m15); + let t_b8_9 = vmulq_f32(self.twiddle3im, x9m14); + let t_b8_10 = vmulq_f32(self.twiddle11im, x10m13); + let t_b8_11 = vmulq_f32(self.twiddle4im, x11m12); + let t_b9_1 = vmulq_f32(self.twiddle9im, x1m22); + let t_b9_2 = vmulq_f32(self.twiddle5im, x2m21); + let t_b9_3 = vmulq_f32(self.twiddle4im, x3m20); + let t_b9_4 = vmulq_f32(self.twiddle10im, x4m19); + let t_b9_5 = vmulq_f32(self.twiddle1im, x5m18); + let t_b9_6 = vmulq_f32(self.twiddle8im, x6m17); + let t_b9_7 = vmulq_f32(self.twiddle6im, x7m16); + let t_b9_8 = vmulq_f32(self.twiddle3im, x8m15); + let t_b9_9 = vmulq_f32(self.twiddle11im, x9m14); + let t_b9_10 = vmulq_f32(self.twiddle2im, x10m13); + let t_b9_11 = vmulq_f32(self.twiddle7im, x11m12); + let t_b10_1 = vmulq_f32(self.twiddle10im, x1m22); + let t_b10_2 = vmulq_f32(self.twiddle3im, x2m21); + let t_b10_3 = vmulq_f32(self.twiddle7im, x3m20); + let t_b10_4 = vmulq_f32(self.twiddle6im, x4m19); + let t_b10_5 = vmulq_f32(self.twiddle4im, x5m18); + let t_b10_6 = vmulq_f32(self.twiddle9im, x6m17); + let t_b10_7 = vmulq_f32(self.twiddle1im, x7m16); + let t_b10_8 = vmulq_f32(self.twiddle11im, x8m15); + let t_b10_9 = vmulq_f32(self.twiddle2im, x9m14); + let t_b10_10 = vmulq_f32(self.twiddle8im, x10m13); + let t_b10_11 = vmulq_f32(self.twiddle5im, x11m12); + let t_b11_1 = vmulq_f32(self.twiddle11im, x1m22); + let t_b11_2 = vmulq_f32(self.twiddle1im, x2m21); + let t_b11_3 = vmulq_f32(self.twiddle10im, x3m20); + let t_b11_4 = vmulq_f32(self.twiddle2im, x4m19); + let t_b11_5 = vmulq_f32(self.twiddle9im, x5m18); + let t_b11_6 = vmulq_f32(self.twiddle3im, x6m17); + let t_b11_7 = vmulq_f32(self.twiddle8im, x7m16); + let t_b11_8 = vmulq_f32(self.twiddle4im, x8m15); + let t_b11_9 = vmulq_f32(self.twiddle7im, x9m14); + let t_b11_10 = vmulq_f32(self.twiddle5im, x10m13); + let t_b11_11 = vmulq_f32(self.twiddle6im, x11m12); + + let x0 = values[0]; + let t_a1 = calc_f32!(x0 + t_a1_1 + t_a1_2 + t_a1_3 + t_a1_4 + t_a1_5 + t_a1_6 + t_a1_7 + t_a1_8 + t_a1_9 + t_a1_10 + t_a1_11); + let t_a2 = calc_f32!(x0 + t_a2_1 + t_a2_2 + t_a2_3 + t_a2_4 + t_a2_5 + t_a2_6 + t_a2_7 + t_a2_8 + t_a2_9 + t_a2_10 + t_a2_11); + let t_a3 = calc_f32!(x0 + t_a3_1 + t_a3_2 + t_a3_3 + t_a3_4 + t_a3_5 + t_a3_6 + t_a3_7 + t_a3_8 + t_a3_9 + t_a3_10 + t_a3_11); + let t_a4 = calc_f32!(x0 + t_a4_1 + t_a4_2 + t_a4_3 + t_a4_4 + t_a4_5 + t_a4_6 + t_a4_7 + t_a4_8 + t_a4_9 + t_a4_10 + t_a4_11); + let t_a5 = calc_f32!(x0 + t_a5_1 + t_a5_2 + t_a5_3 + t_a5_4 + t_a5_5 + t_a5_6 + t_a5_7 + t_a5_8 + t_a5_9 + t_a5_10 + t_a5_11); + let t_a6 = calc_f32!(x0 + t_a6_1 + t_a6_2 + t_a6_3 + t_a6_4 + t_a6_5 + t_a6_6 + t_a6_7 + t_a6_8 + t_a6_9 + t_a6_10 + t_a6_11); + let t_a7 = calc_f32!(x0 + t_a7_1 + t_a7_2 + t_a7_3 + t_a7_4 + t_a7_5 + t_a7_6 + t_a7_7 + t_a7_8 + t_a7_9 + t_a7_10 + t_a7_11); + let t_a8 = calc_f32!(x0 + t_a8_1 + t_a8_2 + t_a8_3 + t_a8_4 + t_a8_5 + t_a8_6 + t_a8_7 + t_a8_8 + t_a8_9 + t_a8_10 + t_a8_11); + let t_a9 = calc_f32!(x0 + t_a9_1 + t_a9_2 + t_a9_3 + t_a9_4 + t_a9_5 + t_a9_6 + t_a9_7 + t_a9_8 + t_a9_9 + t_a9_10 + t_a9_11); + let t_a10 = calc_f32!(x0 + t_a10_1 + t_a10_2 + t_a10_3 + t_a10_4 + t_a10_5 + t_a10_6 + t_a10_7 + t_a10_8 + t_a10_9 + t_a10_10 + t_a10_11); + let t_a11 = calc_f32!(x0 + t_a11_1 + t_a11_2 + t_a11_3 + t_a11_4 + t_a11_5 + t_a11_6 + t_a11_7 + t_a11_8 + t_a11_9 + t_a11_10 + 
t_a11_11); + + let t_b1 = calc_f32!(t_b1_1 + t_b1_2 + t_b1_3 + t_b1_4 + t_b1_5 + t_b1_6 + t_b1_7 + t_b1_8 + t_b1_9 + t_b1_10 + t_b1_11); + let t_b2 = calc_f32!(t_b2_1 + t_b2_2 + t_b2_3 + t_b2_4 + t_b2_5 - t_b2_6 - t_b2_7 - t_b2_8 - t_b2_9 - t_b2_10 - t_b2_11); + let t_b3 = calc_f32!(t_b3_1 + t_b3_2 + t_b3_3 - t_b3_4 - t_b3_5 - t_b3_6 - t_b3_7 + t_b3_8 + t_b3_9 + t_b3_10 + t_b3_11); + let t_b4 = calc_f32!(t_b4_1 + t_b4_2 - t_b4_3 - t_b4_4 - t_b4_5 + t_b4_6 + t_b4_7 + t_b4_8 - t_b4_9 - t_b4_10 - t_b4_11); + let t_b5 = calc_f32!(t_b5_1 + t_b5_2 - t_b5_3 - t_b5_4 + t_b5_5 + t_b5_6 - t_b5_7 - t_b5_8 - t_b5_9 + t_b5_10 + t_b5_11); + let t_b6 = calc_f32!(t_b6_1 - t_b6_2 - t_b6_3 + t_b6_4 + t_b6_5 - t_b6_6 - t_b6_7 + t_b6_8 + t_b6_9 - t_b6_10 - t_b6_11); + let t_b7 = calc_f32!(t_b7_1 - t_b7_2 - t_b7_3 + t_b7_4 - t_b7_5 - t_b7_6 + t_b7_7 + t_b7_8 - t_b7_9 + t_b7_10 + t_b7_11); + let t_b8 = calc_f32!(t_b8_1 - t_b8_2 + t_b8_3 + t_b8_4 - t_b8_5 + t_b8_6 + t_b8_7 - t_b8_8 + t_b8_9 + t_b8_10 - t_b8_11); + let t_b9 = calc_f32!(t_b9_1 - t_b9_2 + t_b9_3 - t_b9_4 - t_b9_5 + t_b9_6 - t_b9_7 + t_b9_8 - t_b9_9 - t_b9_10 + t_b9_11); + let t_b10 = calc_f32!(t_b10_1 - t_b10_2 + t_b10_3 - t_b10_4 + t_b10_5 - t_b10_6 + t_b10_7 + t_b10_8 - t_b10_9 + t_b10_10 - t_b10_11); + let t_b11 = calc_f32!(t_b11_1 - t_b11_2 + t_b11_3 - t_b11_4 + t_b11_5 - t_b11_6 + t_b11_7 - t_b11_8 + t_b11_9 - t_b11_10 + t_b11_11); + + let t_b1_rot = self.rotate.rotate_both(t_b1); + let t_b2_rot = self.rotate.rotate_both(t_b2); + let t_b3_rot = self.rotate.rotate_both(t_b3); + let t_b4_rot = self.rotate.rotate_both(t_b4); + let t_b5_rot = self.rotate.rotate_both(t_b5); + let t_b6_rot = self.rotate.rotate_both(t_b6); + let t_b7_rot = self.rotate.rotate_both(t_b7); + let t_b8_rot = self.rotate.rotate_both(t_b8); + let t_b9_rot = self.rotate.rotate_both(t_b9); + let t_b10_rot = self.rotate.rotate_both(t_b10); + let t_b11_rot = self.rotate.rotate_both(t_b11); + + let y0 = calc_f32!(x0 + x1p22 + x2p21 + x3p20 + x4p19 + x5p18 + x6p17 + x7p16 + x8p15 + x9p14 + x10p13 + x11p12); + let [y1, y22] = parallel_fft2_interleaved_f32(t_a1, t_b1_rot); + let [y2, y21] = parallel_fft2_interleaved_f32(t_a2, t_b2_rot); + let [y3, y20] = parallel_fft2_interleaved_f32(t_a3, t_b3_rot); + let [y4, y19] = parallel_fft2_interleaved_f32(t_a4, t_b4_rot); + let [y5, y18] = parallel_fft2_interleaved_f32(t_a5, t_b5_rot); + let [y6, y17] = parallel_fft2_interleaved_f32(t_a6, t_b6_rot); + let [y7, y16] = parallel_fft2_interleaved_f32(t_a7, t_b7_rot); + let [y8, y15] = parallel_fft2_interleaved_f32(t_a8, t_b8_rot); + let [y9, y14] = parallel_fft2_interleaved_f32(t_a9, t_b9_rot); + let [y10, y13] = parallel_fft2_interleaved_f32(t_a10, t_b10_rot); + let [y11, y12] = parallel_fft2_interleaved_f32(t_a11, t_b11_rot); + [y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15, y16, y17, y18, y19, y20, y21, y22] + } +} + +// ____ _____ __ _ _ _ _ _ + // |___ \|___ / / /_ | || | | |__ (_) |_ + // __) | |_ \ _____ | '_ \| || |_| '_ \| | __| + // / __/ ___) | |_____| | (_) |__ _| |_) | | |_ + // |_____|____/ \___/ |_| |_.__/|_|\__| + // + + pub struct NeonF64Butterfly23<T> { + direction: FftDirection, + _phantom: std::marker::PhantomData<T>, + rotate: Rotate90F64, + twiddle1re: float64x2_t, + twiddle1im: float64x2_t, + twiddle2re: float64x2_t, + twiddle2im: float64x2_t, + twiddle3re: float64x2_t, + twiddle3im: float64x2_t, + twiddle4re: float64x2_t, + twiddle4im: float64x2_t, + twiddle5re: float64x2_t, + twiddle5im: float64x2_t, + twiddle6re: float64x2_t, + twiddle6im: float64x2_t, +
twiddle7re: float64x2_t, + twiddle7im: float64x2_t, + twiddle8re: float64x2_t, + twiddle8im: float64x2_t, + twiddle9re: float64x2_t, + twiddle9im: float64x2_t, + twiddle10re: float64x2_t, + twiddle10im: float64x2_t, + twiddle11re: float64x2_t, + twiddle11im: float64x2_t, +} + +boilerplate_fft_neon_f64_butterfly!(NeonF64Butterfly23, 23, |this: &NeonF64Butterfly23<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF64Butterfly23, 23, |this: &NeonF64Butterfly23<_>| this + .direction); +impl<T: FftNum> NeonF64Butterfly23<T> { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f64::<T>(); + let rotate = Rotate90F64::new(true); + let tw1: Complex<f64> = twiddles::compute_twiddle(1, 23, direction); + let tw2: Complex<f64> = twiddles::compute_twiddle(2, 23, direction); + let tw3: Complex<f64> = twiddles::compute_twiddle(3, 23, direction); + let tw4: Complex<f64> = twiddles::compute_twiddle(4, 23, direction); + let tw5: Complex<f64> = twiddles::compute_twiddle(5, 23, direction); + let tw6: Complex<f64> = twiddles::compute_twiddle(6, 23, direction); + let tw7: Complex<f64> = twiddles::compute_twiddle(7, 23, direction); + let tw8: Complex<f64> = twiddles::compute_twiddle(8, 23, direction); + let tw9: Complex<f64> = twiddles::compute_twiddle(9, 23, direction); + let tw10: Complex<f64> = twiddles::compute_twiddle(10, 23, direction); + let tw11: Complex<f64> = twiddles::compute_twiddle(11, 23, direction); + let twiddle1re = unsafe { vmovq_n_f64(tw1.re) }; + let twiddle1im = unsafe { vmovq_n_f64(tw1.im) }; + let twiddle2re = unsafe { vmovq_n_f64(tw2.re) }; + let twiddle2im = unsafe { vmovq_n_f64(tw2.im) }; + let twiddle3re = unsafe { vmovq_n_f64(tw3.re) }; + let twiddle3im = unsafe { vmovq_n_f64(tw3.im) }; + let twiddle4re = unsafe { vmovq_n_f64(tw4.re) }; + let twiddle4im = unsafe { vmovq_n_f64(tw4.im) }; + let twiddle5re = unsafe { vmovq_n_f64(tw5.re) }; + let twiddle5im = unsafe { vmovq_n_f64(tw5.im) }; + let twiddle6re = unsafe { vmovq_n_f64(tw6.re) }; + let twiddle6im = unsafe { vmovq_n_f64(tw6.im) }; + let twiddle7re = unsafe { vmovq_n_f64(tw7.re) }; + let twiddle7im = unsafe { vmovq_n_f64(tw7.im) }; + let twiddle8re = unsafe { vmovq_n_f64(tw8.re) }; + let twiddle8im = unsafe { vmovq_n_f64(tw8.im) }; + let twiddle9re = unsafe { vmovq_n_f64(tw9.re) }; + let twiddle9im = unsafe { vmovq_n_f64(tw9.im) }; + let twiddle10re = unsafe { vmovq_n_f64(tw10.re) }; + let twiddle10im = unsafe { vmovq_n_f64(tw10.im) }; + let twiddle11re = unsafe { vmovq_n_f64(tw11.re) }; + let twiddle11im = unsafe { vmovq_n_f64(tw11.im) }; + + Self { + direction, + _phantom: std::marker::PhantomData, + rotate, + twiddle1re, + twiddle1im, + twiddle2re, + twiddle2im, + twiddle3re, + twiddle3im, + twiddle4re, + twiddle4im, + twiddle5re, + twiddle5im, + twiddle6re, + twiddle6im, + twiddle7re, + twiddle7im, + twiddle8re, + twiddle8im, + twiddle9re, + twiddle9im, + twiddle10re, + twiddle10im, + twiddle11re, + twiddle11im, + } + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_contiguous( + &self, + input: RawSlice<Complex<T>>, + output: RawSliceMut<Complex<T>>, + ) { + let values = read_complex_to_array!(input, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22}); + + let out = self.perform_fft_direct(values); + + write_complex_to_array!(out, output, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_direct(&self, values: [float64x2_t; 23]) -> [float64x2_t; 23] { + let [x1p22, x1m22] = solo_fft2_f64(values[1], values[22]); + let [x2p21, x2m21] =
solo_fft2_f64(values[2], values[21]); + let [x3p20, x3m20] = solo_fft2_f64(values[3], values[20]); + let [x4p19, x4m19] = solo_fft2_f64(values[4], values[19]); + let [x5p18, x5m18] = solo_fft2_f64(values[5], values[18]); + let [x6p17, x6m17] = solo_fft2_f64(values[6], values[17]); + let [x7p16, x7m16] = solo_fft2_f64(values[7], values[16]); + let [x8p15, x8m15] = solo_fft2_f64(values[8], values[15]); + let [x9p14, x9m14] = solo_fft2_f64(values[9], values[14]); + let [x10p13, x10m13] = solo_fft2_f64(values[10], values[13]); + let [x11p12, x11m12] = solo_fft2_f64(values[11], values[12]); + + let t_a1_1 = vmulq_f64(self.twiddle1re, x1p22); + let t_a1_2 = vmulq_f64(self.twiddle2re, x2p21); + let t_a1_3 = vmulq_f64(self.twiddle3re, x3p20); + let t_a1_4 = vmulq_f64(self.twiddle4re, x4p19); + let t_a1_5 = vmulq_f64(self.twiddle5re, x5p18); + let t_a1_6 = vmulq_f64(self.twiddle6re, x6p17); + let t_a1_7 = vmulq_f64(self.twiddle7re, x7p16); + let t_a1_8 = vmulq_f64(self.twiddle8re, x8p15); + let t_a1_9 = vmulq_f64(self.twiddle9re, x9p14); + let t_a1_10 = vmulq_f64(self.twiddle10re, x10p13); + let t_a1_11 = vmulq_f64(self.twiddle11re, x11p12); + let t_a2_1 = vmulq_f64(self.twiddle2re, x1p22); + let t_a2_2 = vmulq_f64(self.twiddle4re, x2p21); + let t_a2_3 = vmulq_f64(self.twiddle6re, x3p20); + let t_a2_4 = vmulq_f64(self.twiddle8re, x4p19); + let t_a2_5 = vmulq_f64(self.twiddle10re, x5p18); + let t_a2_6 = vmulq_f64(self.twiddle11re, x6p17); + let t_a2_7 = vmulq_f64(self.twiddle9re, x7p16); + let t_a2_8 = vmulq_f64(self.twiddle7re, x8p15); + let t_a2_9 = vmulq_f64(self.twiddle5re, x9p14); + let t_a2_10 = vmulq_f64(self.twiddle3re, x10p13); + let t_a2_11 = vmulq_f64(self.twiddle1re, x11p12); + let t_a3_1 = vmulq_f64(self.twiddle3re, x1p22); + let t_a3_2 = vmulq_f64(self.twiddle6re, x2p21); + let t_a3_3 = vmulq_f64(self.twiddle9re, x3p20); + let t_a3_4 = vmulq_f64(self.twiddle11re, x4p19); + let t_a3_5 = vmulq_f64(self.twiddle8re, x5p18); + let t_a3_6 = vmulq_f64(self.twiddle5re, x6p17); + let t_a3_7 = vmulq_f64(self.twiddle2re, x7p16); + let t_a3_8 = vmulq_f64(self.twiddle1re, x8p15); + let t_a3_9 = vmulq_f64(self.twiddle4re, x9p14); + let t_a3_10 = vmulq_f64(self.twiddle7re, x10p13); + let t_a3_11 = vmulq_f64(self.twiddle10re, x11p12); + let t_a4_1 = vmulq_f64(self.twiddle4re, x1p22); + let t_a4_2 = vmulq_f64(self.twiddle8re, x2p21); + let t_a4_3 = vmulq_f64(self.twiddle11re, x3p20); + let t_a4_4 = vmulq_f64(self.twiddle7re, x4p19); + let t_a4_5 = vmulq_f64(self.twiddle3re, x5p18); + let t_a4_6 = vmulq_f64(self.twiddle1re, x6p17); + let t_a4_7 = vmulq_f64(self.twiddle5re, x7p16); + let t_a4_8 = vmulq_f64(self.twiddle9re, x8p15); + let t_a4_9 = vmulq_f64(self.twiddle10re, x9p14); + let t_a4_10 = vmulq_f64(self.twiddle6re, x10p13); + let t_a4_11 = vmulq_f64(self.twiddle2re, x11p12); + let t_a5_1 = vmulq_f64(self.twiddle5re, x1p22); + let t_a5_2 = vmulq_f64(self.twiddle10re, x2p21); + let t_a5_3 = vmulq_f64(self.twiddle8re, x3p20); + let t_a5_4 = vmulq_f64(self.twiddle3re, x4p19); + let t_a5_5 = vmulq_f64(self.twiddle2re, x5p18); + let t_a5_6 = vmulq_f64(self.twiddle7re, x6p17); + let t_a5_7 = vmulq_f64(self.twiddle11re, x7p16); + let t_a5_8 = vmulq_f64(self.twiddle6re, x8p15); + let t_a5_9 = vmulq_f64(self.twiddle1re, x9p14); + let t_a5_10 = vmulq_f64(self.twiddle4re, x10p13); + let t_a5_11 = vmulq_f64(self.twiddle9re, x11p12); + let t_a6_1 = vmulq_f64(self.twiddle6re, x1p22); + let t_a6_2 = vmulq_f64(self.twiddle11re, x2p21); + let t_a6_3 = vmulq_f64(self.twiddle5re, x3p20); + let t_a6_4 = 
vmulq_f64(self.twiddle1re, x4p19); + let t_a6_5 = vmulq_f64(self.twiddle7re, x5p18); + let t_a6_6 = vmulq_f64(self.twiddle10re, x6p17); + let t_a6_7 = vmulq_f64(self.twiddle4re, x7p16); + let t_a6_8 = vmulq_f64(self.twiddle2re, x8p15); + let t_a6_9 = vmulq_f64(self.twiddle8re, x9p14); + let t_a6_10 = vmulq_f64(self.twiddle9re, x10p13); + let t_a6_11 = vmulq_f64(self.twiddle3re, x11p12); + let t_a7_1 = vmulq_f64(self.twiddle7re, x1p22); + let t_a7_2 = vmulq_f64(self.twiddle9re, x2p21); + let t_a7_3 = vmulq_f64(self.twiddle2re, x3p20); + let t_a7_4 = vmulq_f64(self.twiddle5re, x4p19); + let t_a7_5 = vmulq_f64(self.twiddle11re, x5p18); + let t_a7_6 = vmulq_f64(self.twiddle4re, x6p17); + let t_a7_7 = vmulq_f64(self.twiddle3re, x7p16); + let t_a7_8 = vmulq_f64(self.twiddle10re, x8p15); + let t_a7_9 = vmulq_f64(self.twiddle6re, x9p14); + let t_a7_10 = vmulq_f64(self.twiddle1re, x10p13); + let t_a7_11 = vmulq_f64(self.twiddle8re, x11p12); + let t_a8_1 = vmulq_f64(self.twiddle8re, x1p22); + let t_a8_2 = vmulq_f64(self.twiddle7re, x2p21); + let t_a8_3 = vmulq_f64(self.twiddle1re, x3p20); + let t_a8_4 = vmulq_f64(self.twiddle9re, x4p19); + let t_a8_5 = vmulq_f64(self.twiddle6re, x5p18); + let t_a8_6 = vmulq_f64(self.twiddle2re, x6p17); + let t_a8_7 = vmulq_f64(self.twiddle10re, x7p16); + let t_a8_8 = vmulq_f64(self.twiddle5re, x8p15); + let t_a8_9 = vmulq_f64(self.twiddle3re, x9p14); + let t_a8_10 = vmulq_f64(self.twiddle11re, x10p13); + let t_a8_11 = vmulq_f64(self.twiddle4re, x11p12); + let t_a9_1 = vmulq_f64(self.twiddle9re, x1p22); + let t_a9_2 = vmulq_f64(self.twiddle5re, x2p21); + let t_a9_3 = vmulq_f64(self.twiddle4re, x3p20); + let t_a9_4 = vmulq_f64(self.twiddle10re, x4p19); + let t_a9_5 = vmulq_f64(self.twiddle1re, x5p18); + let t_a9_6 = vmulq_f64(self.twiddle8re, x6p17); + let t_a9_7 = vmulq_f64(self.twiddle6re, x7p16); + let t_a9_8 = vmulq_f64(self.twiddle3re, x8p15); + let t_a9_9 = vmulq_f64(self.twiddle11re, x9p14); + let t_a9_10 = vmulq_f64(self.twiddle2re, x10p13); + let t_a9_11 = vmulq_f64(self.twiddle7re, x11p12); + let t_a10_1 = vmulq_f64(self.twiddle10re, x1p22); + let t_a10_2 = vmulq_f64(self.twiddle3re, x2p21); + let t_a10_3 = vmulq_f64(self.twiddle7re, x3p20); + let t_a10_4 = vmulq_f64(self.twiddle6re, x4p19); + let t_a10_5 = vmulq_f64(self.twiddle4re, x5p18); + let t_a10_6 = vmulq_f64(self.twiddle9re, x6p17); + let t_a10_7 = vmulq_f64(self.twiddle1re, x7p16); + let t_a10_8 = vmulq_f64(self.twiddle11re, x8p15); + let t_a10_9 = vmulq_f64(self.twiddle2re, x9p14); + let t_a10_10 = vmulq_f64(self.twiddle8re, x10p13); + let t_a10_11 = vmulq_f64(self.twiddle5re, x11p12); + let t_a11_1 = vmulq_f64(self.twiddle11re, x1p22); + let t_a11_2 = vmulq_f64(self.twiddle1re, x2p21); + let t_a11_3 = vmulq_f64(self.twiddle10re, x3p20); + let t_a11_4 = vmulq_f64(self.twiddle2re, x4p19); + let t_a11_5 = vmulq_f64(self.twiddle9re, x5p18); + let t_a11_6 = vmulq_f64(self.twiddle3re, x6p17); + let t_a11_7 = vmulq_f64(self.twiddle8re, x7p16); + let t_a11_8 = vmulq_f64(self.twiddle4re, x8p15); + let t_a11_9 = vmulq_f64(self.twiddle7re, x9p14); + let t_a11_10 = vmulq_f64(self.twiddle5re, x10p13); + let t_a11_11 = vmulq_f64(self.twiddle6re, x11p12); + + let t_b1_1 = vmulq_f64(self.twiddle1im, x1m22); + let t_b1_2 = vmulq_f64(self.twiddle2im, x2m21); + let t_b1_3 = vmulq_f64(self.twiddle3im, x3m20); + let t_b1_4 = vmulq_f64(self.twiddle4im, x4m19); + let t_b1_5 = vmulq_f64(self.twiddle5im, x5m18); + let t_b1_6 = vmulq_f64(self.twiddle6im, x6m17); + let t_b1_7 = vmulq_f64(self.twiddle7im, x7m16); + let 
t_b1_8 = vmulq_f64(self.twiddle8im, x8m15); + let t_b1_9 = vmulq_f64(self.twiddle9im, x9m14); + let t_b1_10 = vmulq_f64(self.twiddle10im, x10m13); + let t_b1_11 = vmulq_f64(self.twiddle11im, x11m12); + let t_b2_1 = vmulq_f64(self.twiddle2im, x1m22); + let t_b2_2 = vmulq_f64(self.twiddle4im, x2m21); + let t_b2_3 = vmulq_f64(self.twiddle6im, x3m20); + let t_b2_4 = vmulq_f64(self.twiddle8im, x4m19); + let t_b2_5 = vmulq_f64(self.twiddle10im, x5m18); + let t_b2_6 = vmulq_f64(self.twiddle11im, x6m17); + let t_b2_7 = vmulq_f64(self.twiddle9im, x7m16); + let t_b2_8 = vmulq_f64(self.twiddle7im, x8m15); + let t_b2_9 = vmulq_f64(self.twiddle5im, x9m14); + let t_b2_10 = vmulq_f64(self.twiddle3im, x10m13); + let t_b2_11 = vmulq_f64(self.twiddle1im, x11m12); + let t_b3_1 = vmulq_f64(self.twiddle3im, x1m22); + let t_b3_2 = vmulq_f64(self.twiddle6im, x2m21); + let t_b3_3 = vmulq_f64(self.twiddle9im, x3m20); + let t_b3_4 = vmulq_f64(self.twiddle11im, x4m19); + let t_b3_5 = vmulq_f64(self.twiddle8im, x5m18); + let t_b3_6 = vmulq_f64(self.twiddle5im, x6m17); + let t_b3_7 = vmulq_f64(self.twiddle2im, x7m16); + let t_b3_8 = vmulq_f64(self.twiddle1im, x8m15); + let t_b3_9 = vmulq_f64(self.twiddle4im, x9m14); + let t_b3_10 = vmulq_f64(self.twiddle7im, x10m13); + let t_b3_11 = vmulq_f64(self.twiddle10im, x11m12); + let t_b4_1 = vmulq_f64(self.twiddle4im, x1m22); + let t_b4_2 = vmulq_f64(self.twiddle8im, x2m21); + let t_b4_3 = vmulq_f64(self.twiddle11im, x3m20); + let t_b4_4 = vmulq_f64(self.twiddle7im, x4m19); + let t_b4_5 = vmulq_f64(self.twiddle3im, x5m18); + let t_b4_6 = vmulq_f64(self.twiddle1im, x6m17); + let t_b4_7 = vmulq_f64(self.twiddle5im, x7m16); + let t_b4_8 = vmulq_f64(self.twiddle9im, x8m15); + let t_b4_9 = vmulq_f64(self.twiddle10im, x9m14); + let t_b4_10 = vmulq_f64(self.twiddle6im, x10m13); + let t_b4_11 = vmulq_f64(self.twiddle2im, x11m12); + let t_b5_1 = vmulq_f64(self.twiddle5im, x1m22); + let t_b5_2 = vmulq_f64(self.twiddle10im, x2m21); + let t_b5_3 = vmulq_f64(self.twiddle8im, x3m20); + let t_b5_4 = vmulq_f64(self.twiddle3im, x4m19); + let t_b5_5 = vmulq_f64(self.twiddle2im, x5m18); + let t_b5_6 = vmulq_f64(self.twiddle7im, x6m17); + let t_b5_7 = vmulq_f64(self.twiddle11im, x7m16); + let t_b5_8 = vmulq_f64(self.twiddle6im, x8m15); + let t_b5_9 = vmulq_f64(self.twiddle1im, x9m14); + let t_b5_10 = vmulq_f64(self.twiddle4im, x10m13); + let t_b5_11 = vmulq_f64(self.twiddle9im, x11m12); + let t_b6_1 = vmulq_f64(self.twiddle6im, x1m22); + let t_b6_2 = vmulq_f64(self.twiddle11im, x2m21); + let t_b6_3 = vmulq_f64(self.twiddle5im, x3m20); + let t_b6_4 = vmulq_f64(self.twiddle1im, x4m19); + let t_b6_5 = vmulq_f64(self.twiddle7im, x5m18); + let t_b6_6 = vmulq_f64(self.twiddle10im, x6m17); + let t_b6_7 = vmulq_f64(self.twiddle4im, x7m16); + let t_b6_8 = vmulq_f64(self.twiddle2im, x8m15); + let t_b6_9 = vmulq_f64(self.twiddle8im, x9m14); + let t_b6_10 = vmulq_f64(self.twiddle9im, x10m13); + let t_b6_11 = vmulq_f64(self.twiddle3im, x11m12); + let t_b7_1 = vmulq_f64(self.twiddle7im, x1m22); + let t_b7_2 = vmulq_f64(self.twiddle9im, x2m21); + let t_b7_3 = vmulq_f64(self.twiddle2im, x3m20); + let t_b7_4 = vmulq_f64(self.twiddle5im, x4m19); + let t_b7_5 = vmulq_f64(self.twiddle11im, x5m18); + let t_b7_6 = vmulq_f64(self.twiddle4im, x6m17); + let t_b7_7 = vmulq_f64(self.twiddle3im, x7m16); + let t_b7_8 = vmulq_f64(self.twiddle10im, x8m15); + let t_b7_9 = vmulq_f64(self.twiddle6im, x9m14); + let t_b7_10 = vmulq_f64(self.twiddle1im, x10m13); + let t_b7_11 = vmulq_f64(self.twiddle8im, x11m12); + let t_b8_1 = 
vmulq_f64(self.twiddle8im, x1m22); + let t_b8_2 = vmulq_f64(self.twiddle7im, x2m21); + let t_b8_3 = vmulq_f64(self.twiddle1im, x3m20); + let t_b8_4 = vmulq_f64(self.twiddle9im, x4m19); + let t_b8_5 = vmulq_f64(self.twiddle6im, x5m18); + let t_b8_6 = vmulq_f64(self.twiddle2im, x6m17); + let t_b8_7 = vmulq_f64(self.twiddle10im, x7m16); + let t_b8_8 = vmulq_f64(self.twiddle5im, x8m15); + let t_b8_9 = vmulq_f64(self.twiddle3im, x9m14); + let t_b8_10 = vmulq_f64(self.twiddle11im, x10m13); + let t_b8_11 = vmulq_f64(self.twiddle4im, x11m12); + let t_b9_1 = vmulq_f64(self.twiddle9im, x1m22); + let t_b9_2 = vmulq_f64(self.twiddle5im, x2m21); + let t_b9_3 = vmulq_f64(self.twiddle4im, x3m20); + let t_b9_4 = vmulq_f64(self.twiddle10im, x4m19); + let t_b9_5 = vmulq_f64(self.twiddle1im, x5m18); + let t_b9_6 = vmulq_f64(self.twiddle8im, x6m17); + let t_b9_7 = vmulq_f64(self.twiddle6im, x7m16); + let t_b9_8 = vmulq_f64(self.twiddle3im, x8m15); + let t_b9_9 = vmulq_f64(self.twiddle11im, x9m14); + let t_b9_10 = vmulq_f64(self.twiddle2im, x10m13); + let t_b9_11 = vmulq_f64(self.twiddle7im, x11m12); + let t_b10_1 = vmulq_f64(self.twiddle10im, x1m22); + let t_b10_2 = vmulq_f64(self.twiddle3im, x2m21); + let t_b10_3 = vmulq_f64(self.twiddle7im, x3m20); + let t_b10_4 = vmulq_f64(self.twiddle6im, x4m19); + let t_b10_5 = vmulq_f64(self.twiddle4im, x5m18); + let t_b10_6 = vmulq_f64(self.twiddle9im, x6m17); + let t_b10_7 = vmulq_f64(self.twiddle1im, x7m16); + let t_b10_8 = vmulq_f64(self.twiddle11im, x8m15); + let t_b10_9 = vmulq_f64(self.twiddle2im, x9m14); + let t_b10_10 = vmulq_f64(self.twiddle8im, x10m13); + let t_b10_11 = vmulq_f64(self.twiddle5im, x11m12); + let t_b11_1 = vmulq_f64(self.twiddle11im, x1m22); + let t_b11_2 = vmulq_f64(self.twiddle1im, x2m21); + let t_b11_3 = vmulq_f64(self.twiddle10im, x3m20); + let t_b11_4 = vmulq_f64(self.twiddle2im, x4m19); + let t_b11_5 = vmulq_f64(self.twiddle9im, x5m18); + let t_b11_6 = vmulq_f64(self.twiddle3im, x6m17); + let t_b11_7 = vmulq_f64(self.twiddle8im, x7m16); + let t_b11_8 = vmulq_f64(self.twiddle4im, x8m15); + let t_b11_9 = vmulq_f64(self.twiddle7im, x9m14); + let t_b11_10 = vmulq_f64(self.twiddle5im, x10m13); + let t_b11_11 = vmulq_f64(self.twiddle6im, x11m12); + + let x0 = values[0]; + let t_a1 = calc_f64!(x0 + t_a1_1 + t_a1_2 + t_a1_3 + t_a1_4 + t_a1_5 + t_a1_6 + t_a1_7 + t_a1_8 + t_a1_9 + t_a1_10 + t_a1_11); + let t_a2 = calc_f64!(x0 + t_a2_1 + t_a2_2 + t_a2_3 + t_a2_4 + t_a2_5 + t_a2_6 + t_a2_7 + t_a2_8 + t_a2_9 + t_a2_10 + t_a2_11); + let t_a3 = calc_f64!(x0 + t_a3_1 + t_a3_2 + t_a3_3 + t_a3_4 + t_a3_5 + t_a3_6 + t_a3_7 + t_a3_8 + t_a3_9 + t_a3_10 + t_a3_11); + let t_a4 = calc_f64!(x0 + t_a4_1 + t_a4_2 + t_a4_3 + t_a4_4 + t_a4_5 + t_a4_6 + t_a4_7 + t_a4_8 + t_a4_9 + t_a4_10 + t_a4_11); + let t_a5 = calc_f64!(x0 + t_a5_1 + t_a5_2 + t_a5_3 + t_a5_4 + t_a5_5 + t_a5_6 + t_a5_7 + t_a5_8 + t_a5_9 + t_a5_10 + t_a5_11); + let t_a6 = calc_f64!(x0 + t_a6_1 + t_a6_2 + t_a6_3 + t_a6_4 + t_a6_5 + t_a6_6 + t_a6_7 + t_a6_8 + t_a6_9 + t_a6_10 + t_a6_11); + let t_a7 = calc_f64!(x0 + t_a7_1 + t_a7_2 + t_a7_3 + t_a7_4 + t_a7_5 + t_a7_6 + t_a7_7 + t_a7_8 + t_a7_9 + t_a7_10 + t_a7_11); + let t_a8 = calc_f64!(x0 + t_a8_1 + t_a8_2 + t_a8_3 + t_a8_4 + t_a8_5 + t_a8_6 + t_a8_7 + t_a8_8 + t_a8_9 + t_a8_10 + t_a8_11); + let t_a9 = calc_f64!(x0 + t_a9_1 + t_a9_2 + t_a9_3 + t_a9_4 + t_a9_5 + t_a9_6 + t_a9_7 + t_a9_8 + t_a9_9 + t_a9_10 + t_a9_11); + let t_a10 = calc_f64!(x0 + t_a10_1 + t_a10_2 + t_a10_3 + t_a10_4 + t_a10_5 + t_a10_6 + t_a10_7 + t_a10_8 + t_a10_9 + t_a10_10 + 
t_a10_11); + let t_a11 = calc_f64!(x0 + t_a11_1 + t_a11_2 + t_a11_3 + t_a11_4 + t_a11_5 + t_a11_6 + t_a11_7 + t_a11_8 + t_a11_9 + t_a11_10 + t_a11_11); + + let t_b1 = calc_f64!(t_b1_1 + t_b1_2 + t_b1_3 + t_b1_4 + t_b1_5 + t_b1_6 + t_b1_7 + t_b1_8 + t_b1_9 + t_b1_10 + t_b1_11); + let t_b2 = calc_f64!(t_b2_1 + t_b2_2 + t_b2_3 + t_b2_4 + t_b2_5 - t_b2_6 - t_b2_7 - t_b2_8 - t_b2_9 - t_b2_10 - t_b2_11); + let t_b3 = calc_f64!(t_b3_1 + t_b3_2 + t_b3_3 - t_b3_4 - t_b3_5 - t_b3_6 - t_b3_7 + t_b3_8 + t_b3_9 + t_b3_10 + t_b3_11); + let t_b4 = calc_f64!(t_b4_1 + t_b4_2 - t_b4_3 - t_b4_4 - t_b4_5 + t_b4_6 + t_b4_7 + t_b4_8 - t_b4_9 - t_b4_10 - t_b4_11); + let t_b5 = calc_f64!(t_b5_1 + t_b5_2 - t_b5_3 - t_b5_4 + t_b5_5 + t_b5_6 - t_b5_7 - t_b5_8 - t_b5_9 + t_b5_10 + t_b5_11); + let t_b6 = calc_f64!(t_b6_1 - t_b6_2 - t_b6_3 + t_b6_4 + t_b6_5 - t_b6_6 - t_b6_7 + t_b6_8 + t_b6_9 - t_b6_10 - t_b6_11); + let t_b7 = calc_f64!(t_b7_1 - t_b7_2 - t_b7_3 + t_b7_4 - t_b7_5 - t_b7_6 + t_b7_7 + t_b7_8 - t_b7_9 + t_b7_10 + t_b7_11); + let t_b8 = calc_f64!(t_b8_1 - t_b8_2 + t_b8_3 + t_b8_4 - t_b8_5 + t_b8_6 + t_b8_7 - t_b8_8 + t_b8_9 + t_b8_10 - t_b8_11); + let t_b9 = calc_f64!(t_b9_1 - t_b9_2 + t_b9_3 - t_b9_4 - t_b9_5 + t_b9_6 - t_b9_7 + t_b9_8 - t_b9_9 - t_b9_10 + t_b9_11); + let t_b10 = calc_f64!(t_b10_1 - t_b10_2 + t_b10_3 - t_b10_4 + t_b10_5 - t_b10_6 + t_b10_7 + t_b10_8 - t_b10_9 + t_b10_10 - t_b10_11); + let t_b11 = calc_f64!(t_b11_1 - t_b11_2 + t_b11_3 - t_b11_4 + t_b11_5 - t_b11_6 + t_b11_7 - t_b11_8 + t_b11_9 - t_b11_10 + t_b11_11); + + let t_b1_rot = self.rotate.rotate(t_b1); + let t_b2_rot = self.rotate.rotate(t_b2); + let t_b3_rot = self.rotate.rotate(t_b3); + let t_b4_rot = self.rotate.rotate(t_b4); + let t_b5_rot = self.rotate.rotate(t_b5); + let t_b6_rot = self.rotate.rotate(t_b6); + let t_b7_rot = self.rotate.rotate(t_b7); + let t_b8_rot = self.rotate.rotate(t_b8); + let t_b9_rot = self.rotate.rotate(t_b9); + let t_b10_rot = self.rotate.rotate(t_b10); + let t_b11_rot = self.rotate.rotate(t_b11); + + let y0 = calc_f64!(x0 + x1p22 + x2p21 + x3p20 + x4p19 + x5p18 + x6p17 + x7p16 + x8p15 + x9p14 + x10p13 + x11p12); + let [y1, y22] = solo_fft2_f64(t_a1, t_b1_rot); + let [y2, y21] = solo_fft2_f64(t_a2, t_b2_rot); + let [y3, y20] = solo_fft2_f64(t_a3, t_b3_rot); + let [y4, y19] = solo_fft2_f64(t_a4, t_b4_rot); + let [y5, y18] = solo_fft2_f64(t_a5, t_b5_rot); + let [y6, y17] = solo_fft2_f64(t_a6, t_b6_rot); + let [y7, y16] = solo_fft2_f64(t_a7, t_b7_rot); + let [y8, y15] = solo_fft2_f64(t_a8, t_b8_rot); + let [y9, y14] = solo_fft2_f64(t_a9, t_b9_rot); + let [y10, y13] = solo_fft2_f64(t_a10, t_b10_rot); + let [y11, y12] = solo_fft2_f64(t_a11, t_b11_rot); + [y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15, y16, y17, y18, y19, y20, y21, y22] + } +} + +// ____ ___ _________ _ _ _ + // |___ \ / _ \ |___ /___ \| |__ (_) |_ + // __) | (_) | _____ |_ \ __) | '_ \| | __| + // / __/ \__, | |_____| ___) / __/| |_) | | |_ + // |_____| /_/ |____/_____|_.__/|_|\__| + // + + pub struct NeonF32Butterfly29<T> { + direction: FftDirection, + _phantom: std::marker::PhantomData<T>, + rotate: Rotate90F32, + twiddle1re: float32x4_t, + twiddle1im: float32x4_t, + twiddle2re: float32x4_t, + twiddle2im: float32x4_t, + twiddle3re: float32x4_t, + twiddle3im: float32x4_t, + twiddle4re: float32x4_t, + twiddle4im: float32x4_t, + twiddle5re: float32x4_t, + twiddle5im: float32x4_t, + twiddle6re: float32x4_t, + twiddle6im: float32x4_t, + twiddle7re: float32x4_t, + twiddle7im: float32x4_t, + twiddle8re: float32x4_t, + twiddle8im:
float32x4_t, + twiddle9re: float32x4_t, + twiddle9im: float32x4_t, + twiddle10re: float32x4_t, + twiddle10im: float32x4_t, + twiddle11re: float32x4_t, + twiddle11im: float32x4_t, + twiddle12re: float32x4_t, + twiddle12im: float32x4_t, + twiddle13re: float32x4_t, + twiddle13im: float32x4_t, + twiddle14re: float32x4_t, + twiddle14im: float32x4_t, +} + +boilerplate_fft_neon_f32_butterfly!(NeonF32Butterfly29, 29, |this: &NeonF32Butterfly29<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF32Butterfly29, 29, |this: &NeonF32Butterfly29<_>| this + .direction); +impl<T: FftNum> NeonF32Butterfly29<T> { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f32::<T>(); + let rotate = Rotate90F32::new(true); + let tw1: Complex<f32> = twiddles::compute_twiddle(1, 29, direction); + let tw2: Complex<f32> = twiddles::compute_twiddle(2, 29, direction); + let tw3: Complex<f32> = twiddles::compute_twiddle(3, 29, direction); + let tw4: Complex<f32> = twiddles::compute_twiddle(4, 29, direction); + let tw5: Complex<f32> = twiddles::compute_twiddle(5, 29, direction); + let tw6: Complex<f32> = twiddles::compute_twiddle(6, 29, direction); + let tw7: Complex<f32> = twiddles::compute_twiddle(7, 29, direction); + let tw8: Complex<f32> = twiddles::compute_twiddle(8, 29, direction); + let tw9: Complex<f32> = twiddles::compute_twiddle(9, 29, direction); + let tw10: Complex<f32> = twiddles::compute_twiddle(10, 29, direction); + let tw11: Complex<f32> = twiddles::compute_twiddle(11, 29, direction); + let tw12: Complex<f32> = twiddles::compute_twiddle(12, 29, direction); + let tw13: Complex<f32> = twiddles::compute_twiddle(13, 29, direction); + let tw14: Complex<f32> = twiddles::compute_twiddle(14, 29, direction); + let twiddle1re = unsafe { vmovq_n_f32(tw1.re) }; + let twiddle1im = unsafe { vmovq_n_f32(tw1.im) }; + let twiddle2re = unsafe { vmovq_n_f32(tw2.re) }; + let twiddle2im = unsafe { vmovq_n_f32(tw2.im) }; + let twiddle3re = unsafe { vmovq_n_f32(tw3.re) }; + let twiddle3im = unsafe { vmovq_n_f32(tw3.im) }; + let twiddle4re = unsafe { vmovq_n_f32(tw4.re) }; + let twiddle4im = unsafe { vmovq_n_f32(tw4.im) }; + let twiddle5re = unsafe { vmovq_n_f32(tw5.re) }; + let twiddle5im = unsafe { vmovq_n_f32(tw5.im) }; + let twiddle6re = unsafe { vmovq_n_f32(tw6.re) }; + let twiddle6im = unsafe { vmovq_n_f32(tw6.im) }; + let twiddle7re = unsafe { vmovq_n_f32(tw7.re) }; + let twiddle7im = unsafe { vmovq_n_f32(tw7.im) }; + let twiddle8re = unsafe { vmovq_n_f32(tw8.re) }; + let twiddle8im = unsafe { vmovq_n_f32(tw8.im) }; + let twiddle9re = unsafe { vmovq_n_f32(tw9.re) }; + let twiddle9im = unsafe { vmovq_n_f32(tw9.im) }; + let twiddle10re = unsafe { vmovq_n_f32(tw10.re) }; + let twiddle10im = unsafe { vmovq_n_f32(tw10.im) }; + let twiddle11re = unsafe { vmovq_n_f32(tw11.re) }; + let twiddle11im = unsafe { vmovq_n_f32(tw11.im) }; + let twiddle12re = unsafe { vmovq_n_f32(tw12.re) }; + let twiddle12im = unsafe { vmovq_n_f32(tw12.im) }; + let twiddle13re = unsafe { vmovq_n_f32(tw13.re) }; + let twiddle13im = unsafe { vmovq_n_f32(tw13.im) }; + let twiddle14re = unsafe { vmovq_n_f32(tw14.re) }; + let twiddle14im = unsafe { vmovq_n_f32(tw14.im) }; + + Self { + direction, + _phantom: std::marker::PhantomData, + rotate, + twiddle1re, + twiddle1im, + twiddle2re, + twiddle2im, + twiddle3re, + twiddle3im, + twiddle4re, + twiddle4im, + twiddle5re, + twiddle5im, + twiddle6re, + twiddle6im, + twiddle7re, + twiddle7im, + twiddle8re, + twiddle8im, + twiddle9re, + twiddle9im, + twiddle10re, + twiddle10im, + twiddle11re, + twiddle11im, + twiddle12re, + twiddle12im, + twiddle13re, +
twiddle13im, + twiddle14re, + twiddle14im, + } + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_contiguous( + &self, + input: RawSlice<Complex<T>>, + output: RawSliceMut<Complex<T>>, + ) { + let values = read_partial1_complex_to_array!(input, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28}); + + let out = self.perform_parallel_fft_direct(values); + + write_partial_lo_complex_to_array!(out, output, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28}); + + } + + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_contiguous( + &self, + input: RawSlice<Complex<T>>, + output: RawSliceMut<Complex<T>>, + ) { + let input_packed = read_complex_to_array!(input, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56}); + + let values = [ + extract_lo_hi_f32(input_packed[0], input_packed[14]), + extract_hi_lo_f32(input_packed[0], input_packed[15]), + extract_lo_hi_f32(input_packed[1], input_packed[15]), + extract_hi_lo_f32(input_packed[1], input_packed[16]), + extract_lo_hi_f32(input_packed[2], input_packed[16]), + extract_hi_lo_f32(input_packed[2], input_packed[17]), + extract_lo_hi_f32(input_packed[3], input_packed[17]), + extract_hi_lo_f32(input_packed[3], input_packed[18]), + extract_lo_hi_f32(input_packed[4], input_packed[18]), + extract_hi_lo_f32(input_packed[4], input_packed[19]), + extract_lo_hi_f32(input_packed[5], input_packed[19]), + extract_hi_lo_f32(input_packed[5], input_packed[20]), + extract_lo_hi_f32(input_packed[6], input_packed[20]), + extract_hi_lo_f32(input_packed[6], input_packed[21]), + extract_lo_hi_f32(input_packed[7], input_packed[21]), + extract_hi_lo_f32(input_packed[7], input_packed[22]), + extract_lo_hi_f32(input_packed[8], input_packed[22]), + extract_hi_lo_f32(input_packed[8], input_packed[23]), + extract_lo_hi_f32(input_packed[9], input_packed[23]), + extract_hi_lo_f32(input_packed[9], input_packed[24]), + extract_lo_hi_f32(input_packed[10], input_packed[24]), + extract_hi_lo_f32(input_packed[10], input_packed[25]), + extract_lo_hi_f32(input_packed[11], input_packed[25]), + extract_hi_lo_f32(input_packed[11], input_packed[26]), + extract_lo_hi_f32(input_packed[12], input_packed[26]), + extract_hi_lo_f32(input_packed[12], input_packed[27]), + extract_lo_hi_f32(input_packed[13], input_packed[27]), + extract_hi_lo_f32(input_packed[13], input_packed[28]), + extract_lo_hi_f32(input_packed[14], input_packed[28]), + ]; + + let out = self.perform_parallel_fft_direct(values); + + let out_packed = [ + extract_lo_lo_f32(out[0], out[1]), + extract_lo_lo_f32(out[2], out[3]), + extract_lo_lo_f32(out[4], out[5]), + extract_lo_lo_f32(out[6], out[7]), + extract_lo_lo_f32(out[8], out[9]), + extract_lo_lo_f32(out[10], out[11]), + extract_lo_lo_f32(out[12], out[13]), + extract_lo_lo_f32(out[14], out[15]), + extract_lo_lo_f32(out[16], out[17]), + extract_lo_lo_f32(out[18], out[19]), + extract_lo_lo_f32(out[20], out[21]), + extract_lo_lo_f32(out[22], out[23]), + extract_lo_lo_f32(out[24], out[25]), + extract_lo_lo_f32(out[26], out[27]), + extract_lo_hi_f32(out[28], out[0]), + extract_hi_hi_f32(out[1], out[2]), + extract_hi_hi_f32(out[3], out[4]), + extract_hi_hi_f32(out[5], out[6]), + extract_hi_hi_f32(out[7], out[8]), + extract_hi_hi_f32(out[9], out[10]), + extract_hi_hi_f32(out[11], out[12]), + extract_hi_hi_f32(out[13], out[14]), + extract_hi_hi_f32(out[15], out[16]), + extract_hi_hi_f32(out[17], out[18]), + extract_hi_hi_f32(out[19],
out[20]), + extract_hi_hi_f32(out[21], out[22]), + extract_hi_hi_f32(out[23], out[24]), + extract_hi_hi_f32(out[25], out[26]), + extract_hi_hi_f32(out[27], out[28]), + ]; + + write_complex_to_array_strided!(out_packed, output, 2, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [float32x4_t; 29]) -> [float32x4_t; 29] { + let [x1p28, x1m28] = parallel_fft2_interleaved_f32(values[1], values[28]); + let [x2p27, x2m27] = parallel_fft2_interleaved_f32(values[2], values[27]); + let [x3p26, x3m26] = parallel_fft2_interleaved_f32(values[3], values[26]); + let [x4p25, x4m25] = parallel_fft2_interleaved_f32(values[4], values[25]); + let [x5p24, x5m24] = parallel_fft2_interleaved_f32(values[5], values[24]); + let [x6p23, x6m23] = parallel_fft2_interleaved_f32(values[6], values[23]); + let [x7p22, x7m22] = parallel_fft2_interleaved_f32(values[7], values[22]); + let [x8p21, x8m21] = parallel_fft2_interleaved_f32(values[8], values[21]); + let [x9p20, x9m20] = parallel_fft2_interleaved_f32(values[9], values[20]); + let [x10p19, x10m19] = parallel_fft2_interleaved_f32(values[10], values[19]); + let [x11p18, x11m18] = parallel_fft2_interleaved_f32(values[11], values[18]); + let [x12p17, x12m17] = parallel_fft2_interleaved_f32(values[12], values[17]); + let [x13p16, x13m16] = parallel_fft2_interleaved_f32(values[13], values[16]); + let [x14p15, x14m15] = parallel_fft2_interleaved_f32(values[14], values[15]); + + let t_a1_1 = vmulq_f32(self.twiddle1re, x1p28); + let t_a1_2 = vmulq_f32(self.twiddle2re, x2p27); + let t_a1_3 = vmulq_f32(self.twiddle3re, x3p26); + let t_a1_4 = vmulq_f32(self.twiddle4re, x4p25); + let t_a1_5 = vmulq_f32(self.twiddle5re, x5p24); + let t_a1_6 = vmulq_f32(self.twiddle6re, x6p23); + let t_a1_7 = vmulq_f32(self.twiddle7re, x7p22); + let t_a1_8 = vmulq_f32(self.twiddle8re, x8p21); + let t_a1_9 = vmulq_f32(self.twiddle9re, x9p20); + let t_a1_10 = vmulq_f32(self.twiddle10re, x10p19); + let t_a1_11 = vmulq_f32(self.twiddle11re, x11p18); + let t_a1_12 = vmulq_f32(self.twiddle12re, x12p17); + let t_a1_13 = vmulq_f32(self.twiddle13re, x13p16); + let t_a1_14 = vmulq_f32(self.twiddle14re, x14p15); + let t_a2_1 = vmulq_f32(self.twiddle2re, x1p28); + let t_a2_2 = vmulq_f32(self.twiddle4re, x2p27); + let t_a2_3 = vmulq_f32(self.twiddle6re, x3p26); + let t_a2_4 = vmulq_f32(self.twiddle8re, x4p25); + let t_a2_5 = vmulq_f32(self.twiddle10re, x5p24); + let t_a2_6 = vmulq_f32(self.twiddle12re, x6p23); + let t_a2_7 = vmulq_f32(self.twiddle14re, x7p22); + let t_a2_8 = vmulq_f32(self.twiddle13re, x8p21); + let t_a2_9 = vmulq_f32(self.twiddle11re, x9p20); + let t_a2_10 = vmulq_f32(self.twiddle9re, x10p19); + let t_a2_11 = vmulq_f32(self.twiddle7re, x11p18); + let t_a2_12 = vmulq_f32(self.twiddle5re, x12p17); + let t_a2_13 = vmulq_f32(self.twiddle3re, x13p16); + let t_a2_14 = vmulq_f32(self.twiddle1re, x14p15); + let t_a3_1 = vmulq_f32(self.twiddle3re, x1p28); + let t_a3_2 = vmulq_f32(self.twiddle6re, x2p27); + let t_a3_3 = vmulq_f32(self.twiddle9re, x3p26); + let t_a3_4 = vmulq_f32(self.twiddle12re, x4p25); + let t_a3_5 = vmulq_f32(self.twiddle14re, x5p24); + let t_a3_6 = vmulq_f32(self.twiddle11re, x6p23); + let t_a3_7 = vmulq_f32(self.twiddle8re, x7p22); + let t_a3_8 = vmulq_f32(self.twiddle5re, x8p21); + let t_a3_9 = vmulq_f32(self.twiddle2re, x9p20); + let t_a3_10 = vmulq_f32(self.twiddle1re, x10p19); + let t_a3_11 = vmulq_f32(self.twiddle4re, x11p18); 
+ let t_a3_12 = vmulq_f32(self.twiddle7re, x12p17); + let t_a3_13 = vmulq_f32(self.twiddle10re, x13p16); + let t_a3_14 = vmulq_f32(self.twiddle13re, x14p15); + let t_a4_1 = vmulq_f32(self.twiddle4re, x1p28); + let t_a4_2 = vmulq_f32(self.twiddle8re, x2p27); + let t_a4_3 = vmulq_f32(self.twiddle12re, x3p26); + let t_a4_4 = vmulq_f32(self.twiddle13re, x4p25); + let t_a4_5 = vmulq_f32(self.twiddle9re, x5p24); + let t_a4_6 = vmulq_f32(self.twiddle5re, x6p23); + let t_a4_7 = vmulq_f32(self.twiddle1re, x7p22); + let t_a4_8 = vmulq_f32(self.twiddle3re, x8p21); + let t_a4_9 = vmulq_f32(self.twiddle7re, x9p20); + let t_a4_10 = vmulq_f32(self.twiddle11re, x10p19); + let t_a4_11 = vmulq_f32(self.twiddle14re, x11p18); + let t_a4_12 = vmulq_f32(self.twiddle10re, x12p17); + let t_a4_13 = vmulq_f32(self.twiddle6re, x13p16); + let t_a4_14 = vmulq_f32(self.twiddle2re, x14p15); + let t_a5_1 = vmulq_f32(self.twiddle5re, x1p28); + let t_a5_2 = vmulq_f32(self.twiddle10re, x2p27); + let t_a5_3 = vmulq_f32(self.twiddle14re, x3p26); + let t_a5_4 = vmulq_f32(self.twiddle9re, x4p25); + let t_a5_5 = vmulq_f32(self.twiddle4re, x5p24); + let t_a5_6 = vmulq_f32(self.twiddle1re, x6p23); + let t_a5_7 = vmulq_f32(self.twiddle6re, x7p22); + let t_a5_8 = vmulq_f32(self.twiddle11re, x8p21); + let t_a5_9 = vmulq_f32(self.twiddle13re, x9p20); + let t_a5_10 = vmulq_f32(self.twiddle8re, x10p19); + let t_a5_11 = vmulq_f32(self.twiddle3re, x11p18); + let t_a5_12 = vmulq_f32(self.twiddle2re, x12p17); + let t_a5_13 = vmulq_f32(self.twiddle7re, x13p16); + let t_a5_14 = vmulq_f32(self.twiddle12re, x14p15); + let t_a6_1 = vmulq_f32(self.twiddle6re, x1p28); + let t_a6_2 = vmulq_f32(self.twiddle12re, x2p27); + let t_a6_3 = vmulq_f32(self.twiddle11re, x3p26); + let t_a6_4 = vmulq_f32(self.twiddle5re, x4p25); + let t_a6_5 = vmulq_f32(self.twiddle1re, x5p24); + let t_a6_6 = vmulq_f32(self.twiddle7re, x6p23); + let t_a6_7 = vmulq_f32(self.twiddle13re, x7p22); + let t_a6_8 = vmulq_f32(self.twiddle10re, x8p21); + let t_a6_9 = vmulq_f32(self.twiddle4re, x9p20); + let t_a6_10 = vmulq_f32(self.twiddle2re, x10p19); + let t_a6_11 = vmulq_f32(self.twiddle8re, x11p18); + let t_a6_12 = vmulq_f32(self.twiddle14re, x12p17); + let t_a6_13 = vmulq_f32(self.twiddle9re, x13p16); + let t_a6_14 = vmulq_f32(self.twiddle3re, x14p15); + let t_a7_1 = vmulq_f32(self.twiddle7re, x1p28); + let t_a7_2 = vmulq_f32(self.twiddle14re, x2p27); + let t_a7_3 = vmulq_f32(self.twiddle8re, x3p26); + let t_a7_4 = vmulq_f32(self.twiddle1re, x4p25); + let t_a7_5 = vmulq_f32(self.twiddle6re, x5p24); + let t_a7_6 = vmulq_f32(self.twiddle13re, x6p23); + let t_a7_7 = vmulq_f32(self.twiddle9re, x7p22); + let t_a7_8 = vmulq_f32(self.twiddle2re, x8p21); + let t_a7_9 = vmulq_f32(self.twiddle5re, x9p20); + let t_a7_10 = vmulq_f32(self.twiddle12re, x10p19); + let t_a7_11 = vmulq_f32(self.twiddle10re, x11p18); + let t_a7_12 = vmulq_f32(self.twiddle3re, x12p17); + let t_a7_13 = vmulq_f32(self.twiddle4re, x13p16); + let t_a7_14 = vmulq_f32(self.twiddle11re, x14p15); + let t_a8_1 = vmulq_f32(self.twiddle8re, x1p28); + let t_a8_2 = vmulq_f32(self.twiddle13re, x2p27); + let t_a8_3 = vmulq_f32(self.twiddle5re, x3p26); + let t_a8_4 = vmulq_f32(self.twiddle3re, x4p25); + let t_a8_5 = vmulq_f32(self.twiddle11re, x5p24); + let t_a8_6 = vmulq_f32(self.twiddle10re, x6p23); + let t_a8_7 = vmulq_f32(self.twiddle2re, x7p22); + let t_a8_8 = vmulq_f32(self.twiddle6re, x8p21); + let t_a8_9 = vmulq_f32(self.twiddle14re, x9p20); + let t_a8_10 = vmulq_f32(self.twiddle7re, x10p19); + let t_a8_11 = 
vmulq_f32(self.twiddle1re, x11p18); + let t_a8_12 = vmulq_f32(self.twiddle9re, x12p17); + let t_a8_13 = vmulq_f32(self.twiddle12re, x13p16); + let t_a8_14 = vmulq_f32(self.twiddle4re, x14p15); + let t_a9_1 = vmulq_f32(self.twiddle9re, x1p28); + let t_a9_2 = vmulq_f32(self.twiddle11re, x2p27); + let t_a9_3 = vmulq_f32(self.twiddle2re, x3p26); + let t_a9_4 = vmulq_f32(self.twiddle7re, x4p25); + let t_a9_5 = vmulq_f32(self.twiddle13re, x5p24); + let t_a9_6 = vmulq_f32(self.twiddle4re, x6p23); + let t_a9_7 = vmulq_f32(self.twiddle5re, x7p22); + let t_a9_8 = vmulq_f32(self.twiddle14re, x8p21); + let t_a9_9 = vmulq_f32(self.twiddle6re, x9p20); + let t_a9_10 = vmulq_f32(self.twiddle3re, x10p19); + let t_a9_11 = vmulq_f32(self.twiddle12re, x11p18); + let t_a9_12 = vmulq_f32(self.twiddle8re, x12p17); + let t_a9_13 = vmulq_f32(self.twiddle1re, x13p16); + let t_a9_14 = vmulq_f32(self.twiddle10re, x14p15); + let t_a10_1 = vmulq_f32(self.twiddle10re, x1p28); + let t_a10_2 = vmulq_f32(self.twiddle9re, x2p27); + let t_a10_3 = vmulq_f32(self.twiddle1re, x3p26); + let t_a10_4 = vmulq_f32(self.twiddle11re, x4p25); + let t_a10_5 = vmulq_f32(self.twiddle8re, x5p24); + let t_a10_6 = vmulq_f32(self.twiddle2re, x6p23); + let t_a10_7 = vmulq_f32(self.twiddle12re, x7p22); + let t_a10_8 = vmulq_f32(self.twiddle7re, x8p21); + let t_a10_9 = vmulq_f32(self.twiddle3re, x9p20); + let t_a10_10 = vmulq_f32(self.twiddle13re, x10p19); + let t_a10_11 = vmulq_f32(self.twiddle6re, x11p18); + let t_a10_12 = vmulq_f32(self.twiddle4re, x12p17); + let t_a10_13 = vmulq_f32(self.twiddle14re, x13p16); + let t_a10_14 = vmulq_f32(self.twiddle5re, x14p15); + let t_a11_1 = vmulq_f32(self.twiddle11re, x1p28); + let t_a11_2 = vmulq_f32(self.twiddle7re, x2p27); + let t_a11_3 = vmulq_f32(self.twiddle4re, x3p26); + let t_a11_4 = vmulq_f32(self.twiddle14re, x4p25); + let t_a11_5 = vmulq_f32(self.twiddle3re, x5p24); + let t_a11_6 = vmulq_f32(self.twiddle8re, x6p23); + let t_a11_7 = vmulq_f32(self.twiddle10re, x7p22); + let t_a11_8 = vmulq_f32(self.twiddle1re, x8p21); + let t_a11_9 = vmulq_f32(self.twiddle12re, x9p20); + let t_a11_10 = vmulq_f32(self.twiddle6re, x10p19); + let t_a11_11 = vmulq_f32(self.twiddle5re, x11p18); + let t_a11_12 = vmulq_f32(self.twiddle13re, x12p17); + let t_a11_13 = vmulq_f32(self.twiddle2re, x13p16); + let t_a11_14 = vmulq_f32(self.twiddle9re, x14p15); + let t_a12_1 = vmulq_f32(self.twiddle12re, x1p28); + let t_a12_2 = vmulq_f32(self.twiddle5re, x2p27); + let t_a12_3 = vmulq_f32(self.twiddle7re, x3p26); + let t_a12_4 = vmulq_f32(self.twiddle10re, x4p25); + let t_a12_5 = vmulq_f32(self.twiddle2re, x5p24); + let t_a12_6 = vmulq_f32(self.twiddle14re, x6p23); + let t_a12_7 = vmulq_f32(self.twiddle3re, x7p22); + let t_a12_8 = vmulq_f32(self.twiddle9re, x8p21); + let t_a12_9 = vmulq_f32(self.twiddle8re, x9p20); + let t_a12_10 = vmulq_f32(self.twiddle4re, x10p19); + let t_a12_11 = vmulq_f32(self.twiddle13re, x11p18); + let t_a12_12 = vmulq_f32(self.twiddle1re, x12p17); + let t_a12_13 = vmulq_f32(self.twiddle11re, x13p16); + let t_a12_14 = vmulq_f32(self.twiddle6re, x14p15); + let t_a13_1 = vmulq_f32(self.twiddle13re, x1p28); + let t_a13_2 = vmulq_f32(self.twiddle3re, x2p27); + let t_a13_3 = vmulq_f32(self.twiddle10re, x3p26); + let t_a13_4 = vmulq_f32(self.twiddle6re, x4p25); + let t_a13_5 = vmulq_f32(self.twiddle7re, x5p24); + let t_a13_6 = vmulq_f32(self.twiddle9re, x6p23); + let t_a13_7 = vmulq_f32(self.twiddle4re, x7p22); + let t_a13_8 = vmulq_f32(self.twiddle12re, x8p21); + let t_a13_9 = vmulq_f32(self.twiddle1re, 
x9p20); + let t_a13_10 = vmulq_f32(self.twiddle14re, x10p19); + let t_a13_11 = vmulq_f32(self.twiddle2re, x11p18); + let t_a13_12 = vmulq_f32(self.twiddle11re, x12p17); + let t_a13_13 = vmulq_f32(self.twiddle5re, x13p16); + let t_a13_14 = vmulq_f32(self.twiddle8re, x14p15); + let t_a14_1 = vmulq_f32(self.twiddle14re, x1p28); + let t_a14_2 = vmulq_f32(self.twiddle1re, x2p27); + let t_a14_3 = vmulq_f32(self.twiddle13re, x3p26); + let t_a14_4 = vmulq_f32(self.twiddle2re, x4p25); + let t_a14_5 = vmulq_f32(self.twiddle12re, x5p24); + let t_a14_6 = vmulq_f32(self.twiddle3re, x6p23); + let t_a14_7 = vmulq_f32(self.twiddle11re, x7p22); + let t_a14_8 = vmulq_f32(self.twiddle4re, x8p21); + let t_a14_9 = vmulq_f32(self.twiddle10re, x9p20); + let t_a14_10 = vmulq_f32(self.twiddle5re, x10p19); + let t_a14_11 = vmulq_f32(self.twiddle9re, x11p18); + let t_a14_12 = vmulq_f32(self.twiddle6re, x12p17); + let t_a14_13 = vmulq_f32(self.twiddle8re, x13p16); + let t_a14_14 = vmulq_f32(self.twiddle7re, x14p15); + + let t_b1_1 = vmulq_f32(self.twiddle1im, x1m28); + let t_b1_2 = vmulq_f32(self.twiddle2im, x2m27); + let t_b1_3 = vmulq_f32(self.twiddle3im, x3m26); + let t_b1_4 = vmulq_f32(self.twiddle4im, x4m25); + let t_b1_5 = vmulq_f32(self.twiddle5im, x5m24); + let t_b1_6 = vmulq_f32(self.twiddle6im, x6m23); + let t_b1_7 = vmulq_f32(self.twiddle7im, x7m22); + let t_b1_8 = vmulq_f32(self.twiddle8im, x8m21); + let t_b1_9 = vmulq_f32(self.twiddle9im, x9m20); + let t_b1_10 = vmulq_f32(self.twiddle10im, x10m19); + let t_b1_11 = vmulq_f32(self.twiddle11im, x11m18); + let t_b1_12 = vmulq_f32(self.twiddle12im, x12m17); + let t_b1_13 = vmulq_f32(self.twiddle13im, x13m16); + let t_b1_14 = vmulq_f32(self.twiddle14im, x14m15); + let t_b2_1 = vmulq_f32(self.twiddle2im, x1m28); + let t_b2_2 = vmulq_f32(self.twiddle4im, x2m27); + let t_b2_3 = vmulq_f32(self.twiddle6im, x3m26); + let t_b2_4 = vmulq_f32(self.twiddle8im, x4m25); + let t_b2_5 = vmulq_f32(self.twiddle10im, x5m24); + let t_b2_6 = vmulq_f32(self.twiddle12im, x6m23); + let t_b2_7 = vmulq_f32(self.twiddle14im, x7m22); + let t_b2_8 = vmulq_f32(self.twiddle13im, x8m21); + let t_b2_9 = vmulq_f32(self.twiddle11im, x9m20); + let t_b2_10 = vmulq_f32(self.twiddle9im, x10m19); + let t_b2_11 = vmulq_f32(self.twiddle7im, x11m18); + let t_b2_12 = vmulq_f32(self.twiddle5im, x12m17); + let t_b2_13 = vmulq_f32(self.twiddle3im, x13m16); + let t_b2_14 = vmulq_f32(self.twiddle1im, x14m15); + let t_b3_1 = vmulq_f32(self.twiddle3im, x1m28); + let t_b3_2 = vmulq_f32(self.twiddle6im, x2m27); + let t_b3_3 = vmulq_f32(self.twiddle9im, x3m26); + let t_b3_4 = vmulq_f32(self.twiddle12im, x4m25); + let t_b3_5 = vmulq_f32(self.twiddle14im, x5m24); + let t_b3_6 = vmulq_f32(self.twiddle11im, x6m23); + let t_b3_7 = vmulq_f32(self.twiddle8im, x7m22); + let t_b3_8 = vmulq_f32(self.twiddle5im, x8m21); + let t_b3_9 = vmulq_f32(self.twiddle2im, x9m20); + let t_b3_10 = vmulq_f32(self.twiddle1im, x10m19); + let t_b3_11 = vmulq_f32(self.twiddle4im, x11m18); + let t_b3_12 = vmulq_f32(self.twiddle7im, x12m17); + let t_b3_13 = vmulq_f32(self.twiddle10im, x13m16); + let t_b3_14 = vmulq_f32(self.twiddle13im, x14m15); + let t_b4_1 = vmulq_f32(self.twiddle4im, x1m28); + let t_b4_2 = vmulq_f32(self.twiddle8im, x2m27); + let t_b4_3 = vmulq_f32(self.twiddle12im, x3m26); + let t_b4_4 = vmulq_f32(self.twiddle13im, x4m25); + let t_b4_5 = vmulq_f32(self.twiddle9im, x5m24); + let t_b4_6 = vmulq_f32(self.twiddle5im, x6m23); + let t_b4_7 = vmulq_f32(self.twiddle1im, x7m22); + let t_b4_8 = vmulq_f32(self.twiddle3im, x8m21); + 
let t_b4_9 = vmulq_f32(self.twiddle7im, x9m20); + let t_b4_10 = vmulq_f32(self.twiddle11im, x10m19); + let t_b4_11 = vmulq_f32(self.twiddle14im, x11m18); + let t_b4_12 = vmulq_f32(self.twiddle10im, x12m17); + let t_b4_13 = vmulq_f32(self.twiddle6im, x13m16); + let t_b4_14 = vmulq_f32(self.twiddle2im, x14m15); + let t_b5_1 = vmulq_f32(self.twiddle5im, x1m28); + let t_b5_2 = vmulq_f32(self.twiddle10im, x2m27); + let t_b5_3 = vmulq_f32(self.twiddle14im, x3m26); + let t_b5_4 = vmulq_f32(self.twiddle9im, x4m25); + let t_b5_5 = vmulq_f32(self.twiddle4im, x5m24); + let t_b5_6 = vmulq_f32(self.twiddle1im, x6m23); + let t_b5_7 = vmulq_f32(self.twiddle6im, x7m22); + let t_b5_8 = vmulq_f32(self.twiddle11im, x8m21); + let t_b5_9 = vmulq_f32(self.twiddle13im, x9m20); + let t_b5_10 = vmulq_f32(self.twiddle8im, x10m19); + let t_b5_11 = vmulq_f32(self.twiddle3im, x11m18); + let t_b5_12 = vmulq_f32(self.twiddle2im, x12m17); + let t_b5_13 = vmulq_f32(self.twiddle7im, x13m16); + let t_b5_14 = vmulq_f32(self.twiddle12im, x14m15); + let t_b6_1 = vmulq_f32(self.twiddle6im, x1m28); + let t_b6_2 = vmulq_f32(self.twiddle12im, x2m27); + let t_b6_3 = vmulq_f32(self.twiddle11im, x3m26); + let t_b6_4 = vmulq_f32(self.twiddle5im, x4m25); + let t_b6_5 = vmulq_f32(self.twiddle1im, x5m24); + let t_b6_6 = vmulq_f32(self.twiddle7im, x6m23); + let t_b6_7 = vmulq_f32(self.twiddle13im, x7m22); + let t_b6_8 = vmulq_f32(self.twiddle10im, x8m21); + let t_b6_9 = vmulq_f32(self.twiddle4im, x9m20); + let t_b6_10 = vmulq_f32(self.twiddle2im, x10m19); + let t_b6_11 = vmulq_f32(self.twiddle8im, x11m18); + let t_b6_12 = vmulq_f32(self.twiddle14im, x12m17); + let t_b6_13 = vmulq_f32(self.twiddle9im, x13m16); + let t_b6_14 = vmulq_f32(self.twiddle3im, x14m15); + let t_b7_1 = vmulq_f32(self.twiddle7im, x1m28); + let t_b7_2 = vmulq_f32(self.twiddle14im, x2m27); + let t_b7_3 = vmulq_f32(self.twiddle8im, x3m26); + let t_b7_4 = vmulq_f32(self.twiddle1im, x4m25); + let t_b7_5 = vmulq_f32(self.twiddle6im, x5m24); + let t_b7_6 = vmulq_f32(self.twiddle13im, x6m23); + let t_b7_7 = vmulq_f32(self.twiddle9im, x7m22); + let t_b7_8 = vmulq_f32(self.twiddle2im, x8m21); + let t_b7_9 = vmulq_f32(self.twiddle5im, x9m20); + let t_b7_10 = vmulq_f32(self.twiddle12im, x10m19); + let t_b7_11 = vmulq_f32(self.twiddle10im, x11m18); + let t_b7_12 = vmulq_f32(self.twiddle3im, x12m17); + let t_b7_13 = vmulq_f32(self.twiddle4im, x13m16); + let t_b7_14 = vmulq_f32(self.twiddle11im, x14m15); + let t_b8_1 = vmulq_f32(self.twiddle8im, x1m28); + let t_b8_2 = vmulq_f32(self.twiddle13im, x2m27); + let t_b8_3 = vmulq_f32(self.twiddle5im, x3m26); + let t_b8_4 = vmulq_f32(self.twiddle3im, x4m25); + let t_b8_5 = vmulq_f32(self.twiddle11im, x5m24); + let t_b8_6 = vmulq_f32(self.twiddle10im, x6m23); + let t_b8_7 = vmulq_f32(self.twiddle2im, x7m22); + let t_b8_8 = vmulq_f32(self.twiddle6im, x8m21); + let t_b8_9 = vmulq_f32(self.twiddle14im, x9m20); + let t_b8_10 = vmulq_f32(self.twiddle7im, x10m19); + let t_b8_11 = vmulq_f32(self.twiddle1im, x11m18); + let t_b8_12 = vmulq_f32(self.twiddle9im, x12m17); + let t_b8_13 = vmulq_f32(self.twiddle12im, x13m16); + let t_b8_14 = vmulq_f32(self.twiddle4im, x14m15); + let t_b9_1 = vmulq_f32(self.twiddle9im, x1m28); + let t_b9_2 = vmulq_f32(self.twiddle11im, x2m27); + let t_b9_3 = vmulq_f32(self.twiddle2im, x3m26); + let t_b9_4 = vmulq_f32(self.twiddle7im, x4m25); + let t_b9_5 = vmulq_f32(self.twiddle13im, x5m24); + let t_b9_6 = vmulq_f32(self.twiddle4im, x6m23); + let t_b9_7 = vmulq_f32(self.twiddle5im, x7m22); + let t_b9_8 = 
vmulq_f32(self.twiddle14im, x8m21); + let t_b9_9 = vmulq_f32(self.twiddle6im, x9m20); + let t_b9_10 = vmulq_f32(self.twiddle3im, x10m19); + let t_b9_11 = vmulq_f32(self.twiddle12im, x11m18); + let t_b9_12 = vmulq_f32(self.twiddle8im, x12m17); + let t_b9_13 = vmulq_f32(self.twiddle1im, x13m16); + let t_b9_14 = vmulq_f32(self.twiddle10im, x14m15); + let t_b10_1 = vmulq_f32(self.twiddle10im, x1m28); + let t_b10_2 = vmulq_f32(self.twiddle9im, x2m27); + let t_b10_3 = vmulq_f32(self.twiddle1im, x3m26); + let t_b10_4 = vmulq_f32(self.twiddle11im, x4m25); + let t_b10_5 = vmulq_f32(self.twiddle8im, x5m24); + let t_b10_6 = vmulq_f32(self.twiddle2im, x6m23); + let t_b10_7 = vmulq_f32(self.twiddle12im, x7m22); + let t_b10_8 = vmulq_f32(self.twiddle7im, x8m21); + let t_b10_9 = vmulq_f32(self.twiddle3im, x9m20); + let t_b10_10 = vmulq_f32(self.twiddle13im, x10m19); + let t_b10_11 = vmulq_f32(self.twiddle6im, x11m18); + let t_b10_12 = vmulq_f32(self.twiddle4im, x12m17); + let t_b10_13 = vmulq_f32(self.twiddle14im, x13m16); + let t_b10_14 = vmulq_f32(self.twiddle5im, x14m15); + let t_b11_1 = vmulq_f32(self.twiddle11im, x1m28); + let t_b11_2 = vmulq_f32(self.twiddle7im, x2m27); + let t_b11_3 = vmulq_f32(self.twiddle4im, x3m26); + let t_b11_4 = vmulq_f32(self.twiddle14im, x4m25); + let t_b11_5 = vmulq_f32(self.twiddle3im, x5m24); + let t_b11_6 = vmulq_f32(self.twiddle8im, x6m23); + let t_b11_7 = vmulq_f32(self.twiddle10im, x7m22); + let t_b11_8 = vmulq_f32(self.twiddle1im, x8m21); + let t_b11_9 = vmulq_f32(self.twiddle12im, x9m20); + let t_b11_10 = vmulq_f32(self.twiddle6im, x10m19); + let t_b11_11 = vmulq_f32(self.twiddle5im, x11m18); + let t_b11_12 = vmulq_f32(self.twiddle13im, x12m17); + let t_b11_13 = vmulq_f32(self.twiddle2im, x13m16); + let t_b11_14 = vmulq_f32(self.twiddle9im, x14m15); + let t_b12_1 = vmulq_f32(self.twiddle12im, x1m28); + let t_b12_2 = vmulq_f32(self.twiddle5im, x2m27); + let t_b12_3 = vmulq_f32(self.twiddle7im, x3m26); + let t_b12_4 = vmulq_f32(self.twiddle10im, x4m25); + let t_b12_5 = vmulq_f32(self.twiddle2im, x5m24); + let t_b12_6 = vmulq_f32(self.twiddle14im, x6m23); + let t_b12_7 = vmulq_f32(self.twiddle3im, x7m22); + let t_b12_8 = vmulq_f32(self.twiddle9im, x8m21); + let t_b12_9 = vmulq_f32(self.twiddle8im, x9m20); + let t_b12_10 = vmulq_f32(self.twiddle4im, x10m19); + let t_b12_11 = vmulq_f32(self.twiddle13im, x11m18); + let t_b12_12 = vmulq_f32(self.twiddle1im, x12m17); + let t_b12_13 = vmulq_f32(self.twiddle11im, x13m16); + let t_b12_14 = vmulq_f32(self.twiddle6im, x14m15); + let t_b13_1 = vmulq_f32(self.twiddle13im, x1m28); + let t_b13_2 = vmulq_f32(self.twiddle3im, x2m27); + let t_b13_3 = vmulq_f32(self.twiddle10im, x3m26); + let t_b13_4 = vmulq_f32(self.twiddle6im, x4m25); + let t_b13_5 = vmulq_f32(self.twiddle7im, x5m24); + let t_b13_6 = vmulq_f32(self.twiddle9im, x6m23); + let t_b13_7 = vmulq_f32(self.twiddle4im, x7m22); + let t_b13_8 = vmulq_f32(self.twiddle12im, x8m21); + let t_b13_9 = vmulq_f32(self.twiddle1im, x9m20); + let t_b13_10 = vmulq_f32(self.twiddle14im, x10m19); + let t_b13_11 = vmulq_f32(self.twiddle2im, x11m18); + let t_b13_12 = vmulq_f32(self.twiddle11im, x12m17); + let t_b13_13 = vmulq_f32(self.twiddle5im, x13m16); + let t_b13_14 = vmulq_f32(self.twiddle8im, x14m15); + let t_b14_1 = vmulq_f32(self.twiddle14im, x1m28); + let t_b14_2 = vmulq_f32(self.twiddle1im, x2m27); + let t_b14_3 = vmulq_f32(self.twiddle13im, x3m26); + let t_b14_4 = vmulq_f32(self.twiddle2im, x4m25); + let t_b14_5 = vmulq_f32(self.twiddle12im, x5m24); + let t_b14_6 = 
vmulq_f32(self.twiddle3im, x6m23); + let t_b14_7 = vmulq_f32(self.twiddle11im, x7m22); + let t_b14_8 = vmulq_f32(self.twiddle4im, x8m21); + let t_b14_9 = vmulq_f32(self.twiddle10im, x9m20); + let t_b14_10 = vmulq_f32(self.twiddle5im, x10m19); + let t_b14_11 = vmulq_f32(self.twiddle9im, x11m18); + let t_b14_12 = vmulq_f32(self.twiddle6im, x12m17); + let t_b14_13 = vmulq_f32(self.twiddle8im, x13m16); + let t_b14_14 = vmulq_f32(self.twiddle7im, x14m15); + + let x0 = values[0]; + let t_a1 = calc_f32!(x0 + t_a1_1 + t_a1_2 + t_a1_3 + t_a1_4 + t_a1_5 + t_a1_6 + t_a1_7 + t_a1_8 + t_a1_9 + t_a1_10 + t_a1_11 + t_a1_12 + t_a1_13 + t_a1_14); + let t_a2 = calc_f32!(x0 + t_a2_1 + t_a2_2 + t_a2_3 + t_a2_4 + t_a2_5 + t_a2_6 + t_a2_7 + t_a2_8 + t_a2_9 + t_a2_10 + t_a2_11 + t_a2_12 + t_a2_13 + t_a2_14); + let t_a3 = calc_f32!(x0 + t_a3_1 + t_a3_2 + t_a3_3 + t_a3_4 + t_a3_5 + t_a3_6 + t_a3_7 + t_a3_8 + t_a3_9 + t_a3_10 + t_a3_11 + t_a3_12 + t_a3_13 + t_a3_14); + let t_a4 = calc_f32!(x0 + t_a4_1 + t_a4_2 + t_a4_3 + t_a4_4 + t_a4_5 + t_a4_6 + t_a4_7 + t_a4_8 + t_a4_9 + t_a4_10 + t_a4_11 + t_a4_12 + t_a4_13 + t_a4_14); + let t_a5 = calc_f32!(x0 + t_a5_1 + t_a5_2 + t_a5_3 + t_a5_4 + t_a5_5 + t_a5_6 + t_a5_7 + t_a5_8 + t_a5_9 + t_a5_10 + t_a5_11 + t_a5_12 + t_a5_13 + t_a5_14); + let t_a6 = calc_f32!(x0 + t_a6_1 + t_a6_2 + t_a6_3 + t_a6_4 + t_a6_5 + t_a6_6 + t_a6_7 + t_a6_8 + t_a6_9 + t_a6_10 + t_a6_11 + t_a6_12 + t_a6_13 + t_a6_14); + let t_a7 = calc_f32!(x0 + t_a7_1 + t_a7_2 + t_a7_3 + t_a7_4 + t_a7_5 + t_a7_6 + t_a7_7 + t_a7_8 + t_a7_9 + t_a7_10 + t_a7_11 + t_a7_12 + t_a7_13 + t_a7_14); + let t_a8 = calc_f32!(x0 + t_a8_1 + t_a8_2 + t_a8_3 + t_a8_4 + t_a8_5 + t_a8_6 + t_a8_7 + t_a8_8 + t_a8_9 + t_a8_10 + t_a8_11 + t_a8_12 + t_a8_13 + t_a8_14); + let t_a9 = calc_f32!(x0 + t_a9_1 + t_a9_2 + t_a9_3 + t_a9_4 + t_a9_5 + t_a9_6 + t_a9_7 + t_a9_8 + t_a9_9 + t_a9_10 + t_a9_11 + t_a9_12 + t_a9_13 + t_a9_14); + let t_a10 = calc_f32!(x0 + t_a10_1 + t_a10_2 + t_a10_3 + t_a10_4 + t_a10_5 + t_a10_6 + t_a10_7 + t_a10_8 + t_a10_9 + t_a10_10 + t_a10_11 + t_a10_12 + t_a10_13 + t_a10_14); + let t_a11 = calc_f32!(x0 + t_a11_1 + t_a11_2 + t_a11_3 + t_a11_4 + t_a11_5 + t_a11_6 + t_a11_7 + t_a11_8 + t_a11_9 + t_a11_10 + t_a11_11 + t_a11_12 + t_a11_13 + t_a11_14); + let t_a12 = calc_f32!(x0 + t_a12_1 + t_a12_2 + t_a12_3 + t_a12_4 + t_a12_5 + t_a12_6 + t_a12_7 + t_a12_8 + t_a12_9 + t_a12_10 + t_a12_11 + t_a12_12 + t_a12_13 + t_a12_14); + let t_a13 = calc_f32!(x0 + t_a13_1 + t_a13_2 + t_a13_3 + t_a13_4 + t_a13_5 + t_a13_6 + t_a13_7 + t_a13_8 + t_a13_9 + t_a13_10 + t_a13_11 + t_a13_12 + t_a13_13 + t_a13_14); + let t_a14 = calc_f32!(x0 + t_a14_1 + t_a14_2 + t_a14_3 + t_a14_4 + t_a14_5 + t_a14_6 + t_a14_7 + t_a14_8 + t_a14_9 + t_a14_10 + t_a14_11 + t_a14_12 + t_a14_13 + t_a14_14); + + let t_b1 = calc_f32!(t_b1_1 + t_b1_2 + t_b1_3 + t_b1_4 + t_b1_5 + t_b1_6 + t_b1_7 + t_b1_8 + t_b1_9 + t_b1_10 + t_b1_11 + t_b1_12 + t_b1_13 + t_b1_14); + let t_b2 = calc_f32!(t_b2_1 + t_b2_2 + t_b2_3 + t_b2_4 + t_b2_5 + t_b2_6 + t_b2_7 - t_b2_8 - t_b2_9 - t_b2_10 - t_b2_11 - t_b2_12 - t_b2_13 - t_b2_14); + let t_b3 = calc_f32!(t_b3_1 + t_b3_2 + t_b3_3 + t_b3_4 - t_b3_5 - t_b3_6 - t_b3_7 - t_b3_8 - t_b3_9 + t_b3_10 + t_b3_11 + t_b3_12 + t_b3_13 + t_b3_14); + let t_b4 = calc_f32!(t_b4_1 + t_b4_2 + t_b4_3 - t_b4_4 - t_b4_5 - t_b4_6 - t_b4_7 + t_b4_8 + t_b4_9 + t_b4_10 - t_b4_11 - t_b4_12 - t_b4_13 - t_b4_14); + let t_b5 = calc_f32!(t_b5_1 + t_b5_2 - t_b5_3 - t_b5_4 - t_b5_5 + t_b5_6 + t_b5_7 + t_b5_8 - t_b5_9 - t_b5_10 - t_b5_11 + t_b5_12 + t_b5_13 + t_b5_14); + 
let t_b6 = calc_f32!(t_b6_1 + t_b6_2 - t_b6_3 - t_b6_4 + t_b6_5 + t_b6_6 + t_b6_7 - t_b6_8 - t_b6_9 + t_b6_10 + t_b6_11 + t_b6_12 - t_b6_13 - t_b6_14); + let t_b7 = calc_f32!(t_b7_1 + t_b7_2 - t_b7_3 - t_b7_4 + t_b7_5 + t_b7_6 - t_b7_7 - t_b7_8 + t_b7_9 + t_b7_10 - t_b7_11 - t_b7_12 + t_b7_13 + t_b7_14); + let t_b8 = calc_f32!(t_b8_1 - t_b8_2 - t_b8_3 + t_b8_4 + t_b8_5 - t_b8_6 - t_b8_7 + t_b8_8 + t_b8_9 - t_b8_10 + t_b8_11 + t_b8_12 - t_b8_13 - t_b8_14); + let t_b9 = calc_f32!(t_b9_1 - t_b9_2 - t_b9_3 + t_b9_4 - t_b9_5 - t_b9_6 + t_b9_7 + t_b9_8 - t_b9_9 + t_b9_10 + t_b9_11 - t_b9_12 + t_b9_13 + t_b9_14); + let t_b10 = calc_f32!(t_b10_1 - t_b10_2 + t_b10_3 + t_b10_4 - t_b10_5 + t_b10_6 + t_b10_7 - t_b10_8 + t_b10_9 + t_b10_10 - t_b10_11 + t_b10_12 + t_b10_13 - t_b10_14); + let t_b11 = calc_f32!(t_b11_1 - t_b11_2 + t_b11_3 - t_b11_4 - t_b11_5 + t_b11_6 - t_b11_7 + t_b11_8 + t_b11_9 - t_b11_10 + t_b11_11 - t_b11_12 - t_b11_13 + t_b11_14); + let t_b12 = calc_f32!(t_b12_1 - t_b12_2 + t_b12_3 - t_b12_4 + t_b12_5 + t_b12_6 - t_b12_7 + t_b12_8 - t_b12_9 + t_b12_10 - t_b12_11 - t_b12_12 + t_b12_13 - t_b12_14); + let t_b13 = calc_f32!(t_b13_1 - t_b13_2 + t_b13_3 - t_b13_4 + t_b13_5 - t_b13_6 + t_b13_7 - t_b13_8 + t_b13_9 + t_b13_10 - t_b13_11 + t_b13_12 - t_b13_13 + t_b13_14); + let t_b14 = calc_f32!(t_b14_1 - t_b14_2 + t_b14_3 - t_b14_4 + t_b14_5 - t_b14_6 + t_b14_7 - t_b14_8 + t_b14_9 - t_b14_10 + t_b14_11 - t_b14_12 + t_b14_13 - t_b14_14); + + let t_b1_rot = self.rotate.rotate_both(t_b1); + let t_b2_rot = self.rotate.rotate_both(t_b2); + let t_b3_rot = self.rotate.rotate_both(t_b3); + let t_b4_rot = self.rotate.rotate_both(t_b4); + let t_b5_rot = self.rotate.rotate_both(t_b5); + let t_b6_rot = self.rotate.rotate_both(t_b6); + let t_b7_rot = self.rotate.rotate_both(t_b7); + let t_b8_rot = self.rotate.rotate_both(t_b8); + let t_b9_rot = self.rotate.rotate_both(t_b9); + let t_b10_rot = self.rotate.rotate_both(t_b10); + let t_b11_rot = self.rotate.rotate_both(t_b11); + let t_b12_rot = self.rotate.rotate_both(t_b12); + let t_b13_rot = self.rotate.rotate_both(t_b13); + let t_b14_rot = self.rotate.rotate_both(t_b14); + + let y0 = calc_f32!(x0 + x1p28 + x2p27 + x3p26 + x4p25 + x5p24 + x6p23 + x7p22 + x8p21 + x9p20 + x10p19 + x11p18 + x12p17 + x13p16 + x14p15); + let [y1, y28] = parallel_fft2_interleaved_f32(t_a1, t_b1_rot); + let [y2, y27] = parallel_fft2_interleaved_f32(t_a2, t_b2_rot); + let [y3, y26] = parallel_fft2_interleaved_f32(t_a3, t_b3_rot); + let [y4, y25] = parallel_fft2_interleaved_f32(t_a4, t_b4_rot); + let [y5, y24] = parallel_fft2_interleaved_f32(t_a5, t_b5_rot); + let [y6, y23] = parallel_fft2_interleaved_f32(t_a6, t_b6_rot); + let [y7, y22] = parallel_fft2_interleaved_f32(t_a7, t_b7_rot); + let [y8, y21] = parallel_fft2_interleaved_f32(t_a8, t_b8_rot); + let [y9, y20] = parallel_fft2_interleaved_f32(t_a9, t_b9_rot); + let [y10, y19] = parallel_fft2_interleaved_f32(t_a10, t_b10_rot); + let [y11, y18] = parallel_fft2_interleaved_f32(t_a11, t_b11_rot); + let [y12, y17] = parallel_fft2_interleaved_f32(t_a12, t_b12_rot); + let [y13, y16] = parallel_fft2_interleaved_f32(t_a13, t_b13_rot); + let [y14, y15] = parallel_fft2_interleaved_f32(t_a14, t_b14_rot); + [y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15, y16, y17, y18, y19, y20, y21, y22, y23, y24, y25, y26, y27, y28] + } +} + +// ____ ___ __ _ _ _ _ _ +// |___ \ / _ \ / /_ | || | | |__ (_) |_ +// __) | (_) | _____ | '_ \| || |_| '_ \| | __| +// / __/ \__, | |_____| | (_) |__ _| |_) | | |_ +// |_____| /_/ \___/ |_| 
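+ // How the 29-point butterflies work (applies to the 32-bit version above and the
+ // 64-bit version below): the input is split into pairs (x[j], x[29 - j]) for
+ // j = 1..=14, and a 2-point FFT of each pair produces a sum term x{j}p{29-j} and a
+ // difference term x{j}m{29-j}.
+ //
+ // Only twiddle factors 1..=14 are stored. For an index m = (j * k) % 29 that falls
+ // above 14, the factor equals the conjugate of twiddle[29 - m]: same real part,
+ // negated imaginary part. That is why the t_a rows use the real parts with all-plus
+ // signs, while the t_b rows use the imaginary parts with the +/- patterns seen in
+ // the calc_f32!/calc_f64! sums.
+ //
+ // A scalar sketch of the same computation (not the generated code itself):
+ //   t_a[k] = x[0] + sum over j of re(twiddle[(j*k) % 29, folded]) * (x[j] + x[29-j])
+ //   t_b[k] = sum over j of +/- im(twiddle[(j*k) % 29, folded]) * (x[j] - x[29-j])
+ //   y[0]   = x[0] + sum over j of (x[j] + x[29-j])
+ //   y[k], y[29-k] = t_a[k] +/- rotate(t_b[k]), where rotate is the 90-degree
+ //                   rotation (multiplication by +/-i) performed by Rotate90.
+ //
+ // The f32 code packs two independent transforms into each float32x4_t (one in the
+ // low lanes, one in the high lanes), hence the "parallel" helpers, rotate_both and
+ // the lo/hi extract shuffles in perform_parallel_fft_contiguous. The f64 code below
+ // holds one complex value per float64x2_t and processes a single transform.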
|_.__/|_|\__| +// + +pub struct NeonF64Butterfly29 { + direction: FftDirection, + _phantom: std::marker::PhantomData, + rotate: Rotate90F64, + twiddle1re: float64x2_t, + twiddle1im: float64x2_t, + twiddle2re: float64x2_t, + twiddle2im: float64x2_t, + twiddle3re: float64x2_t, + twiddle3im: float64x2_t, + twiddle4re: float64x2_t, + twiddle4im: float64x2_t, + twiddle5re: float64x2_t, + twiddle5im: float64x2_t, + twiddle6re: float64x2_t, + twiddle6im: float64x2_t, + twiddle7re: float64x2_t, + twiddle7im: float64x2_t, + twiddle8re: float64x2_t, + twiddle8im: float64x2_t, + twiddle9re: float64x2_t, + twiddle9im: float64x2_t, + twiddle10re: float64x2_t, + twiddle10im: float64x2_t, + twiddle11re: float64x2_t, + twiddle11im: float64x2_t, + twiddle12re: float64x2_t, + twiddle12im: float64x2_t, + twiddle13re: float64x2_t, + twiddle13im: float64x2_t, + twiddle14re: float64x2_t, + twiddle14im: float64x2_t, +} + +boilerplate_fft_neon_f64_butterfly!(NeonF64Butterfly29, 29, |this: &NeonF64Butterfly29<_>| this + .direction); +boilerplate_fft_neon_common_butterfly!(NeonF64Butterfly29, 29, |this: &NeonF64Butterfly29<_>| this + .direction); +impl NeonF64Butterfly29 { + #[inline(always)] + pub fn new(direction: FftDirection) -> Self { + assert_f64::(); + let rotate = Rotate90F64::new(true); + let tw1: Complex = twiddles::compute_twiddle(1, 29, direction); + let tw2: Complex = twiddles::compute_twiddle(2, 29, direction); + let tw3: Complex = twiddles::compute_twiddle(3, 29, direction); + let tw4: Complex = twiddles::compute_twiddle(4, 29, direction); + let tw5: Complex = twiddles::compute_twiddle(5, 29, direction); + let tw6: Complex = twiddles::compute_twiddle(6, 29, direction); + let tw7: Complex = twiddles::compute_twiddle(7, 29, direction); + let tw8: Complex = twiddles::compute_twiddle(8, 29, direction); + let tw9: Complex = twiddles::compute_twiddle(9, 29, direction); + let tw10: Complex = twiddles::compute_twiddle(10, 29, direction); + let tw11: Complex = twiddles::compute_twiddle(11, 29, direction); + let tw12: Complex = twiddles::compute_twiddle(12, 29, direction); + let tw13: Complex = twiddles::compute_twiddle(13, 29, direction); + let tw14: Complex = twiddles::compute_twiddle(14, 29, direction); + let twiddle1re = unsafe { vmovq_n_f64(tw1.re) }; + let twiddle1im = unsafe { vmovq_n_f64(tw1.im) }; + let twiddle2re = unsafe { vmovq_n_f64(tw2.re) }; + let twiddle2im = unsafe { vmovq_n_f64(tw2.im) }; + let twiddle3re = unsafe { vmovq_n_f64(tw3.re) }; + let twiddle3im = unsafe { vmovq_n_f64(tw3.im) }; + let twiddle4re = unsafe { vmovq_n_f64(tw4.re) }; + let twiddle4im = unsafe { vmovq_n_f64(tw4.im) }; + let twiddle5re = unsafe { vmovq_n_f64(tw5.re) }; + let twiddle5im = unsafe { vmovq_n_f64(tw5.im) }; + let twiddle6re = unsafe { vmovq_n_f64(tw6.re) }; + let twiddle6im = unsafe { vmovq_n_f64(tw6.im) }; + let twiddle7re = unsafe { vmovq_n_f64(tw7.re) }; + let twiddle7im = unsafe { vmovq_n_f64(tw7.im) }; + let twiddle8re = unsafe { vmovq_n_f64(tw8.re) }; + let twiddle8im = unsafe { vmovq_n_f64(tw8.im) }; + let twiddle9re = unsafe { vmovq_n_f64(tw9.re) }; + let twiddle9im = unsafe { vmovq_n_f64(tw9.im) }; + let twiddle10re = unsafe { vmovq_n_f64(tw10.re) }; + let twiddle10im = unsafe { vmovq_n_f64(tw10.im) }; + let twiddle11re = unsafe { vmovq_n_f64(tw11.re) }; + let twiddle11im = unsafe { vmovq_n_f64(tw11.im) }; + let twiddle12re = unsafe { vmovq_n_f64(tw12.re) }; + let twiddle12im = unsafe { vmovq_n_f64(tw12.im) }; + let twiddle13re = unsafe { vmovq_n_f64(tw13.re) }; + let twiddle13im = unsafe { 
vmovq_n_f64(tw13.im) }; + let twiddle14re = unsafe { vmovq_n_f64(tw14.re) }; + let twiddle14im = unsafe { vmovq_n_f64(tw14.im) }; + + Self { + direction, + _phantom: std::marker::PhantomData, + rotate, + twiddle1re, + twiddle1im, + twiddle2re, + twiddle2im, + twiddle3re, + twiddle3im, + twiddle4re, + twiddle4im, + twiddle5re, + twiddle5im, + twiddle6re, + twiddle6im, + twiddle7re, + twiddle7im, + twiddle8re, + twiddle8im, + twiddle9re, + twiddle9im, + twiddle10re, + twiddle10im, + twiddle11re, + twiddle11im, + twiddle12re, + twiddle12im, + twiddle13re, + twiddle13im, + twiddle14re, + twiddle14im, + } + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let values = read_complex_to_array!(input, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28}); + + let out = self.perform_fft_direct(values); + + write_complex_to_array!(out, output, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_direct(&self, values: [float64x2_t; 29]) -> [float64x2_t; 29] { + let [x1p28, x1m28] = solo_fft2_f64(values[1], values[28]); + let [x2p27, x2m27] = solo_fft2_f64(values[2], values[27]); + let [x3p26, x3m26] = solo_fft2_f64(values[3], values[26]); + let [x4p25, x4m25] = solo_fft2_f64(values[4], values[25]); + let [x5p24, x5m24] = solo_fft2_f64(values[5], values[24]); + let [x6p23, x6m23] = solo_fft2_f64(values[6], values[23]); + let [x7p22, x7m22] = solo_fft2_f64(values[7], values[22]); + let [x8p21, x8m21] = solo_fft2_f64(values[8], values[21]); + let [x9p20, x9m20] = solo_fft2_f64(values[9], values[20]); + let [x10p19, x10m19] = solo_fft2_f64(values[10], values[19]); + let [x11p18, x11m18] = solo_fft2_f64(values[11], values[18]); + let [x12p17, x12m17] = solo_fft2_f64(values[12], values[17]); + let [x13p16, x13m16] = solo_fft2_f64(values[13], values[16]); + let [x14p15, x14m15] = solo_fft2_f64(values[14], values[15]); + + let t_a1_1 = vmulq_f64(self.twiddle1re, x1p28); + let t_a1_2 = vmulq_f64(self.twiddle2re, x2p27); + let t_a1_3 = vmulq_f64(self.twiddle3re, x3p26); + let t_a1_4 = vmulq_f64(self.twiddle4re, x4p25); + let t_a1_5 = vmulq_f64(self.twiddle5re, x5p24); + let t_a1_6 = vmulq_f64(self.twiddle6re, x6p23); + let t_a1_7 = vmulq_f64(self.twiddle7re, x7p22); + let t_a1_8 = vmulq_f64(self.twiddle8re, x8p21); + let t_a1_9 = vmulq_f64(self.twiddle9re, x9p20); + let t_a1_10 = vmulq_f64(self.twiddle10re, x10p19); + let t_a1_11 = vmulq_f64(self.twiddle11re, x11p18); + let t_a1_12 = vmulq_f64(self.twiddle12re, x12p17); + let t_a1_13 = vmulq_f64(self.twiddle13re, x13p16); + let t_a1_14 = vmulq_f64(self.twiddle14re, x14p15); + let t_a2_1 = vmulq_f64(self.twiddle2re, x1p28); + let t_a2_2 = vmulq_f64(self.twiddle4re, x2p27); + let t_a2_3 = vmulq_f64(self.twiddle6re, x3p26); + let t_a2_4 = vmulq_f64(self.twiddle8re, x4p25); + let t_a2_5 = vmulq_f64(self.twiddle10re, x5p24); + let t_a2_6 = vmulq_f64(self.twiddle12re, x6p23); + let t_a2_7 = vmulq_f64(self.twiddle14re, x7p22); + let t_a2_8 = vmulq_f64(self.twiddle13re, x8p21); + let t_a2_9 = vmulq_f64(self.twiddle11re, x9p20); + let t_a2_10 = vmulq_f64(self.twiddle9re, x10p19); + let t_a2_11 = vmulq_f64(self.twiddle7re, x11p18); + let t_a2_12 = vmulq_f64(self.twiddle5re, x12p17); + let t_a2_13 = vmulq_f64(self.twiddle3re, x13p16); + let t_a2_14 = vmulq_f64(self.twiddle1re, x14p15); + let t_a3_1 = 
vmulq_f64(self.twiddle3re, x1p28); + let t_a3_2 = vmulq_f64(self.twiddle6re, x2p27); + let t_a3_3 = vmulq_f64(self.twiddle9re, x3p26); + let t_a3_4 = vmulq_f64(self.twiddle12re, x4p25); + let t_a3_5 = vmulq_f64(self.twiddle14re, x5p24); + let t_a3_6 = vmulq_f64(self.twiddle11re, x6p23); + let t_a3_7 = vmulq_f64(self.twiddle8re, x7p22); + let t_a3_8 = vmulq_f64(self.twiddle5re, x8p21); + let t_a3_9 = vmulq_f64(self.twiddle2re, x9p20); + let t_a3_10 = vmulq_f64(self.twiddle1re, x10p19); + let t_a3_11 = vmulq_f64(self.twiddle4re, x11p18); + let t_a3_12 = vmulq_f64(self.twiddle7re, x12p17); + let t_a3_13 = vmulq_f64(self.twiddle10re, x13p16); + let t_a3_14 = vmulq_f64(self.twiddle13re, x14p15); + let t_a4_1 = vmulq_f64(self.twiddle4re, x1p28); + let t_a4_2 = vmulq_f64(self.twiddle8re, x2p27); + let t_a4_3 = vmulq_f64(self.twiddle12re, x3p26); + let t_a4_4 = vmulq_f64(self.twiddle13re, x4p25); + let t_a4_5 = vmulq_f64(self.twiddle9re, x5p24); + let t_a4_6 = vmulq_f64(self.twiddle5re, x6p23); + let t_a4_7 = vmulq_f64(self.twiddle1re, x7p22); + let t_a4_8 = vmulq_f64(self.twiddle3re, x8p21); + let t_a4_9 = vmulq_f64(self.twiddle7re, x9p20); + let t_a4_10 = vmulq_f64(self.twiddle11re, x10p19); + let t_a4_11 = vmulq_f64(self.twiddle14re, x11p18); + let t_a4_12 = vmulq_f64(self.twiddle10re, x12p17); + let t_a4_13 = vmulq_f64(self.twiddle6re, x13p16); + let t_a4_14 = vmulq_f64(self.twiddle2re, x14p15); + let t_a5_1 = vmulq_f64(self.twiddle5re, x1p28); + let t_a5_2 = vmulq_f64(self.twiddle10re, x2p27); + let t_a5_3 = vmulq_f64(self.twiddle14re, x3p26); + let t_a5_4 = vmulq_f64(self.twiddle9re, x4p25); + let t_a5_5 = vmulq_f64(self.twiddle4re, x5p24); + let t_a5_6 = vmulq_f64(self.twiddle1re, x6p23); + let t_a5_7 = vmulq_f64(self.twiddle6re, x7p22); + let t_a5_8 = vmulq_f64(self.twiddle11re, x8p21); + let t_a5_9 = vmulq_f64(self.twiddle13re, x9p20); + let t_a5_10 = vmulq_f64(self.twiddle8re, x10p19); + let t_a5_11 = vmulq_f64(self.twiddle3re, x11p18); + let t_a5_12 = vmulq_f64(self.twiddle2re, x12p17); + let t_a5_13 = vmulq_f64(self.twiddle7re, x13p16); + let t_a5_14 = vmulq_f64(self.twiddle12re, x14p15); + let t_a6_1 = vmulq_f64(self.twiddle6re, x1p28); + let t_a6_2 = vmulq_f64(self.twiddle12re, x2p27); + let t_a6_3 = vmulq_f64(self.twiddle11re, x3p26); + let t_a6_4 = vmulq_f64(self.twiddle5re, x4p25); + let t_a6_5 = vmulq_f64(self.twiddle1re, x5p24); + let t_a6_6 = vmulq_f64(self.twiddle7re, x6p23); + let t_a6_7 = vmulq_f64(self.twiddle13re, x7p22); + let t_a6_8 = vmulq_f64(self.twiddle10re, x8p21); + let t_a6_9 = vmulq_f64(self.twiddle4re, x9p20); + let t_a6_10 = vmulq_f64(self.twiddle2re, x10p19); + let t_a6_11 = vmulq_f64(self.twiddle8re, x11p18); + let t_a6_12 = vmulq_f64(self.twiddle14re, x12p17); + let t_a6_13 = vmulq_f64(self.twiddle9re, x13p16); + let t_a6_14 = vmulq_f64(self.twiddle3re, x14p15); + let t_a7_1 = vmulq_f64(self.twiddle7re, x1p28); + let t_a7_2 = vmulq_f64(self.twiddle14re, x2p27); + let t_a7_3 = vmulq_f64(self.twiddle8re, x3p26); + let t_a7_4 = vmulq_f64(self.twiddle1re, x4p25); + let t_a7_5 = vmulq_f64(self.twiddle6re, x5p24); + let t_a7_6 = vmulq_f64(self.twiddle13re, x6p23); + let t_a7_7 = vmulq_f64(self.twiddle9re, x7p22); + let t_a7_8 = vmulq_f64(self.twiddle2re, x8p21); + let t_a7_9 = vmulq_f64(self.twiddle5re, x9p20); + let t_a7_10 = vmulq_f64(self.twiddle12re, x10p19); + let t_a7_11 = vmulq_f64(self.twiddle10re, x11p18); + let t_a7_12 = vmulq_f64(self.twiddle3re, x12p17); + let t_a7_13 = vmulq_f64(self.twiddle4re, x13p16); + let t_a7_14 = vmulq_f64(self.twiddle11re, 
x14p15); + let t_a8_1 = vmulq_f64(self.twiddle8re, x1p28); + let t_a8_2 = vmulq_f64(self.twiddle13re, x2p27); + let t_a8_3 = vmulq_f64(self.twiddle5re, x3p26); + let t_a8_4 = vmulq_f64(self.twiddle3re, x4p25); + let t_a8_5 = vmulq_f64(self.twiddle11re, x5p24); + let t_a8_6 = vmulq_f64(self.twiddle10re, x6p23); + let t_a8_7 = vmulq_f64(self.twiddle2re, x7p22); + let t_a8_8 = vmulq_f64(self.twiddle6re, x8p21); + let t_a8_9 = vmulq_f64(self.twiddle14re, x9p20); + let t_a8_10 = vmulq_f64(self.twiddle7re, x10p19); + let t_a8_11 = vmulq_f64(self.twiddle1re, x11p18); + let t_a8_12 = vmulq_f64(self.twiddle9re, x12p17); + let t_a8_13 = vmulq_f64(self.twiddle12re, x13p16); + let t_a8_14 = vmulq_f64(self.twiddle4re, x14p15); + let t_a9_1 = vmulq_f64(self.twiddle9re, x1p28); + let t_a9_2 = vmulq_f64(self.twiddle11re, x2p27); + let t_a9_3 = vmulq_f64(self.twiddle2re, x3p26); + let t_a9_4 = vmulq_f64(self.twiddle7re, x4p25); + let t_a9_5 = vmulq_f64(self.twiddle13re, x5p24); + let t_a9_6 = vmulq_f64(self.twiddle4re, x6p23); + let t_a9_7 = vmulq_f64(self.twiddle5re, x7p22); + let t_a9_8 = vmulq_f64(self.twiddle14re, x8p21); + let t_a9_9 = vmulq_f64(self.twiddle6re, x9p20); + let t_a9_10 = vmulq_f64(self.twiddle3re, x10p19); + let t_a9_11 = vmulq_f64(self.twiddle12re, x11p18); + let t_a9_12 = vmulq_f64(self.twiddle8re, x12p17); + let t_a9_13 = vmulq_f64(self.twiddle1re, x13p16); + let t_a9_14 = vmulq_f64(self.twiddle10re, x14p15); + let t_a10_1 = vmulq_f64(self.twiddle10re, x1p28); + let t_a10_2 = vmulq_f64(self.twiddle9re, x2p27); + let t_a10_3 = vmulq_f64(self.twiddle1re, x3p26); + let t_a10_4 = vmulq_f64(self.twiddle11re, x4p25); + let t_a10_5 = vmulq_f64(self.twiddle8re, x5p24); + let t_a10_6 = vmulq_f64(self.twiddle2re, x6p23); + let t_a10_7 = vmulq_f64(self.twiddle12re, x7p22); + let t_a10_8 = vmulq_f64(self.twiddle7re, x8p21); + let t_a10_9 = vmulq_f64(self.twiddle3re, x9p20); + let t_a10_10 = vmulq_f64(self.twiddle13re, x10p19); + let t_a10_11 = vmulq_f64(self.twiddle6re, x11p18); + let t_a10_12 = vmulq_f64(self.twiddle4re, x12p17); + let t_a10_13 = vmulq_f64(self.twiddle14re, x13p16); + let t_a10_14 = vmulq_f64(self.twiddle5re, x14p15); + let t_a11_1 = vmulq_f64(self.twiddle11re, x1p28); + let t_a11_2 = vmulq_f64(self.twiddle7re, x2p27); + let t_a11_3 = vmulq_f64(self.twiddle4re, x3p26); + let t_a11_4 = vmulq_f64(self.twiddle14re, x4p25); + let t_a11_5 = vmulq_f64(self.twiddle3re, x5p24); + let t_a11_6 = vmulq_f64(self.twiddle8re, x6p23); + let t_a11_7 = vmulq_f64(self.twiddle10re, x7p22); + let t_a11_8 = vmulq_f64(self.twiddle1re, x8p21); + let t_a11_9 = vmulq_f64(self.twiddle12re, x9p20); + let t_a11_10 = vmulq_f64(self.twiddle6re, x10p19); + let t_a11_11 = vmulq_f64(self.twiddle5re, x11p18); + let t_a11_12 = vmulq_f64(self.twiddle13re, x12p17); + let t_a11_13 = vmulq_f64(self.twiddle2re, x13p16); + let t_a11_14 = vmulq_f64(self.twiddle9re, x14p15); + let t_a12_1 = vmulq_f64(self.twiddle12re, x1p28); + let t_a12_2 = vmulq_f64(self.twiddle5re, x2p27); + let t_a12_3 = vmulq_f64(self.twiddle7re, x3p26); + let t_a12_4 = vmulq_f64(self.twiddle10re, x4p25); + let t_a12_5 = vmulq_f64(self.twiddle2re, x5p24); + let t_a12_6 = vmulq_f64(self.twiddle14re, x6p23); + let t_a12_7 = vmulq_f64(self.twiddle3re, x7p22); + let t_a12_8 = vmulq_f64(self.twiddle9re, x8p21); + let t_a12_9 = vmulq_f64(self.twiddle8re, x9p20); + let t_a12_10 = vmulq_f64(self.twiddle4re, x10p19); + let t_a12_11 = vmulq_f64(self.twiddle13re, x11p18); + let t_a12_12 = vmulq_f64(self.twiddle1re, x12p17); + let t_a12_13 = 
vmulq_f64(self.twiddle11re, x13p16); + let t_a12_14 = vmulq_f64(self.twiddle6re, x14p15); + let t_a13_1 = vmulq_f64(self.twiddle13re, x1p28); + let t_a13_2 = vmulq_f64(self.twiddle3re, x2p27); + let t_a13_3 = vmulq_f64(self.twiddle10re, x3p26); + let t_a13_4 = vmulq_f64(self.twiddle6re, x4p25); + let t_a13_5 = vmulq_f64(self.twiddle7re, x5p24); + let t_a13_6 = vmulq_f64(self.twiddle9re, x6p23); + let t_a13_7 = vmulq_f64(self.twiddle4re, x7p22); + let t_a13_8 = vmulq_f64(self.twiddle12re, x8p21); + let t_a13_9 = vmulq_f64(self.twiddle1re, x9p20); + let t_a13_10 = vmulq_f64(self.twiddle14re, x10p19); + let t_a13_11 = vmulq_f64(self.twiddle2re, x11p18); + let t_a13_12 = vmulq_f64(self.twiddle11re, x12p17); + let t_a13_13 = vmulq_f64(self.twiddle5re, x13p16); + let t_a13_14 = vmulq_f64(self.twiddle8re, x14p15); + let t_a14_1 = vmulq_f64(self.twiddle14re, x1p28); + let t_a14_2 = vmulq_f64(self.twiddle1re, x2p27); + let t_a14_3 = vmulq_f64(self.twiddle13re, x3p26); + let t_a14_4 = vmulq_f64(self.twiddle2re, x4p25); + let t_a14_5 = vmulq_f64(self.twiddle12re, x5p24); + let t_a14_6 = vmulq_f64(self.twiddle3re, x6p23); + let t_a14_7 = vmulq_f64(self.twiddle11re, x7p22); + let t_a14_8 = vmulq_f64(self.twiddle4re, x8p21); + let t_a14_9 = vmulq_f64(self.twiddle10re, x9p20); + let t_a14_10 = vmulq_f64(self.twiddle5re, x10p19); + let t_a14_11 = vmulq_f64(self.twiddle9re, x11p18); + let t_a14_12 = vmulq_f64(self.twiddle6re, x12p17); + let t_a14_13 = vmulq_f64(self.twiddle8re, x13p16); + let t_a14_14 = vmulq_f64(self.twiddle7re, x14p15); + + let t_b1_1 = vmulq_f64(self.twiddle1im, x1m28); + let t_b1_2 = vmulq_f64(self.twiddle2im, x2m27); + let t_b1_3 = vmulq_f64(self.twiddle3im, x3m26); + let t_b1_4 = vmulq_f64(self.twiddle4im, x4m25); + let t_b1_5 = vmulq_f64(self.twiddle5im, x5m24); + let t_b1_6 = vmulq_f64(self.twiddle6im, x6m23); + let t_b1_7 = vmulq_f64(self.twiddle7im, x7m22); + let t_b1_8 = vmulq_f64(self.twiddle8im, x8m21); + let t_b1_9 = vmulq_f64(self.twiddle9im, x9m20); + let t_b1_10 = vmulq_f64(self.twiddle10im, x10m19); + let t_b1_11 = vmulq_f64(self.twiddle11im, x11m18); + let t_b1_12 = vmulq_f64(self.twiddle12im, x12m17); + let t_b1_13 = vmulq_f64(self.twiddle13im, x13m16); + let t_b1_14 = vmulq_f64(self.twiddle14im, x14m15); + let t_b2_1 = vmulq_f64(self.twiddle2im, x1m28); + let t_b2_2 = vmulq_f64(self.twiddle4im, x2m27); + let t_b2_3 = vmulq_f64(self.twiddle6im, x3m26); + let t_b2_4 = vmulq_f64(self.twiddle8im, x4m25); + let t_b2_5 = vmulq_f64(self.twiddle10im, x5m24); + let t_b2_6 = vmulq_f64(self.twiddle12im, x6m23); + let t_b2_7 = vmulq_f64(self.twiddle14im, x7m22); + let t_b2_8 = vmulq_f64(self.twiddle13im, x8m21); + let t_b2_9 = vmulq_f64(self.twiddle11im, x9m20); + let t_b2_10 = vmulq_f64(self.twiddle9im, x10m19); + let t_b2_11 = vmulq_f64(self.twiddle7im, x11m18); + let t_b2_12 = vmulq_f64(self.twiddle5im, x12m17); + let t_b2_13 = vmulq_f64(self.twiddle3im, x13m16); + let t_b2_14 = vmulq_f64(self.twiddle1im, x14m15); + let t_b3_1 = vmulq_f64(self.twiddle3im, x1m28); + let t_b3_2 = vmulq_f64(self.twiddle6im, x2m27); + let t_b3_3 = vmulq_f64(self.twiddle9im, x3m26); + let t_b3_4 = vmulq_f64(self.twiddle12im, x4m25); + let t_b3_5 = vmulq_f64(self.twiddle14im, x5m24); + let t_b3_6 = vmulq_f64(self.twiddle11im, x6m23); + let t_b3_7 = vmulq_f64(self.twiddle8im, x7m22); + let t_b3_8 = vmulq_f64(self.twiddle5im, x8m21); + let t_b3_9 = vmulq_f64(self.twiddle2im, x9m20); + let t_b3_10 = vmulq_f64(self.twiddle1im, x10m19); + let t_b3_11 = vmulq_f64(self.twiddle4im, x11m18); + let t_b3_12 = 
vmulq_f64(self.twiddle7im, x12m17); + let t_b3_13 = vmulq_f64(self.twiddle10im, x13m16); + let t_b3_14 = vmulq_f64(self.twiddle13im, x14m15); + let t_b4_1 = vmulq_f64(self.twiddle4im, x1m28); + let t_b4_2 = vmulq_f64(self.twiddle8im, x2m27); + let t_b4_3 = vmulq_f64(self.twiddle12im, x3m26); + let t_b4_4 = vmulq_f64(self.twiddle13im, x4m25); + let t_b4_5 = vmulq_f64(self.twiddle9im, x5m24); + let t_b4_6 = vmulq_f64(self.twiddle5im, x6m23); + let t_b4_7 = vmulq_f64(self.twiddle1im, x7m22); + let t_b4_8 = vmulq_f64(self.twiddle3im, x8m21); + let t_b4_9 = vmulq_f64(self.twiddle7im, x9m20); + let t_b4_10 = vmulq_f64(self.twiddle11im, x10m19); + let t_b4_11 = vmulq_f64(self.twiddle14im, x11m18); + let t_b4_12 = vmulq_f64(self.twiddle10im, x12m17); + let t_b4_13 = vmulq_f64(self.twiddle6im, x13m16); + let t_b4_14 = vmulq_f64(self.twiddle2im, x14m15); + let t_b5_1 = vmulq_f64(self.twiddle5im, x1m28); + let t_b5_2 = vmulq_f64(self.twiddle10im, x2m27); + let t_b5_3 = vmulq_f64(self.twiddle14im, x3m26); + let t_b5_4 = vmulq_f64(self.twiddle9im, x4m25); + let t_b5_5 = vmulq_f64(self.twiddle4im, x5m24); + let t_b5_6 = vmulq_f64(self.twiddle1im, x6m23); + let t_b5_7 = vmulq_f64(self.twiddle6im, x7m22); + let t_b5_8 = vmulq_f64(self.twiddle11im, x8m21); + let t_b5_9 = vmulq_f64(self.twiddle13im, x9m20); + let t_b5_10 = vmulq_f64(self.twiddle8im, x10m19); + let t_b5_11 = vmulq_f64(self.twiddle3im, x11m18); + let t_b5_12 = vmulq_f64(self.twiddle2im, x12m17); + let t_b5_13 = vmulq_f64(self.twiddle7im, x13m16); + let t_b5_14 = vmulq_f64(self.twiddle12im, x14m15); + let t_b6_1 = vmulq_f64(self.twiddle6im, x1m28); + let t_b6_2 = vmulq_f64(self.twiddle12im, x2m27); + let t_b6_3 = vmulq_f64(self.twiddle11im, x3m26); + let t_b6_4 = vmulq_f64(self.twiddle5im, x4m25); + let t_b6_5 = vmulq_f64(self.twiddle1im, x5m24); + let t_b6_6 = vmulq_f64(self.twiddle7im, x6m23); + let t_b6_7 = vmulq_f64(self.twiddle13im, x7m22); + let t_b6_8 = vmulq_f64(self.twiddle10im, x8m21); + let t_b6_9 = vmulq_f64(self.twiddle4im, x9m20); + let t_b6_10 = vmulq_f64(self.twiddle2im, x10m19); + let t_b6_11 = vmulq_f64(self.twiddle8im, x11m18); + let t_b6_12 = vmulq_f64(self.twiddle14im, x12m17); + let t_b6_13 = vmulq_f64(self.twiddle9im, x13m16); + let t_b6_14 = vmulq_f64(self.twiddle3im, x14m15); + let t_b7_1 = vmulq_f64(self.twiddle7im, x1m28); + let t_b7_2 = vmulq_f64(self.twiddle14im, x2m27); + let t_b7_3 = vmulq_f64(self.twiddle8im, x3m26); + let t_b7_4 = vmulq_f64(self.twiddle1im, x4m25); + let t_b7_5 = vmulq_f64(self.twiddle6im, x5m24); + let t_b7_6 = vmulq_f64(self.twiddle13im, x6m23); + let t_b7_7 = vmulq_f64(self.twiddle9im, x7m22); + let t_b7_8 = vmulq_f64(self.twiddle2im, x8m21); + let t_b7_9 = vmulq_f64(self.twiddle5im, x9m20); + let t_b7_10 = vmulq_f64(self.twiddle12im, x10m19); + let t_b7_11 = vmulq_f64(self.twiddle10im, x11m18); + let t_b7_12 = vmulq_f64(self.twiddle3im, x12m17); + let t_b7_13 = vmulq_f64(self.twiddle4im, x13m16); + let t_b7_14 = vmulq_f64(self.twiddle11im, x14m15); + let t_b8_1 = vmulq_f64(self.twiddle8im, x1m28); + let t_b8_2 = vmulq_f64(self.twiddle13im, x2m27); + let t_b8_3 = vmulq_f64(self.twiddle5im, x3m26); + let t_b8_4 = vmulq_f64(self.twiddle3im, x4m25); + let t_b8_5 = vmulq_f64(self.twiddle11im, x5m24); + let t_b8_6 = vmulq_f64(self.twiddle10im, x6m23); + let t_b8_7 = vmulq_f64(self.twiddle2im, x7m22); + let t_b8_8 = vmulq_f64(self.twiddle6im, x8m21); + let t_b8_9 = vmulq_f64(self.twiddle14im, x9m20); + let t_b8_10 = vmulq_f64(self.twiddle7im, x10m19); + let t_b8_11 = vmulq_f64(self.twiddle1im, 
x11m18); + let t_b8_12 = vmulq_f64(self.twiddle9im, x12m17); + let t_b8_13 = vmulq_f64(self.twiddle12im, x13m16); + let t_b8_14 = vmulq_f64(self.twiddle4im, x14m15); + let t_b9_1 = vmulq_f64(self.twiddle9im, x1m28); + let t_b9_2 = vmulq_f64(self.twiddle11im, x2m27); + let t_b9_3 = vmulq_f64(self.twiddle2im, x3m26); + let t_b9_4 = vmulq_f64(self.twiddle7im, x4m25); + let t_b9_5 = vmulq_f64(self.twiddle13im, x5m24); + let t_b9_6 = vmulq_f64(self.twiddle4im, x6m23); + let t_b9_7 = vmulq_f64(self.twiddle5im, x7m22); + let t_b9_8 = vmulq_f64(self.twiddle14im, x8m21); + let t_b9_9 = vmulq_f64(self.twiddle6im, x9m20); + let t_b9_10 = vmulq_f64(self.twiddle3im, x10m19); + let t_b9_11 = vmulq_f64(self.twiddle12im, x11m18); + let t_b9_12 = vmulq_f64(self.twiddle8im, x12m17); + let t_b9_13 = vmulq_f64(self.twiddle1im, x13m16); + let t_b9_14 = vmulq_f64(self.twiddle10im, x14m15); + let t_b10_1 = vmulq_f64(self.twiddle10im, x1m28); + let t_b10_2 = vmulq_f64(self.twiddle9im, x2m27); + let t_b10_3 = vmulq_f64(self.twiddle1im, x3m26); + let t_b10_4 = vmulq_f64(self.twiddle11im, x4m25); + let t_b10_5 = vmulq_f64(self.twiddle8im, x5m24); + let t_b10_6 = vmulq_f64(self.twiddle2im, x6m23); + let t_b10_7 = vmulq_f64(self.twiddle12im, x7m22); + let t_b10_8 = vmulq_f64(self.twiddle7im, x8m21); + let t_b10_9 = vmulq_f64(self.twiddle3im, x9m20); + let t_b10_10 = vmulq_f64(self.twiddle13im, x10m19); + let t_b10_11 = vmulq_f64(self.twiddle6im, x11m18); + let t_b10_12 = vmulq_f64(self.twiddle4im, x12m17); + let t_b10_13 = vmulq_f64(self.twiddle14im, x13m16); + let t_b10_14 = vmulq_f64(self.twiddle5im, x14m15); + let t_b11_1 = vmulq_f64(self.twiddle11im, x1m28); + let t_b11_2 = vmulq_f64(self.twiddle7im, x2m27); + let t_b11_3 = vmulq_f64(self.twiddle4im, x3m26); + let t_b11_4 = vmulq_f64(self.twiddle14im, x4m25); + let t_b11_5 = vmulq_f64(self.twiddle3im, x5m24); + let t_b11_6 = vmulq_f64(self.twiddle8im, x6m23); + let t_b11_7 = vmulq_f64(self.twiddle10im, x7m22); + let t_b11_8 = vmulq_f64(self.twiddle1im, x8m21); + let t_b11_9 = vmulq_f64(self.twiddle12im, x9m20); + let t_b11_10 = vmulq_f64(self.twiddle6im, x10m19); + let t_b11_11 = vmulq_f64(self.twiddle5im, x11m18); + let t_b11_12 = vmulq_f64(self.twiddle13im, x12m17); + let t_b11_13 = vmulq_f64(self.twiddle2im, x13m16); + let t_b11_14 = vmulq_f64(self.twiddle9im, x14m15); + let t_b12_1 = vmulq_f64(self.twiddle12im, x1m28); + let t_b12_2 = vmulq_f64(self.twiddle5im, x2m27); + let t_b12_3 = vmulq_f64(self.twiddle7im, x3m26); + let t_b12_4 = vmulq_f64(self.twiddle10im, x4m25); + let t_b12_5 = vmulq_f64(self.twiddle2im, x5m24); + let t_b12_6 = vmulq_f64(self.twiddle14im, x6m23); + let t_b12_7 = vmulq_f64(self.twiddle3im, x7m22); + let t_b12_8 = vmulq_f64(self.twiddle9im, x8m21); + let t_b12_9 = vmulq_f64(self.twiddle8im, x9m20); + let t_b12_10 = vmulq_f64(self.twiddle4im, x10m19); + let t_b12_11 = vmulq_f64(self.twiddle13im, x11m18); + let t_b12_12 = vmulq_f64(self.twiddle1im, x12m17); + let t_b12_13 = vmulq_f64(self.twiddle11im, x13m16); + let t_b12_14 = vmulq_f64(self.twiddle6im, x14m15); + let t_b13_1 = vmulq_f64(self.twiddle13im, x1m28); + let t_b13_2 = vmulq_f64(self.twiddle3im, x2m27); + let t_b13_3 = vmulq_f64(self.twiddle10im, x3m26); + let t_b13_4 = vmulq_f64(self.twiddle6im, x4m25); + let t_b13_5 = vmulq_f64(self.twiddle7im, x5m24); + let t_b13_6 = vmulq_f64(self.twiddle9im, x6m23); + let t_b13_7 = vmulq_f64(self.twiddle4im, x7m22); + let t_b13_8 = vmulq_f64(self.twiddle12im, x8m21); + let t_b13_9 = vmulq_f64(self.twiddle1im, x9m20); + let t_b13_10 = 
vmulq_f64(self.twiddle14im, x10m19); + let t_b13_11 = vmulq_f64(self.twiddle2im, x11m18); + let t_b13_12 = vmulq_f64(self.twiddle11im, x12m17); + let t_b13_13 = vmulq_f64(self.twiddle5im, x13m16); + let t_b13_14 = vmulq_f64(self.twiddle8im, x14m15); + let t_b14_1 = vmulq_f64(self.twiddle14im, x1m28); + let t_b14_2 = vmulq_f64(self.twiddle1im, x2m27); + let t_b14_3 = vmulq_f64(self.twiddle13im, x3m26); + let t_b14_4 = vmulq_f64(self.twiddle2im, x4m25); + let t_b14_5 = vmulq_f64(self.twiddle12im, x5m24); + let t_b14_6 = vmulq_f64(self.twiddle3im, x6m23); + let t_b14_7 = vmulq_f64(self.twiddle11im, x7m22); + let t_b14_8 = vmulq_f64(self.twiddle4im, x8m21); + let t_b14_9 = vmulq_f64(self.twiddle10im, x9m20); + let t_b14_10 = vmulq_f64(self.twiddle5im, x10m19); + let t_b14_11 = vmulq_f64(self.twiddle9im, x11m18); + let t_b14_12 = vmulq_f64(self.twiddle6im, x12m17); + let t_b14_13 = vmulq_f64(self.twiddle8im, x13m16); + let t_b14_14 = vmulq_f64(self.twiddle7im, x14m15); + + let x0 = values[0]; + let t_a1 = calc_f64!(x0 + t_a1_1 + t_a1_2 + t_a1_3 + t_a1_4 + t_a1_5 + t_a1_6 + t_a1_7 + t_a1_8 + t_a1_9 + t_a1_10 + t_a1_11 + t_a1_12 + t_a1_13 + t_a1_14); + let t_a2 = calc_f64!(x0 + t_a2_1 + t_a2_2 + t_a2_3 + t_a2_4 + t_a2_5 + t_a2_6 + t_a2_7 + t_a2_8 + t_a2_9 + t_a2_10 + t_a2_11 + t_a2_12 + t_a2_13 + t_a2_14); + let t_a3 = calc_f64!(x0 + t_a3_1 + t_a3_2 + t_a3_3 + t_a3_4 + t_a3_5 + t_a3_6 + t_a3_7 + t_a3_8 + t_a3_9 + t_a3_10 + t_a3_11 + t_a3_12 + t_a3_13 + t_a3_14); + let t_a4 = calc_f64!(x0 + t_a4_1 + t_a4_2 + t_a4_3 + t_a4_4 + t_a4_5 + t_a4_6 + t_a4_7 + t_a4_8 + t_a4_9 + t_a4_10 + t_a4_11 + t_a4_12 + t_a4_13 + t_a4_14); + let t_a5 = calc_f64!(x0 + t_a5_1 + t_a5_2 + t_a5_3 + t_a5_4 + t_a5_5 + t_a5_6 + t_a5_7 + t_a5_8 + t_a5_9 + t_a5_10 + t_a5_11 + t_a5_12 + t_a5_13 + t_a5_14); + let t_a6 = calc_f64!(x0 + t_a6_1 + t_a6_2 + t_a6_3 + t_a6_4 + t_a6_5 + t_a6_6 + t_a6_7 + t_a6_8 + t_a6_9 + t_a6_10 + t_a6_11 + t_a6_12 + t_a6_13 + t_a6_14); + let t_a7 = calc_f64!(x0 + t_a7_1 + t_a7_2 + t_a7_3 + t_a7_4 + t_a7_5 + t_a7_6 + t_a7_7 + t_a7_8 + t_a7_9 + t_a7_10 + t_a7_11 + t_a7_12 + t_a7_13 + t_a7_14); + let t_a8 = calc_f64!(x0 + t_a8_1 + t_a8_2 + t_a8_3 + t_a8_4 + t_a8_5 + t_a8_6 + t_a8_7 + t_a8_8 + t_a8_9 + t_a8_10 + t_a8_11 + t_a8_12 + t_a8_13 + t_a8_14); + let t_a9 = calc_f64!(x0 + t_a9_1 + t_a9_2 + t_a9_3 + t_a9_4 + t_a9_5 + t_a9_6 + t_a9_7 + t_a9_8 + t_a9_9 + t_a9_10 + t_a9_11 + t_a9_12 + t_a9_13 + t_a9_14); + let t_a10 = calc_f64!(x0 + t_a10_1 + t_a10_2 + t_a10_3 + t_a10_4 + t_a10_5 + t_a10_6 + t_a10_7 + t_a10_8 + t_a10_9 + t_a10_10 + t_a10_11 + t_a10_12 + t_a10_13 + t_a10_14); + let t_a11 = calc_f64!(x0 + t_a11_1 + t_a11_2 + t_a11_3 + t_a11_4 + t_a11_5 + t_a11_6 + t_a11_7 + t_a11_8 + t_a11_9 + t_a11_10 + t_a11_11 + t_a11_12 + t_a11_13 + t_a11_14); + let t_a12 = calc_f64!(x0 + t_a12_1 + t_a12_2 + t_a12_3 + t_a12_4 + t_a12_5 + t_a12_6 + t_a12_7 + t_a12_8 + t_a12_9 + t_a12_10 + t_a12_11 + t_a12_12 + t_a12_13 + t_a12_14); + let t_a13 = calc_f64!(x0 + t_a13_1 + t_a13_2 + t_a13_3 + t_a13_4 + t_a13_5 + t_a13_6 + t_a13_7 + t_a13_8 + t_a13_9 + t_a13_10 + t_a13_11 + t_a13_12 + t_a13_13 + t_a13_14); + let t_a14 = calc_f64!(x0 + t_a14_1 + t_a14_2 + t_a14_3 + t_a14_4 + t_a14_5 + t_a14_6 + t_a14_7 + t_a14_8 + t_a14_9 + t_a14_10 + t_a14_11 + t_a14_12 + t_a14_13 + t_a14_14); + + let t_b1 = calc_f64!(t_b1_1 + t_b1_2 + t_b1_3 + t_b1_4 + t_b1_5 + t_b1_6 + t_b1_7 + t_b1_8 + t_b1_9 + t_b1_10 + t_b1_11 + t_b1_12 + t_b1_13 + t_b1_14); + let t_b2 = calc_f64!(t_b2_1 + t_b2_2 + t_b2_3 + t_b2_4 + t_b2_5 + t_b2_6 + t_b2_7 - t_b2_8 - 
t_b2_9 - t_b2_10 - t_b2_11 - t_b2_12 - t_b2_13 - t_b2_14); + let t_b3 = calc_f64!(t_b3_1 + t_b3_2 + t_b3_3 + t_b3_4 - t_b3_5 - t_b3_6 - t_b3_7 - t_b3_8 - t_b3_9 + t_b3_10 + t_b3_11 + t_b3_12 + t_b3_13 + t_b3_14); + let t_b4 = calc_f64!(t_b4_1 + t_b4_2 + t_b4_3 - t_b4_4 - t_b4_5 - t_b4_6 - t_b4_7 + t_b4_8 + t_b4_9 + t_b4_10 - t_b4_11 - t_b4_12 - t_b4_13 - t_b4_14); + let t_b5 = calc_f64!(t_b5_1 + t_b5_2 - t_b5_3 - t_b5_4 - t_b5_5 + t_b5_6 + t_b5_7 + t_b5_8 - t_b5_9 - t_b5_10 - t_b5_11 + t_b5_12 + t_b5_13 + t_b5_14); + let t_b6 = calc_f64!(t_b6_1 + t_b6_2 - t_b6_3 - t_b6_4 + t_b6_5 + t_b6_6 + t_b6_7 - t_b6_8 - t_b6_9 + t_b6_10 + t_b6_11 + t_b6_12 - t_b6_13 - t_b6_14); + let t_b7 = calc_f64!(t_b7_1 + t_b7_2 - t_b7_3 - t_b7_4 + t_b7_5 + t_b7_6 - t_b7_7 - t_b7_8 + t_b7_9 + t_b7_10 - t_b7_11 - t_b7_12 + t_b7_13 + t_b7_14); + let t_b8 = calc_f64!(t_b8_1 - t_b8_2 - t_b8_3 + t_b8_4 + t_b8_5 - t_b8_6 - t_b8_7 + t_b8_8 + t_b8_9 - t_b8_10 + t_b8_11 + t_b8_12 - t_b8_13 - t_b8_14); + let t_b9 = calc_f64!(t_b9_1 - t_b9_2 - t_b9_3 + t_b9_4 - t_b9_5 - t_b9_6 + t_b9_7 + t_b9_8 - t_b9_9 + t_b9_10 + t_b9_11 - t_b9_12 + t_b9_13 + t_b9_14); + let t_b10 = calc_f64!(t_b10_1 - t_b10_2 + t_b10_3 + t_b10_4 - t_b10_5 + t_b10_6 + t_b10_7 - t_b10_8 + t_b10_9 + t_b10_10 - t_b10_11 + t_b10_12 + t_b10_13 - t_b10_14); + let t_b11 = calc_f64!(t_b11_1 - t_b11_2 + t_b11_3 - t_b11_4 - t_b11_5 + t_b11_6 - t_b11_7 + t_b11_8 + t_b11_9 - t_b11_10 + t_b11_11 - t_b11_12 - t_b11_13 + t_b11_14); + let t_b12 = calc_f64!(t_b12_1 - t_b12_2 + t_b12_3 - t_b12_4 + t_b12_5 + t_b12_6 - t_b12_7 + t_b12_8 - t_b12_9 + t_b12_10 - t_b12_11 - t_b12_12 + t_b12_13 - t_b12_14); + let t_b13 = calc_f64!(t_b13_1 - t_b13_2 + t_b13_3 - t_b13_4 + t_b13_5 - t_b13_6 + t_b13_7 - t_b13_8 + t_b13_9 + t_b13_10 - t_b13_11 + t_b13_12 - t_b13_13 + t_b13_14); + let t_b14 = calc_f64!(t_b14_1 - t_b14_2 + t_b14_3 - t_b14_4 + t_b14_5 - t_b14_6 + t_b14_7 - t_b14_8 + t_b14_9 - t_b14_10 + t_b14_11 - t_b14_12 + t_b14_13 - t_b14_14); + + let t_b1_rot = self.rotate.rotate(t_b1); + let t_b2_rot = self.rotate.rotate(t_b2); + let t_b3_rot = self.rotate.rotate(t_b3); + let t_b4_rot = self.rotate.rotate(t_b4); + let t_b5_rot = self.rotate.rotate(t_b5); + let t_b6_rot = self.rotate.rotate(t_b6); + let t_b7_rot = self.rotate.rotate(t_b7); + let t_b8_rot = self.rotate.rotate(t_b8); + let t_b9_rot = self.rotate.rotate(t_b9); + let t_b10_rot = self.rotate.rotate(t_b10); + let t_b11_rot = self.rotate.rotate(t_b11); + let t_b12_rot = self.rotate.rotate(t_b12); + let t_b13_rot = self.rotate.rotate(t_b13); + let t_b14_rot = self.rotate.rotate(t_b14); + + let y0 = calc_f64!(x0 + x1p28 + x2p27 + x3p26 + x4p25 + x5p24 + x6p23 + x7p22 + x8p21 + x9p20 + x10p19 + x11p18 + x12p17 + x13p16 + x14p15); + let [y1, y28] = solo_fft2_f64(t_a1, t_b1_rot); + let [y2, y27] = solo_fft2_f64(t_a2, t_b2_rot); + let [y3, y26] = solo_fft2_f64(t_a3, t_b3_rot); + let [y4, y25] = solo_fft2_f64(t_a4, t_b4_rot); + let [y5, y24] = solo_fft2_f64(t_a5, t_b5_rot); + let [y6, y23] = solo_fft2_f64(t_a6, t_b6_rot); + let [y7, y22] = solo_fft2_f64(t_a7, t_b7_rot); + let [y8, y21] = solo_fft2_f64(t_a8, t_b8_rot); + let [y9, y20] = solo_fft2_f64(t_a9, t_b9_rot); + let [y10, y19] = solo_fft2_f64(t_a10, t_b10_rot); + let [y11, y18] = solo_fft2_f64(t_a11, t_b11_rot); + let [y12, y17] = solo_fft2_f64(t_a12, t_b12_rot); + let [y13, y16] = solo_fft2_f64(t_a13, t_b13_rot); + let [y14, y15] = solo_fft2_f64(t_a14, t_b14_rot); + [y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15, y16, y17, y18, y19, y20, y21, y22, y23, 
y24, y25, y26, y27, y28]
+    }
+}
+
+//  _____ _            _________  _     _ _
+// |___ // |          |___ /___ \| |__ (_) |_
+//   |_ \| |   _____    |_ \ __) | '_ \| | __|
+//  ___) | |  |_____|  ___) / __/| |_) | | |_
+// |____/|_|          |____/_____|_.__/|_|\__|
+//
+pub struct NeonF32Butterfly31<T> {
+    direction: FftDirection,
+    _phantom: std::marker::PhantomData<T>,
+    rotate: Rotate90F32,
+    twiddle1re: float32x4_t,
+    twiddle1im: float32x4_t,
+    twiddle2re: float32x4_t,
+    twiddle2im: float32x4_t,
+    twiddle3re: float32x4_t,
+    twiddle3im: float32x4_t,
+    twiddle4re: float32x4_t,
+    twiddle4im: float32x4_t,
+    twiddle5re: float32x4_t,
+    twiddle5im: float32x4_t,
+    twiddle6re: float32x4_t,
+    twiddle6im: float32x4_t,
+    twiddle7re: float32x4_t,
+    twiddle7im: float32x4_t,
+    twiddle8re: float32x4_t,
+    twiddle8im: float32x4_t,
+    twiddle9re: float32x4_t,
+    twiddle9im: float32x4_t,
+    twiddle10re: float32x4_t,
+    twiddle10im: float32x4_t,
+    twiddle11re: float32x4_t,
+    twiddle11im: float32x4_t,
+    twiddle12re: float32x4_t,
+    twiddle12im: float32x4_t,
+    twiddle13re: float32x4_t,
+    twiddle13im: float32x4_t,
+    twiddle14re: float32x4_t,
+    twiddle14im: float32x4_t,
+    twiddle15re: float32x4_t,
+    twiddle15im: float32x4_t,
+}
+
+boilerplate_fft_neon_f32_butterfly!(NeonF32Butterfly31, 31, |this: &NeonF32Butterfly31<_>| this
+    .direction);
+boilerplate_fft_neon_common_butterfly!(NeonF32Butterfly31, 31, |this: &NeonF32Butterfly31<_>| this
+    .direction);
+impl<T: FftNum> NeonF32Butterfly31<T> {
+    #[inline(always)]
+    pub fn new(direction: FftDirection) -> Self {
+        assert_f32::<T>();
+        let rotate = Rotate90F32::new(true);
+        let tw1: Complex<f32> = twiddles::compute_twiddle(1, 31, direction);
+        let tw2: Complex<f32> = twiddles::compute_twiddle(2, 31, direction);
+        let tw3: Complex<f32> = twiddles::compute_twiddle(3, 31, direction);
+        let tw4: Complex<f32> = twiddles::compute_twiddle(4, 31, direction);
+        let tw5: Complex<f32> = twiddles::compute_twiddle(5, 31, direction);
+        let tw6: Complex<f32> = twiddles::compute_twiddle(6, 31, direction);
+        let tw7: Complex<f32> = twiddles::compute_twiddle(7, 31, direction);
+        let tw8: Complex<f32> = twiddles::compute_twiddle(8, 31, direction);
+        let tw9: Complex<f32> = twiddles::compute_twiddle(9, 31, direction);
+        let tw10: Complex<f32> = twiddles::compute_twiddle(10, 31, direction);
+        let tw11: Complex<f32> = twiddles::compute_twiddle(11, 31, direction);
+        let tw12: Complex<f32> = twiddles::compute_twiddle(12, 31, direction);
+        let tw13: Complex<f32> = twiddles::compute_twiddle(13, 31, direction);
+        let tw14: Complex<f32> = twiddles::compute_twiddle(14, 31, direction);
+        let tw15: Complex<f32> = twiddles::compute_twiddle(15, 31, direction);
+        let twiddle1re = unsafe { vmovq_n_f32(tw1.re) };
+        let twiddle1im = unsafe { vmovq_n_f32(tw1.im) };
+        let twiddle2re = unsafe { vmovq_n_f32(tw2.re) };
+        let twiddle2im = unsafe { vmovq_n_f32(tw2.im) };
+        let twiddle3re = unsafe { vmovq_n_f32(tw3.re) };
+        let twiddle3im = unsafe { vmovq_n_f32(tw3.im) };
+        let twiddle4re = unsafe { vmovq_n_f32(tw4.re) };
+        let twiddle4im = unsafe { vmovq_n_f32(tw4.im) };
+        let twiddle5re = unsafe { vmovq_n_f32(tw5.re) };
+        let twiddle5im = unsafe { vmovq_n_f32(tw5.im) };
+        let twiddle6re = unsafe { vmovq_n_f32(tw6.re) };
+        let twiddle6im = unsafe { vmovq_n_f32(tw6.im) };
+        let twiddle7re = unsafe { vmovq_n_f32(tw7.re) };
+        let twiddle7im = unsafe { vmovq_n_f32(tw7.im) };
+        let twiddle8re = unsafe { vmovq_n_f32(tw8.re) };
+        let twiddle8im = unsafe { vmovq_n_f32(tw8.im) };
+        let twiddle9re = unsafe { vmovq_n_f32(tw9.re) };
+        let twiddle9im = unsafe { vmovq_n_f32(tw9.im) };
+        let twiddle10re = unsafe { vmovq_n_f32(tw10.re) };
+        let twiddle10im = unsafe
{ vmovq_n_f32(tw10.im) }; + let twiddle11re = unsafe { vmovq_n_f32(tw11.re) }; + let twiddle11im = unsafe { vmovq_n_f32(tw11.im) }; + let twiddle12re = unsafe { vmovq_n_f32(tw12.re) }; + let twiddle12im = unsafe { vmovq_n_f32(tw12.im) }; + let twiddle13re = unsafe { vmovq_n_f32(tw13.re) }; + let twiddle13im = unsafe { vmovq_n_f32(tw13.im) }; + let twiddle14re = unsafe { vmovq_n_f32(tw14.re) }; + let twiddle14im = unsafe { vmovq_n_f32(tw14.im) }; + let twiddle15re = unsafe { vmovq_n_f32(tw15.re) }; + let twiddle15im = unsafe { vmovq_n_f32(tw15.im) }; + + Self { + direction, + _phantom: std::marker::PhantomData, + rotate, + twiddle1re, + twiddle1im, + twiddle2re, + twiddle2im, + twiddle3re, + twiddle3im, + twiddle4re, + twiddle4im, + twiddle5re, + twiddle5im, + twiddle6re, + twiddle6im, + twiddle7re, + twiddle7im, + twiddle8re, + twiddle8im, + twiddle9re, + twiddle9im, + twiddle10re, + twiddle10im, + twiddle11re, + twiddle11im, + twiddle12re, + twiddle12im, + twiddle13re, + twiddle13im, + twiddle14re, + twiddle14im, + twiddle15re, + twiddle15im, + } + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let values = read_partial1_complex_to_array!(input, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30}); + + let out = self.perform_parallel_fft_direct(values); + + write_partial_lo_complex_to_array!(out, output, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let input_packed = read_complex_to_array!(input, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60}); + + let values = [ + extract_lo_hi_f32(input_packed[0], input_packed[15]), + extract_hi_lo_f32(input_packed[0], input_packed[16]), + extract_lo_hi_f32(input_packed[1], input_packed[16]), + extract_hi_lo_f32(input_packed[1], input_packed[17]), + extract_lo_hi_f32(input_packed[2], input_packed[17]), + extract_hi_lo_f32(input_packed[2], input_packed[18]), + extract_lo_hi_f32(input_packed[3], input_packed[18]), + extract_hi_lo_f32(input_packed[3], input_packed[19]), + extract_lo_hi_f32(input_packed[4], input_packed[19]), + extract_hi_lo_f32(input_packed[4], input_packed[20]), + extract_lo_hi_f32(input_packed[5], input_packed[20]), + extract_hi_lo_f32(input_packed[5], input_packed[21]), + extract_lo_hi_f32(input_packed[6], input_packed[21]), + extract_hi_lo_f32(input_packed[6], input_packed[22]), + extract_lo_hi_f32(input_packed[7], input_packed[22]), + extract_hi_lo_f32(input_packed[7], input_packed[23]), + extract_lo_hi_f32(input_packed[8], input_packed[23]), + extract_hi_lo_f32(input_packed[8], input_packed[24]), + extract_lo_hi_f32(input_packed[9], input_packed[24]), + extract_hi_lo_f32(input_packed[9], input_packed[25]), + extract_lo_hi_f32(input_packed[10], input_packed[25]), + extract_hi_lo_f32(input_packed[10], input_packed[26]), + extract_lo_hi_f32(input_packed[11], input_packed[26]), + extract_hi_lo_f32(input_packed[11], input_packed[27]), + extract_lo_hi_f32(input_packed[12], input_packed[27]), + extract_hi_lo_f32(input_packed[12], input_packed[28]), + extract_lo_hi_f32(input_packed[13], input_packed[28]), + extract_hi_lo_f32(input_packed[13], input_packed[29]), + extract_lo_hi_f32(input_packed[14], 
input_packed[29]), + extract_hi_lo_f32(input_packed[14], input_packed[30]), + extract_lo_hi_f32(input_packed[15], input_packed[30]), + ]; + + let out = self.perform_parallel_fft_direct(values); + + let out_packed = [ + extract_lo_lo_f32(out[0], out[1]), + extract_lo_lo_f32(out[2], out[3]), + extract_lo_lo_f32(out[4], out[5]), + extract_lo_lo_f32(out[6], out[7]), + extract_lo_lo_f32(out[8], out[9]), + extract_lo_lo_f32(out[10], out[11]), + extract_lo_lo_f32(out[12], out[13]), + extract_lo_lo_f32(out[14], out[15]), + extract_lo_lo_f32(out[16], out[17]), + extract_lo_lo_f32(out[18], out[19]), + extract_lo_lo_f32(out[20], out[21]), + extract_lo_lo_f32(out[22], out[23]), + extract_lo_lo_f32(out[24], out[25]), + extract_lo_lo_f32(out[26], out[27]), + extract_lo_lo_f32(out[28], out[29]), + extract_lo_hi_f32(out[30], out[0]), + extract_hi_hi_f32(out[1], out[2]), + extract_hi_hi_f32(out[3], out[4]), + extract_hi_hi_f32(out[5], out[6]), + extract_hi_hi_f32(out[7], out[8]), + extract_hi_hi_f32(out[9], out[10]), + extract_hi_hi_f32(out[11], out[12]), + extract_hi_hi_f32(out[13], out[14]), + extract_hi_hi_f32(out[15], out[16]), + extract_hi_hi_f32(out[17], out[18]), + extract_hi_hi_f32(out[19], out[20]), + extract_hi_hi_f32(out[21], out[22]), + extract_hi_hi_f32(out[23], out[24]), + extract_hi_hi_f32(out[25], out[26]), + extract_hi_hi_f32(out[27], out[28]), + extract_hi_hi_f32(out[29], out[30]), + ]; + + write_complex_to_array_strided!(out_packed, output, 2, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [float32x4_t; 31]) -> [float32x4_t; 31] { + let [x1p30, x1m30] = parallel_fft2_interleaved_f32(values[1], values[30]); + let [x2p29, x2m29] = parallel_fft2_interleaved_f32(values[2], values[29]); + let [x3p28, x3m28] = parallel_fft2_interleaved_f32(values[3], values[28]); + let [x4p27, x4m27] = parallel_fft2_interleaved_f32(values[4], values[27]); + let [x5p26, x5m26] = parallel_fft2_interleaved_f32(values[5], values[26]); + let [x6p25, x6m25] = parallel_fft2_interleaved_f32(values[6], values[25]); + let [x7p24, x7m24] = parallel_fft2_interleaved_f32(values[7], values[24]); + let [x8p23, x8m23] = parallel_fft2_interleaved_f32(values[8], values[23]); + let [x9p22, x9m22] = parallel_fft2_interleaved_f32(values[9], values[22]); + let [x10p21, x10m21] = parallel_fft2_interleaved_f32(values[10], values[21]); + let [x11p20, x11m20] = parallel_fft2_interleaved_f32(values[11], values[20]); + let [x12p19, x12m19] = parallel_fft2_interleaved_f32(values[12], values[19]); + let [x13p18, x13m18] = parallel_fft2_interleaved_f32(values[13], values[18]); + let [x14p17, x14m17] = parallel_fft2_interleaved_f32(values[14], values[17]); + let [x15p16, x15m16] = parallel_fft2_interleaved_f32(values[15], values[16]); + + let t_a1_1 = vmulq_f32(self.twiddle1re, x1p30); + let t_a1_2 = vmulq_f32(self.twiddle2re, x2p29); + let t_a1_3 = vmulq_f32(self.twiddle3re, x3p28); + let t_a1_4 = vmulq_f32(self.twiddle4re, x4p27); + let t_a1_5 = vmulq_f32(self.twiddle5re, x5p26); + let t_a1_6 = vmulq_f32(self.twiddle6re, x6p25); + let t_a1_7 = vmulq_f32(self.twiddle7re, x7p24); + let t_a1_8 = vmulq_f32(self.twiddle8re, x8p23); + let t_a1_9 = vmulq_f32(self.twiddle9re, x9p22); + let t_a1_10 = vmulq_f32(self.twiddle10re, x10p21); + let t_a1_11 = vmulq_f32(self.twiddle11re, x11p20); + let t_a1_12 = vmulq_f32(self.twiddle12re, x12p19); + let t_a1_13 = vmulq_f32(self.twiddle13re, 
x13p18); + let t_a1_14 = vmulq_f32(self.twiddle14re, x14p17); + let t_a1_15 = vmulq_f32(self.twiddle15re, x15p16); + let t_a2_1 = vmulq_f32(self.twiddle2re, x1p30); + let t_a2_2 = vmulq_f32(self.twiddle4re, x2p29); + let t_a2_3 = vmulq_f32(self.twiddle6re, x3p28); + let t_a2_4 = vmulq_f32(self.twiddle8re, x4p27); + let t_a2_5 = vmulq_f32(self.twiddle10re, x5p26); + let t_a2_6 = vmulq_f32(self.twiddle12re, x6p25); + let t_a2_7 = vmulq_f32(self.twiddle14re, x7p24); + let t_a2_8 = vmulq_f32(self.twiddle15re, x8p23); + let t_a2_9 = vmulq_f32(self.twiddle13re, x9p22); + let t_a2_10 = vmulq_f32(self.twiddle11re, x10p21); + let t_a2_11 = vmulq_f32(self.twiddle9re, x11p20); + let t_a2_12 = vmulq_f32(self.twiddle7re, x12p19); + let t_a2_13 = vmulq_f32(self.twiddle5re, x13p18); + let t_a2_14 = vmulq_f32(self.twiddle3re, x14p17); + let t_a2_15 = vmulq_f32(self.twiddle1re, x15p16); + let t_a3_1 = vmulq_f32(self.twiddle3re, x1p30); + let t_a3_2 = vmulq_f32(self.twiddle6re, x2p29); + let t_a3_3 = vmulq_f32(self.twiddle9re, x3p28); + let t_a3_4 = vmulq_f32(self.twiddle12re, x4p27); + let t_a3_5 = vmulq_f32(self.twiddle15re, x5p26); + let t_a3_6 = vmulq_f32(self.twiddle13re, x6p25); + let t_a3_7 = vmulq_f32(self.twiddle10re, x7p24); + let t_a3_8 = vmulq_f32(self.twiddle7re, x8p23); + let t_a3_9 = vmulq_f32(self.twiddle4re, x9p22); + let t_a3_10 = vmulq_f32(self.twiddle1re, x10p21); + let t_a3_11 = vmulq_f32(self.twiddle2re, x11p20); + let t_a3_12 = vmulq_f32(self.twiddle5re, x12p19); + let t_a3_13 = vmulq_f32(self.twiddle8re, x13p18); + let t_a3_14 = vmulq_f32(self.twiddle11re, x14p17); + let t_a3_15 = vmulq_f32(self.twiddle14re, x15p16); + let t_a4_1 = vmulq_f32(self.twiddle4re, x1p30); + let t_a4_2 = vmulq_f32(self.twiddle8re, x2p29); + let t_a4_3 = vmulq_f32(self.twiddle12re, x3p28); + let t_a4_4 = vmulq_f32(self.twiddle15re, x4p27); + let t_a4_5 = vmulq_f32(self.twiddle11re, x5p26); + let t_a4_6 = vmulq_f32(self.twiddle7re, x6p25); + let t_a4_7 = vmulq_f32(self.twiddle3re, x7p24); + let t_a4_8 = vmulq_f32(self.twiddle1re, x8p23); + let t_a4_9 = vmulq_f32(self.twiddle5re, x9p22); + let t_a4_10 = vmulq_f32(self.twiddle9re, x10p21); + let t_a4_11 = vmulq_f32(self.twiddle13re, x11p20); + let t_a4_12 = vmulq_f32(self.twiddle14re, x12p19); + let t_a4_13 = vmulq_f32(self.twiddle10re, x13p18); + let t_a4_14 = vmulq_f32(self.twiddle6re, x14p17); + let t_a4_15 = vmulq_f32(self.twiddle2re, x15p16); + let t_a5_1 = vmulq_f32(self.twiddle5re, x1p30); + let t_a5_2 = vmulq_f32(self.twiddle10re, x2p29); + let t_a5_3 = vmulq_f32(self.twiddle15re, x3p28); + let t_a5_4 = vmulq_f32(self.twiddle11re, x4p27); + let t_a5_5 = vmulq_f32(self.twiddle6re, x5p26); + let t_a5_6 = vmulq_f32(self.twiddle1re, x6p25); + let t_a5_7 = vmulq_f32(self.twiddle4re, x7p24); + let t_a5_8 = vmulq_f32(self.twiddle9re, x8p23); + let t_a5_9 = vmulq_f32(self.twiddle14re, x9p22); + let t_a5_10 = vmulq_f32(self.twiddle12re, x10p21); + let t_a5_11 = vmulq_f32(self.twiddle7re, x11p20); + let t_a5_12 = vmulq_f32(self.twiddle2re, x12p19); + let t_a5_13 = vmulq_f32(self.twiddle3re, x13p18); + let t_a5_14 = vmulq_f32(self.twiddle8re, x14p17); + let t_a5_15 = vmulq_f32(self.twiddle13re, x15p16); + let t_a6_1 = vmulq_f32(self.twiddle6re, x1p30); + let t_a6_2 = vmulq_f32(self.twiddle12re, x2p29); + let t_a6_3 = vmulq_f32(self.twiddle13re, x3p28); + let t_a6_4 = vmulq_f32(self.twiddle7re, x4p27); + let t_a6_5 = vmulq_f32(self.twiddle1re, x5p26); + let t_a6_6 = vmulq_f32(self.twiddle5re, x6p25); + let t_a6_7 = vmulq_f32(self.twiddle11re, x7p24); + let t_a6_8 = 
vmulq_f32(self.twiddle14re, x8p23); + let t_a6_9 = vmulq_f32(self.twiddle8re, x9p22); + let t_a6_10 = vmulq_f32(self.twiddle2re, x10p21); + let t_a6_11 = vmulq_f32(self.twiddle4re, x11p20); + let t_a6_12 = vmulq_f32(self.twiddle10re, x12p19); + let t_a6_13 = vmulq_f32(self.twiddle15re, x13p18); + let t_a6_14 = vmulq_f32(self.twiddle9re, x14p17); + let t_a6_15 = vmulq_f32(self.twiddle3re, x15p16); + let t_a7_1 = vmulq_f32(self.twiddle7re, x1p30); + let t_a7_2 = vmulq_f32(self.twiddle14re, x2p29); + let t_a7_3 = vmulq_f32(self.twiddle10re, x3p28); + let t_a7_4 = vmulq_f32(self.twiddle3re, x4p27); + let t_a7_5 = vmulq_f32(self.twiddle4re, x5p26); + let t_a7_6 = vmulq_f32(self.twiddle11re, x6p25); + let t_a7_7 = vmulq_f32(self.twiddle13re, x7p24); + let t_a7_8 = vmulq_f32(self.twiddle6re, x8p23); + let t_a7_9 = vmulq_f32(self.twiddle1re, x9p22); + let t_a7_10 = vmulq_f32(self.twiddle8re, x10p21); + let t_a7_11 = vmulq_f32(self.twiddle15re, x11p20); + let t_a7_12 = vmulq_f32(self.twiddle9re, x12p19); + let t_a7_13 = vmulq_f32(self.twiddle2re, x13p18); + let t_a7_14 = vmulq_f32(self.twiddle5re, x14p17); + let t_a7_15 = vmulq_f32(self.twiddle12re, x15p16); + let t_a8_1 = vmulq_f32(self.twiddle8re, x1p30); + let t_a8_2 = vmulq_f32(self.twiddle15re, x2p29); + let t_a8_3 = vmulq_f32(self.twiddle7re, x3p28); + let t_a8_4 = vmulq_f32(self.twiddle1re, x4p27); + let t_a8_5 = vmulq_f32(self.twiddle9re, x5p26); + let t_a8_6 = vmulq_f32(self.twiddle14re, x6p25); + let t_a8_7 = vmulq_f32(self.twiddle6re, x7p24); + let t_a8_8 = vmulq_f32(self.twiddle2re, x8p23); + let t_a8_9 = vmulq_f32(self.twiddle10re, x9p22); + let t_a8_10 = vmulq_f32(self.twiddle13re, x10p21); + let t_a8_11 = vmulq_f32(self.twiddle5re, x11p20); + let t_a8_12 = vmulq_f32(self.twiddle3re, x12p19); + let t_a8_13 = vmulq_f32(self.twiddle11re, x13p18); + let t_a8_14 = vmulq_f32(self.twiddle12re, x14p17); + let t_a8_15 = vmulq_f32(self.twiddle4re, x15p16); + let t_a9_1 = vmulq_f32(self.twiddle9re, x1p30); + let t_a9_2 = vmulq_f32(self.twiddle13re, x2p29); + let t_a9_3 = vmulq_f32(self.twiddle4re, x3p28); + let t_a9_4 = vmulq_f32(self.twiddle5re, x4p27); + let t_a9_5 = vmulq_f32(self.twiddle14re, x5p26); + let t_a9_6 = vmulq_f32(self.twiddle8re, x6p25); + let t_a9_7 = vmulq_f32(self.twiddle1re, x7p24); + let t_a9_8 = vmulq_f32(self.twiddle10re, x8p23); + let t_a9_9 = vmulq_f32(self.twiddle12re, x9p22); + let t_a9_10 = vmulq_f32(self.twiddle3re, x10p21); + let t_a9_11 = vmulq_f32(self.twiddle6re, x11p20); + let t_a9_12 = vmulq_f32(self.twiddle15re, x12p19); + let t_a9_13 = vmulq_f32(self.twiddle7re, x13p18); + let t_a9_14 = vmulq_f32(self.twiddle2re, x14p17); + let t_a9_15 = vmulq_f32(self.twiddle11re, x15p16); + let t_a10_1 = vmulq_f32(self.twiddle10re, x1p30); + let t_a10_2 = vmulq_f32(self.twiddle11re, x2p29); + let t_a10_3 = vmulq_f32(self.twiddle1re, x3p28); + let t_a10_4 = vmulq_f32(self.twiddle9re, x4p27); + let t_a10_5 = vmulq_f32(self.twiddle12re, x5p26); + let t_a10_6 = vmulq_f32(self.twiddle2re, x6p25); + let t_a10_7 = vmulq_f32(self.twiddle8re, x7p24); + let t_a10_8 = vmulq_f32(self.twiddle13re, x8p23); + let t_a10_9 = vmulq_f32(self.twiddle3re, x9p22); + let t_a10_10 = vmulq_f32(self.twiddle7re, x10p21); + let t_a10_11 = vmulq_f32(self.twiddle14re, x11p20); + let t_a10_12 = vmulq_f32(self.twiddle4re, x12p19); + let t_a10_13 = vmulq_f32(self.twiddle6re, x13p18); + let t_a10_14 = vmulq_f32(self.twiddle15re, x14p17); + let t_a10_15 = vmulq_f32(self.twiddle5re, x15p16); + let t_a11_1 = vmulq_f32(self.twiddle11re, x1p30); + let t_a11_2 = 
vmulq_f32(self.twiddle9re, x2p29); + let t_a11_3 = vmulq_f32(self.twiddle2re, x3p28); + let t_a11_4 = vmulq_f32(self.twiddle13re, x4p27); + let t_a11_5 = vmulq_f32(self.twiddle7re, x5p26); + let t_a11_6 = vmulq_f32(self.twiddle4re, x6p25); + let t_a11_7 = vmulq_f32(self.twiddle15re, x7p24); + let t_a11_8 = vmulq_f32(self.twiddle5re, x8p23); + let t_a11_9 = vmulq_f32(self.twiddle6re, x9p22); + let t_a11_10 = vmulq_f32(self.twiddle14re, x10p21); + let t_a11_11 = vmulq_f32(self.twiddle3re, x11p20); + let t_a11_12 = vmulq_f32(self.twiddle8re, x12p19); + let t_a11_13 = vmulq_f32(self.twiddle12re, x13p18); + let t_a11_14 = vmulq_f32(self.twiddle1re, x14p17); + let t_a11_15 = vmulq_f32(self.twiddle10re, x15p16); + let t_a12_1 = vmulq_f32(self.twiddle12re, x1p30); + let t_a12_2 = vmulq_f32(self.twiddle7re, x2p29); + let t_a12_3 = vmulq_f32(self.twiddle5re, x3p28); + let t_a12_4 = vmulq_f32(self.twiddle14re, x4p27); + let t_a12_5 = vmulq_f32(self.twiddle2re, x5p26); + let t_a12_6 = vmulq_f32(self.twiddle10re, x6p25); + let t_a12_7 = vmulq_f32(self.twiddle9re, x7p24); + let t_a12_8 = vmulq_f32(self.twiddle3re, x8p23); + let t_a12_9 = vmulq_f32(self.twiddle15re, x9p22); + let t_a12_10 = vmulq_f32(self.twiddle4re, x10p21); + let t_a12_11 = vmulq_f32(self.twiddle8re, x11p20); + let t_a12_12 = vmulq_f32(self.twiddle11re, x12p19); + let t_a12_13 = vmulq_f32(self.twiddle1re, x13p18); + let t_a12_14 = vmulq_f32(self.twiddle13re, x14p17); + let t_a12_15 = vmulq_f32(self.twiddle6re, x15p16); + let t_a13_1 = vmulq_f32(self.twiddle13re, x1p30); + let t_a13_2 = vmulq_f32(self.twiddle5re, x2p29); + let t_a13_3 = vmulq_f32(self.twiddle8re, x3p28); + let t_a13_4 = vmulq_f32(self.twiddle10re, x4p27); + let t_a13_5 = vmulq_f32(self.twiddle3re, x5p26); + let t_a13_6 = vmulq_f32(self.twiddle15re, x6p25); + let t_a13_7 = vmulq_f32(self.twiddle2re, x7p24); + let t_a13_8 = vmulq_f32(self.twiddle11re, x8p23); + let t_a13_9 = vmulq_f32(self.twiddle7re, x9p22); + let t_a13_10 = vmulq_f32(self.twiddle6re, x10p21); + let t_a13_11 = vmulq_f32(self.twiddle12re, x11p20); + let t_a13_12 = vmulq_f32(self.twiddle1re, x12p19); + let t_a13_13 = vmulq_f32(self.twiddle14re, x13p18); + let t_a13_14 = vmulq_f32(self.twiddle4re, x14p17); + let t_a13_15 = vmulq_f32(self.twiddle9re, x15p16); + let t_a14_1 = vmulq_f32(self.twiddle14re, x1p30); + let t_a14_2 = vmulq_f32(self.twiddle3re, x2p29); + let t_a14_3 = vmulq_f32(self.twiddle11re, x3p28); + let t_a14_4 = vmulq_f32(self.twiddle6re, x4p27); + let t_a14_5 = vmulq_f32(self.twiddle8re, x5p26); + let t_a14_6 = vmulq_f32(self.twiddle9re, x6p25); + let t_a14_7 = vmulq_f32(self.twiddle5re, x7p24); + let t_a14_8 = vmulq_f32(self.twiddle12re, x8p23); + let t_a14_9 = vmulq_f32(self.twiddle2re, x9p22); + let t_a14_10 = vmulq_f32(self.twiddle15re, x10p21); + let t_a14_11 = vmulq_f32(self.twiddle1re, x11p20); + let t_a14_12 = vmulq_f32(self.twiddle13re, x12p19); + let t_a14_13 = vmulq_f32(self.twiddle4re, x13p18); + let t_a14_14 = vmulq_f32(self.twiddle10re, x14p17); + let t_a14_15 = vmulq_f32(self.twiddle7re, x15p16); + let t_a15_1 = vmulq_f32(self.twiddle15re, x1p30); + let t_a15_2 = vmulq_f32(self.twiddle1re, x2p29); + let t_a15_3 = vmulq_f32(self.twiddle14re, x3p28); + let t_a15_4 = vmulq_f32(self.twiddle2re, x4p27); + let t_a15_5 = vmulq_f32(self.twiddle13re, x5p26); + let t_a15_6 = vmulq_f32(self.twiddle3re, x6p25); + let t_a15_7 = vmulq_f32(self.twiddle12re, x7p24); + let t_a15_8 = vmulq_f32(self.twiddle4re, x8p23); + let t_a15_9 = vmulq_f32(self.twiddle11re, x9p22); + let t_a15_10 = 
vmulq_f32(self.twiddle5re, x10p21); + let t_a15_11 = vmulq_f32(self.twiddle10re, x11p20); + let t_a15_12 = vmulq_f32(self.twiddle6re, x12p19); + let t_a15_13 = vmulq_f32(self.twiddle9re, x13p18); + let t_a15_14 = vmulq_f32(self.twiddle7re, x14p17); + let t_a15_15 = vmulq_f32(self.twiddle8re, x15p16); + + let t_b1_1 = vmulq_f32(self.twiddle1im, x1m30); + let t_b1_2 = vmulq_f32(self.twiddle2im, x2m29); + let t_b1_3 = vmulq_f32(self.twiddle3im, x3m28); + let t_b1_4 = vmulq_f32(self.twiddle4im, x4m27); + let t_b1_5 = vmulq_f32(self.twiddle5im, x5m26); + let t_b1_6 = vmulq_f32(self.twiddle6im, x6m25); + let t_b1_7 = vmulq_f32(self.twiddle7im, x7m24); + let t_b1_8 = vmulq_f32(self.twiddle8im, x8m23); + let t_b1_9 = vmulq_f32(self.twiddle9im, x9m22); + let t_b1_10 = vmulq_f32(self.twiddle10im, x10m21); + let t_b1_11 = vmulq_f32(self.twiddle11im, x11m20); + let t_b1_12 = vmulq_f32(self.twiddle12im, x12m19); + let t_b1_13 = vmulq_f32(self.twiddle13im, x13m18); + let t_b1_14 = vmulq_f32(self.twiddle14im, x14m17); + let t_b1_15 = vmulq_f32(self.twiddle15im, x15m16); + let t_b2_1 = vmulq_f32(self.twiddle2im, x1m30); + let t_b2_2 = vmulq_f32(self.twiddle4im, x2m29); + let t_b2_3 = vmulq_f32(self.twiddle6im, x3m28); + let t_b2_4 = vmulq_f32(self.twiddle8im, x4m27); + let t_b2_5 = vmulq_f32(self.twiddle10im, x5m26); + let t_b2_6 = vmulq_f32(self.twiddle12im, x6m25); + let t_b2_7 = vmulq_f32(self.twiddle14im, x7m24); + let t_b2_8 = vmulq_f32(self.twiddle15im, x8m23); + let t_b2_9 = vmulq_f32(self.twiddle13im, x9m22); + let t_b2_10 = vmulq_f32(self.twiddle11im, x10m21); + let t_b2_11 = vmulq_f32(self.twiddle9im, x11m20); + let t_b2_12 = vmulq_f32(self.twiddle7im, x12m19); + let t_b2_13 = vmulq_f32(self.twiddle5im, x13m18); + let t_b2_14 = vmulq_f32(self.twiddle3im, x14m17); + let t_b2_15 = vmulq_f32(self.twiddle1im, x15m16); + let t_b3_1 = vmulq_f32(self.twiddle3im, x1m30); + let t_b3_2 = vmulq_f32(self.twiddle6im, x2m29); + let t_b3_3 = vmulq_f32(self.twiddle9im, x3m28); + let t_b3_4 = vmulq_f32(self.twiddle12im, x4m27); + let t_b3_5 = vmulq_f32(self.twiddle15im, x5m26); + let t_b3_6 = vmulq_f32(self.twiddle13im, x6m25); + let t_b3_7 = vmulq_f32(self.twiddle10im, x7m24); + let t_b3_8 = vmulq_f32(self.twiddle7im, x8m23); + let t_b3_9 = vmulq_f32(self.twiddle4im, x9m22); + let t_b3_10 = vmulq_f32(self.twiddle1im, x10m21); + let t_b3_11 = vmulq_f32(self.twiddle2im, x11m20); + let t_b3_12 = vmulq_f32(self.twiddle5im, x12m19); + let t_b3_13 = vmulq_f32(self.twiddle8im, x13m18); + let t_b3_14 = vmulq_f32(self.twiddle11im, x14m17); + let t_b3_15 = vmulq_f32(self.twiddle14im, x15m16); + let t_b4_1 = vmulq_f32(self.twiddle4im, x1m30); + let t_b4_2 = vmulq_f32(self.twiddle8im, x2m29); + let t_b4_3 = vmulq_f32(self.twiddle12im, x3m28); + let t_b4_4 = vmulq_f32(self.twiddle15im, x4m27); + let t_b4_5 = vmulq_f32(self.twiddle11im, x5m26); + let t_b4_6 = vmulq_f32(self.twiddle7im, x6m25); + let t_b4_7 = vmulq_f32(self.twiddle3im, x7m24); + let t_b4_8 = vmulq_f32(self.twiddle1im, x8m23); + let t_b4_9 = vmulq_f32(self.twiddle5im, x9m22); + let t_b4_10 = vmulq_f32(self.twiddle9im, x10m21); + let t_b4_11 = vmulq_f32(self.twiddle13im, x11m20); + let t_b4_12 = vmulq_f32(self.twiddle14im, x12m19); + let t_b4_13 = vmulq_f32(self.twiddle10im, x13m18); + let t_b4_14 = vmulq_f32(self.twiddle6im, x14m17); + let t_b4_15 = vmulq_f32(self.twiddle2im, x15m16); + let t_b5_1 = vmulq_f32(self.twiddle5im, x1m30); + let t_b5_2 = vmulq_f32(self.twiddle10im, x2m29); + let t_b5_3 = vmulq_f32(self.twiddle15im, x3m28); + let t_b5_4 = 
vmulq_f32(self.twiddle11im, x4m27); + let t_b5_5 = vmulq_f32(self.twiddle6im, x5m26); + let t_b5_6 = vmulq_f32(self.twiddle1im, x6m25); + let t_b5_7 = vmulq_f32(self.twiddle4im, x7m24); + let t_b5_8 = vmulq_f32(self.twiddle9im, x8m23); + let t_b5_9 = vmulq_f32(self.twiddle14im, x9m22); + let t_b5_10 = vmulq_f32(self.twiddle12im, x10m21); + let t_b5_11 = vmulq_f32(self.twiddle7im, x11m20); + let t_b5_12 = vmulq_f32(self.twiddle2im, x12m19); + let t_b5_13 = vmulq_f32(self.twiddle3im, x13m18); + let t_b5_14 = vmulq_f32(self.twiddle8im, x14m17); + let t_b5_15 = vmulq_f32(self.twiddle13im, x15m16); + let t_b6_1 = vmulq_f32(self.twiddle6im, x1m30); + let t_b6_2 = vmulq_f32(self.twiddle12im, x2m29); + let t_b6_3 = vmulq_f32(self.twiddle13im, x3m28); + let t_b6_4 = vmulq_f32(self.twiddle7im, x4m27); + let t_b6_5 = vmulq_f32(self.twiddle1im, x5m26); + let t_b6_6 = vmulq_f32(self.twiddle5im, x6m25); + let t_b6_7 = vmulq_f32(self.twiddle11im, x7m24); + let t_b6_8 = vmulq_f32(self.twiddle14im, x8m23); + let t_b6_9 = vmulq_f32(self.twiddle8im, x9m22); + let t_b6_10 = vmulq_f32(self.twiddle2im, x10m21); + let t_b6_11 = vmulq_f32(self.twiddle4im, x11m20); + let t_b6_12 = vmulq_f32(self.twiddle10im, x12m19); + let t_b6_13 = vmulq_f32(self.twiddle15im, x13m18); + let t_b6_14 = vmulq_f32(self.twiddle9im, x14m17); + let t_b6_15 = vmulq_f32(self.twiddle3im, x15m16); + let t_b7_1 = vmulq_f32(self.twiddle7im, x1m30); + let t_b7_2 = vmulq_f32(self.twiddle14im, x2m29); + let t_b7_3 = vmulq_f32(self.twiddle10im, x3m28); + let t_b7_4 = vmulq_f32(self.twiddle3im, x4m27); + let t_b7_5 = vmulq_f32(self.twiddle4im, x5m26); + let t_b7_6 = vmulq_f32(self.twiddle11im, x6m25); + let t_b7_7 = vmulq_f32(self.twiddle13im, x7m24); + let t_b7_8 = vmulq_f32(self.twiddle6im, x8m23); + let t_b7_9 = vmulq_f32(self.twiddle1im, x9m22); + let t_b7_10 = vmulq_f32(self.twiddle8im, x10m21); + let t_b7_11 = vmulq_f32(self.twiddle15im, x11m20); + let t_b7_12 = vmulq_f32(self.twiddle9im, x12m19); + let t_b7_13 = vmulq_f32(self.twiddle2im, x13m18); + let t_b7_14 = vmulq_f32(self.twiddle5im, x14m17); + let t_b7_15 = vmulq_f32(self.twiddle12im, x15m16); + let t_b8_1 = vmulq_f32(self.twiddle8im, x1m30); + let t_b8_2 = vmulq_f32(self.twiddle15im, x2m29); + let t_b8_3 = vmulq_f32(self.twiddle7im, x3m28); + let t_b8_4 = vmulq_f32(self.twiddle1im, x4m27); + let t_b8_5 = vmulq_f32(self.twiddle9im, x5m26); + let t_b8_6 = vmulq_f32(self.twiddle14im, x6m25); + let t_b8_7 = vmulq_f32(self.twiddle6im, x7m24); + let t_b8_8 = vmulq_f32(self.twiddle2im, x8m23); + let t_b8_9 = vmulq_f32(self.twiddle10im, x9m22); + let t_b8_10 = vmulq_f32(self.twiddle13im, x10m21); + let t_b8_11 = vmulq_f32(self.twiddle5im, x11m20); + let t_b8_12 = vmulq_f32(self.twiddle3im, x12m19); + let t_b8_13 = vmulq_f32(self.twiddle11im, x13m18); + let t_b8_14 = vmulq_f32(self.twiddle12im, x14m17); + let t_b8_15 = vmulq_f32(self.twiddle4im, x15m16); + let t_b9_1 = vmulq_f32(self.twiddle9im, x1m30); + let t_b9_2 = vmulq_f32(self.twiddle13im, x2m29); + let t_b9_3 = vmulq_f32(self.twiddle4im, x3m28); + let t_b9_4 = vmulq_f32(self.twiddle5im, x4m27); + let t_b9_5 = vmulq_f32(self.twiddle14im, x5m26); + let t_b9_6 = vmulq_f32(self.twiddle8im, x6m25); + let t_b9_7 = vmulq_f32(self.twiddle1im, x7m24); + let t_b9_8 = vmulq_f32(self.twiddle10im, x8m23); + let t_b9_9 = vmulq_f32(self.twiddle12im, x9m22); + let t_b9_10 = vmulq_f32(self.twiddle3im, x10m21); + let t_b9_11 = vmulq_f32(self.twiddle6im, x11m20); + let t_b9_12 = vmulq_f32(self.twiddle15im, x12m19); + let t_b9_13 = 
vmulq_f32(self.twiddle7im, x13m18); + let t_b9_14 = vmulq_f32(self.twiddle2im, x14m17); + let t_b9_15 = vmulq_f32(self.twiddle11im, x15m16); + let t_b10_1 = vmulq_f32(self.twiddle10im, x1m30); + let t_b10_2 = vmulq_f32(self.twiddle11im, x2m29); + let t_b10_3 = vmulq_f32(self.twiddle1im, x3m28); + let t_b10_4 = vmulq_f32(self.twiddle9im, x4m27); + let t_b10_5 = vmulq_f32(self.twiddle12im, x5m26); + let t_b10_6 = vmulq_f32(self.twiddle2im, x6m25); + let t_b10_7 = vmulq_f32(self.twiddle8im, x7m24); + let t_b10_8 = vmulq_f32(self.twiddle13im, x8m23); + let t_b10_9 = vmulq_f32(self.twiddle3im, x9m22); + let t_b10_10 = vmulq_f32(self.twiddle7im, x10m21); + let t_b10_11 = vmulq_f32(self.twiddle14im, x11m20); + let t_b10_12 = vmulq_f32(self.twiddle4im, x12m19); + let t_b10_13 = vmulq_f32(self.twiddle6im, x13m18); + let t_b10_14 = vmulq_f32(self.twiddle15im, x14m17); + let t_b10_15 = vmulq_f32(self.twiddle5im, x15m16); + let t_b11_1 = vmulq_f32(self.twiddle11im, x1m30); + let t_b11_2 = vmulq_f32(self.twiddle9im, x2m29); + let t_b11_3 = vmulq_f32(self.twiddle2im, x3m28); + let t_b11_4 = vmulq_f32(self.twiddle13im, x4m27); + let t_b11_5 = vmulq_f32(self.twiddle7im, x5m26); + let t_b11_6 = vmulq_f32(self.twiddle4im, x6m25); + let t_b11_7 = vmulq_f32(self.twiddle15im, x7m24); + let t_b11_8 = vmulq_f32(self.twiddle5im, x8m23); + let t_b11_9 = vmulq_f32(self.twiddle6im, x9m22); + let t_b11_10 = vmulq_f32(self.twiddle14im, x10m21); + let t_b11_11 = vmulq_f32(self.twiddle3im, x11m20); + let t_b11_12 = vmulq_f32(self.twiddle8im, x12m19); + let t_b11_13 = vmulq_f32(self.twiddle12im, x13m18); + let t_b11_14 = vmulq_f32(self.twiddle1im, x14m17); + let t_b11_15 = vmulq_f32(self.twiddle10im, x15m16); + let t_b12_1 = vmulq_f32(self.twiddle12im, x1m30); + let t_b12_2 = vmulq_f32(self.twiddle7im, x2m29); + let t_b12_3 = vmulq_f32(self.twiddle5im, x3m28); + let t_b12_4 = vmulq_f32(self.twiddle14im, x4m27); + let t_b12_5 = vmulq_f32(self.twiddle2im, x5m26); + let t_b12_6 = vmulq_f32(self.twiddle10im, x6m25); + let t_b12_7 = vmulq_f32(self.twiddle9im, x7m24); + let t_b12_8 = vmulq_f32(self.twiddle3im, x8m23); + let t_b12_9 = vmulq_f32(self.twiddle15im, x9m22); + let t_b12_10 = vmulq_f32(self.twiddle4im, x10m21); + let t_b12_11 = vmulq_f32(self.twiddle8im, x11m20); + let t_b12_12 = vmulq_f32(self.twiddle11im, x12m19); + let t_b12_13 = vmulq_f32(self.twiddle1im, x13m18); + let t_b12_14 = vmulq_f32(self.twiddle13im, x14m17); + let t_b12_15 = vmulq_f32(self.twiddle6im, x15m16); + let t_b13_1 = vmulq_f32(self.twiddle13im, x1m30); + let t_b13_2 = vmulq_f32(self.twiddle5im, x2m29); + let t_b13_3 = vmulq_f32(self.twiddle8im, x3m28); + let t_b13_4 = vmulq_f32(self.twiddle10im, x4m27); + let t_b13_5 = vmulq_f32(self.twiddle3im, x5m26); + let t_b13_6 = vmulq_f32(self.twiddle15im, x6m25); + let t_b13_7 = vmulq_f32(self.twiddle2im, x7m24); + let t_b13_8 = vmulq_f32(self.twiddle11im, x8m23); + let t_b13_9 = vmulq_f32(self.twiddle7im, x9m22); + let t_b13_10 = vmulq_f32(self.twiddle6im, x10m21); + let t_b13_11 = vmulq_f32(self.twiddle12im, x11m20); + let t_b13_12 = vmulq_f32(self.twiddle1im, x12m19); + let t_b13_13 = vmulq_f32(self.twiddle14im, x13m18); + let t_b13_14 = vmulq_f32(self.twiddle4im, x14m17); + let t_b13_15 = vmulq_f32(self.twiddle9im, x15m16); + let t_b14_1 = vmulq_f32(self.twiddle14im, x1m30); + let t_b14_2 = vmulq_f32(self.twiddle3im, x2m29); + let t_b14_3 = vmulq_f32(self.twiddle11im, x3m28); + let t_b14_4 = vmulq_f32(self.twiddle6im, x4m27); + let t_b14_5 = vmulq_f32(self.twiddle8im, x5m26); + let t_b14_6 = 
vmulq_f32(self.twiddle9im, x6m25); + let t_b14_7 = vmulq_f32(self.twiddle5im, x7m24); + let t_b14_8 = vmulq_f32(self.twiddle12im, x8m23); + let t_b14_9 = vmulq_f32(self.twiddle2im, x9m22); + let t_b14_10 = vmulq_f32(self.twiddle15im, x10m21); + let t_b14_11 = vmulq_f32(self.twiddle1im, x11m20); + let t_b14_12 = vmulq_f32(self.twiddle13im, x12m19); + let t_b14_13 = vmulq_f32(self.twiddle4im, x13m18); + let t_b14_14 = vmulq_f32(self.twiddle10im, x14m17); + let t_b14_15 = vmulq_f32(self.twiddle7im, x15m16); + let t_b15_1 = vmulq_f32(self.twiddle15im, x1m30); + let t_b15_2 = vmulq_f32(self.twiddle1im, x2m29); + let t_b15_3 = vmulq_f32(self.twiddle14im, x3m28); + let t_b15_4 = vmulq_f32(self.twiddle2im, x4m27); + let t_b15_5 = vmulq_f32(self.twiddle13im, x5m26); + let t_b15_6 = vmulq_f32(self.twiddle3im, x6m25); + let t_b15_7 = vmulq_f32(self.twiddle12im, x7m24); + let t_b15_8 = vmulq_f32(self.twiddle4im, x8m23); + let t_b15_9 = vmulq_f32(self.twiddle11im, x9m22); + let t_b15_10 = vmulq_f32(self.twiddle5im, x10m21); + let t_b15_11 = vmulq_f32(self.twiddle10im, x11m20); + let t_b15_12 = vmulq_f32(self.twiddle6im, x12m19); + let t_b15_13 = vmulq_f32(self.twiddle9im, x13m18); + let t_b15_14 = vmulq_f32(self.twiddle7im, x14m17); + let t_b15_15 = vmulq_f32(self.twiddle8im, x15m16); + + let x0 = values[0]; + let t_a1 = calc_f32!(x0 + t_a1_1 + t_a1_2 + t_a1_3 + t_a1_4 + t_a1_5 + t_a1_6 + t_a1_7 + t_a1_8 + t_a1_9 + t_a1_10 + t_a1_11 + t_a1_12 + t_a1_13 + t_a1_14 + t_a1_15); + let t_a2 = calc_f32!(x0 + t_a2_1 + t_a2_2 + t_a2_3 + t_a2_4 + t_a2_5 + t_a2_6 + t_a2_7 + t_a2_8 + t_a2_9 + t_a2_10 + t_a2_11 + t_a2_12 + t_a2_13 + t_a2_14 + t_a2_15); + let t_a3 = calc_f32!(x0 + t_a3_1 + t_a3_2 + t_a3_3 + t_a3_4 + t_a3_5 + t_a3_6 + t_a3_7 + t_a3_8 + t_a3_9 + t_a3_10 + t_a3_11 + t_a3_12 + t_a3_13 + t_a3_14 + t_a3_15); + let t_a4 = calc_f32!(x0 + t_a4_1 + t_a4_2 + t_a4_3 + t_a4_4 + t_a4_5 + t_a4_6 + t_a4_7 + t_a4_8 + t_a4_9 + t_a4_10 + t_a4_11 + t_a4_12 + t_a4_13 + t_a4_14 + t_a4_15); + let t_a5 = calc_f32!(x0 + t_a5_1 + t_a5_2 + t_a5_3 + t_a5_4 + t_a5_5 + t_a5_6 + t_a5_7 + t_a5_8 + t_a5_9 + t_a5_10 + t_a5_11 + t_a5_12 + t_a5_13 + t_a5_14 + t_a5_15); + let t_a6 = calc_f32!(x0 + t_a6_1 + t_a6_2 + t_a6_3 + t_a6_4 + t_a6_5 + t_a6_6 + t_a6_7 + t_a6_8 + t_a6_9 + t_a6_10 + t_a6_11 + t_a6_12 + t_a6_13 + t_a6_14 + t_a6_15); + let t_a7 = calc_f32!(x0 + t_a7_1 + t_a7_2 + t_a7_3 + t_a7_4 + t_a7_5 + t_a7_6 + t_a7_7 + t_a7_8 + t_a7_9 + t_a7_10 + t_a7_11 + t_a7_12 + t_a7_13 + t_a7_14 + t_a7_15); + let t_a8 = calc_f32!(x0 + t_a8_1 + t_a8_2 + t_a8_3 + t_a8_4 + t_a8_5 + t_a8_6 + t_a8_7 + t_a8_8 + t_a8_9 + t_a8_10 + t_a8_11 + t_a8_12 + t_a8_13 + t_a8_14 + t_a8_15); + let t_a9 = calc_f32!(x0 + t_a9_1 + t_a9_2 + t_a9_3 + t_a9_4 + t_a9_5 + t_a9_6 + t_a9_7 + t_a9_8 + t_a9_9 + t_a9_10 + t_a9_11 + t_a9_12 + t_a9_13 + t_a9_14 + t_a9_15); + let t_a10 = calc_f32!(x0 + t_a10_1 + t_a10_2 + t_a10_3 + t_a10_4 + t_a10_5 + t_a10_6 + t_a10_7 + t_a10_8 + t_a10_9 + t_a10_10 + t_a10_11 + t_a10_12 + t_a10_13 + t_a10_14 + t_a10_15); + let t_a11 = calc_f32!(x0 + t_a11_1 + t_a11_2 + t_a11_3 + t_a11_4 + t_a11_5 + t_a11_6 + t_a11_7 + t_a11_8 + t_a11_9 + t_a11_10 + t_a11_11 + t_a11_12 + t_a11_13 + t_a11_14 + t_a11_15); + let t_a12 = calc_f32!(x0 + t_a12_1 + t_a12_2 + t_a12_3 + t_a12_4 + t_a12_5 + t_a12_6 + t_a12_7 + t_a12_8 + t_a12_9 + t_a12_10 + t_a12_11 + t_a12_12 + t_a12_13 + t_a12_14 + t_a12_15); + let t_a13 = calc_f32!(x0 + t_a13_1 + t_a13_2 + t_a13_3 + t_a13_4 + t_a13_5 + t_a13_6 + t_a13_7 + t_a13_8 + t_a13_9 + t_a13_10 + t_a13_11 + t_a13_12 + t_a13_13 
+ t_a13_14 + t_a13_15); + let t_a14 = calc_f32!(x0 + t_a14_1 + t_a14_2 + t_a14_3 + t_a14_4 + t_a14_5 + t_a14_6 + t_a14_7 + t_a14_8 + t_a14_9 + t_a14_10 + t_a14_11 + t_a14_12 + t_a14_13 + t_a14_14 + t_a14_15); + let t_a15 = calc_f32!(x0 + t_a15_1 + t_a15_2 + t_a15_3 + t_a15_4 + t_a15_5 + t_a15_6 + t_a15_7 + t_a15_8 + t_a15_9 + t_a15_10 + t_a15_11 + t_a15_12 + t_a15_13 + t_a15_14 + t_a15_15); + + let t_b1 = calc_f32!(t_b1_1 + t_b1_2 + t_b1_3 + t_b1_4 + t_b1_5 + t_b1_6 + t_b1_7 + t_b1_8 + t_b1_9 + t_b1_10 + t_b1_11 + t_b1_12 + t_b1_13 + t_b1_14 + t_b1_15); + let t_b2 = calc_f32!(t_b2_1 + t_b2_2 + t_b2_3 + t_b2_4 + t_b2_5 + t_b2_6 + t_b2_7 - t_b2_8 - t_b2_9 - t_b2_10 - t_b2_11 - t_b2_12 - t_b2_13 - t_b2_14 - t_b2_15); + let t_b3 = calc_f32!(t_b3_1 + t_b3_2 + t_b3_3 + t_b3_4 + t_b3_5 - t_b3_6 - t_b3_7 - t_b3_8 - t_b3_9 - t_b3_10 + t_b3_11 + t_b3_12 + t_b3_13 + t_b3_14 + t_b3_15); + let t_b4 = calc_f32!(t_b4_1 + t_b4_2 + t_b4_3 - t_b4_4 - t_b4_5 - t_b4_6 - t_b4_7 + t_b4_8 + t_b4_9 + t_b4_10 + t_b4_11 - t_b4_12 - t_b4_13 - t_b4_14 - t_b4_15); + let t_b5 = calc_f32!(t_b5_1 + t_b5_2 + t_b5_3 - t_b5_4 - t_b5_5 - t_b5_6 + t_b5_7 + t_b5_8 + t_b5_9 - t_b5_10 - t_b5_11 - t_b5_12 + t_b5_13 + t_b5_14 + t_b5_15); + let t_b6 = calc_f32!(t_b6_1 + t_b6_2 - t_b6_3 - t_b6_4 - t_b6_5 + t_b6_6 + t_b6_7 - t_b6_8 - t_b6_9 - t_b6_10 + t_b6_11 + t_b6_12 - t_b6_13 - t_b6_14 - t_b6_15); + let t_b7 = calc_f32!(t_b7_1 + t_b7_2 - t_b7_3 - t_b7_4 + t_b7_5 + t_b7_6 - t_b7_7 - t_b7_8 + t_b7_9 + t_b7_10 + t_b7_11 - t_b7_12 - t_b7_13 + t_b7_14 + t_b7_15); + let t_b8 = calc_f32!(t_b8_1 - t_b8_2 - t_b8_3 + t_b8_4 + t_b8_5 - t_b8_6 - t_b8_7 + t_b8_8 + t_b8_9 - t_b8_10 - t_b8_11 + t_b8_12 + t_b8_13 - t_b8_14 - t_b8_15); + let t_b9 = calc_f32!(t_b9_1 - t_b9_2 - t_b9_3 + t_b9_4 + t_b9_5 - t_b9_6 + t_b9_7 + t_b9_8 - t_b9_9 - t_b9_10 + t_b9_11 + t_b9_12 - t_b9_13 + t_b9_14 + t_b9_15); + let t_b10 = calc_f32!(t_b10_1 - t_b10_2 - t_b10_3 + t_b10_4 - t_b10_5 - t_b10_6 + t_b10_7 - t_b10_8 - t_b10_9 + t_b10_10 - t_b10_11 - t_b10_12 + t_b10_13 - t_b10_14 - t_b10_15); + let t_b11 = calc_f32!(t_b11_1 - t_b11_2 + t_b11_3 + t_b11_4 - t_b11_5 + t_b11_6 + t_b11_7 - t_b11_8 + t_b11_9 - t_b11_10 - t_b11_11 + t_b11_12 - t_b11_13 - t_b11_14 + t_b11_15); + let t_b12 = calc_f32!(t_b12_1 - t_b12_2 + t_b12_3 - t_b12_4 - t_b12_5 + t_b12_6 - t_b12_7 + t_b12_8 + t_b12_9 - t_b12_10 + t_b12_11 - t_b12_12 + t_b12_13 + t_b12_14 - t_b12_15); + let t_b13 = calc_f32!(t_b13_1 - t_b13_2 + t_b13_3 - t_b13_4 + t_b13_5 - t_b13_6 - t_b13_7 + t_b13_8 - t_b13_9 + t_b13_10 - t_b13_11 + t_b13_12 + t_b13_13 - t_b13_14 + t_b13_15); + let t_b14 = calc_f32!(t_b14_1 - t_b14_2 + t_b14_3 - t_b14_4 + t_b14_5 - t_b14_6 + t_b14_7 - t_b14_8 + t_b14_9 - t_b14_10 - t_b14_11 + t_b14_12 - t_b14_13 + t_b14_14 - t_b14_15); + let t_b15 = calc_f32!(t_b15_1 - t_b15_2 + t_b15_3 - t_b15_4 + t_b15_5 - t_b15_6 + t_b15_7 - t_b15_8 + t_b15_9 - t_b15_10 + t_b15_11 - t_b15_12 + t_b15_13 - t_b15_14 + t_b15_15); + + let t_b1_rot = self.rotate.rotate_both(t_b1); + let t_b2_rot = self.rotate.rotate_both(t_b2); + let t_b3_rot = self.rotate.rotate_both(t_b3); + let t_b4_rot = self.rotate.rotate_both(t_b4); + let t_b5_rot = self.rotate.rotate_both(t_b5); + let t_b6_rot = self.rotate.rotate_both(t_b6); + let t_b7_rot = self.rotate.rotate_both(t_b7); + let t_b8_rot = self.rotate.rotate_both(t_b8); + let t_b9_rot = self.rotate.rotate_both(t_b9); + let t_b10_rot = self.rotate.rotate_both(t_b10); + let t_b11_rot = self.rotate.rotate_both(t_b11); + let t_b12_rot = self.rotate.rotate_both(t_b12); + let t_b13_rot = 
self.rotate.rotate_both(t_b13);
+        let t_b14_rot = self.rotate.rotate_both(t_b14);
+        let t_b15_rot = self.rotate.rotate_both(t_b15);
+
+        let y0 = calc_f32!(x0 + x1p30 + x2p29 + x3p28 + x4p27 + x5p26 + x6p25 + x7p24 + x8p23 + x9p22 + x10p21 + x11p20 + x12p19 + x13p18 + x14p17 + x15p16);
+        let [y1, y30] = parallel_fft2_interleaved_f32(t_a1, t_b1_rot);
+        let [y2, y29] = parallel_fft2_interleaved_f32(t_a2, t_b2_rot);
+        let [y3, y28] = parallel_fft2_interleaved_f32(t_a3, t_b3_rot);
+        let [y4, y27] = parallel_fft2_interleaved_f32(t_a4, t_b4_rot);
+        let [y5, y26] = parallel_fft2_interleaved_f32(t_a5, t_b5_rot);
+        let [y6, y25] = parallel_fft2_interleaved_f32(t_a6, t_b6_rot);
+        let [y7, y24] = parallel_fft2_interleaved_f32(t_a7, t_b7_rot);
+        let [y8, y23] = parallel_fft2_interleaved_f32(t_a8, t_b8_rot);
+        let [y9, y22] = parallel_fft2_interleaved_f32(t_a9, t_b9_rot);
+        let [y10, y21] = parallel_fft2_interleaved_f32(t_a10, t_b10_rot);
+        let [y11, y20] = parallel_fft2_interleaved_f32(t_a11, t_b11_rot);
+        let [y12, y19] = parallel_fft2_interleaved_f32(t_a12, t_b12_rot);
+        let [y13, y18] = parallel_fft2_interleaved_f32(t_a13, t_b13_rot);
+        let [y14, y17] = parallel_fft2_interleaved_f32(t_a14, t_b14_rot);
+        let [y15, y16] = parallel_fft2_interleaved_f32(t_a15, t_b15_rot);
+        [y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15, y16, y17, y18, y19, y20, y21, y22, y23, y24, y25, y26, y27, y28, y29, y30]
+    }
+}
+
+//  _____ _             __   _  _   _     _ _
+// |___ // |           / /_ | || | | |__ (_) |_
+//   |_ \| |   _____  | '_ \| || |_| '_ \| | __|
+//  ___) | |  |_____| | (_) |__   _| |_) | | |_
+// |____/|_|           \___/   |_| |_.__/|_|\__|
+//
+
+pub struct NeonF64Butterfly31<T> {
+    direction: FftDirection,
+    _phantom: std::marker::PhantomData<T>,
+    rotate: Rotate90F64,
+    twiddle1re: float64x2_t,
+    twiddle1im: float64x2_t,
+    twiddle2re: float64x2_t,
+    twiddle2im: float64x2_t,
+    twiddle3re: float64x2_t,
+    twiddle3im: float64x2_t,
+    twiddle4re: float64x2_t,
+    twiddle4im: float64x2_t,
+    twiddle5re: float64x2_t,
+    twiddle5im: float64x2_t,
+    twiddle6re: float64x2_t,
+    twiddle6im: float64x2_t,
+    twiddle7re: float64x2_t,
+    twiddle7im: float64x2_t,
+    twiddle8re: float64x2_t,
+    twiddle8im: float64x2_t,
+    twiddle9re: float64x2_t,
+    twiddle9im: float64x2_t,
+    twiddle10re: float64x2_t,
+    twiddle10im: float64x2_t,
+    twiddle11re: float64x2_t,
+    twiddle11im: float64x2_t,
+    twiddle12re: float64x2_t,
+    twiddle12im: float64x2_t,
+    twiddle13re: float64x2_t,
+    twiddle13im: float64x2_t,
+    twiddle14re: float64x2_t,
+    twiddle14im: float64x2_t,
+    twiddle15re: float64x2_t,
+    twiddle15im: float64x2_t,
+}
+
+boilerplate_fft_neon_f64_butterfly!(NeonF64Butterfly31, 31, |this: &NeonF64Butterfly31<_>| this
+    .direction);
+boilerplate_fft_neon_common_butterfly!(NeonF64Butterfly31, 31, |this: &NeonF64Butterfly31<_>| this
+    .direction);
+impl<T: FftNum> NeonF64Butterfly31<T> {
+    #[inline(always)]
+    pub fn new(direction: FftDirection) -> Self {
+        assert_f64::<T>();
+        let rotate = Rotate90F64::new(true);
+        let tw1: Complex<f64> = twiddles::compute_twiddle(1, 31, direction);
+        let tw2: Complex<f64> = twiddles::compute_twiddle(2, 31, direction);
+        let tw3: Complex<f64> = twiddles::compute_twiddle(3, 31, direction);
+        let tw4: Complex<f64> = twiddles::compute_twiddle(4, 31, direction);
+        let tw5: Complex<f64> = twiddles::compute_twiddle(5, 31, direction);
+        let tw6: Complex<f64> = twiddles::compute_twiddle(6, 31, direction);
+        let tw7: Complex<f64> = twiddles::compute_twiddle(7, 31, direction);
+        let tw8: Complex<f64> = twiddles::compute_twiddle(8, 31, direction);
+        let tw9: Complex<f64> = twiddles::compute_twiddle(9, 31, direction);
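+        // Note: only the twiddle factors for k = 1..15 are stored. compute_twiddle(k, 31, direction)
+        // gives exp(+/- 2*pi*i*k/31) with the sign chosen by `direction`; the remaining 31st roots
+        // of unity are the complex conjugates of these, which perform_fft_direct exploits through
+        // its x_j +/- x_(31-j) sums and differences.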
+ let tw10: Complex = twiddles::compute_twiddle(10, 31, direction); + let tw11: Complex = twiddles::compute_twiddle(11, 31, direction); + let tw12: Complex = twiddles::compute_twiddle(12, 31, direction); + let tw13: Complex = twiddles::compute_twiddle(13, 31, direction); + let tw14: Complex = twiddles::compute_twiddle(14, 31, direction); + let tw15: Complex = twiddles::compute_twiddle(15, 31, direction); + let twiddle1re = unsafe { vmovq_n_f64(tw1.re) }; + let twiddle1im = unsafe { vmovq_n_f64(tw1.im) }; + let twiddle2re = unsafe { vmovq_n_f64(tw2.re) }; + let twiddle2im = unsafe { vmovq_n_f64(tw2.im) }; + let twiddle3re = unsafe { vmovq_n_f64(tw3.re) }; + let twiddle3im = unsafe { vmovq_n_f64(tw3.im) }; + let twiddle4re = unsafe { vmovq_n_f64(tw4.re) }; + let twiddle4im = unsafe { vmovq_n_f64(tw4.im) }; + let twiddle5re = unsafe { vmovq_n_f64(tw5.re) }; + let twiddle5im = unsafe { vmovq_n_f64(tw5.im) }; + let twiddle6re = unsafe { vmovq_n_f64(tw6.re) }; + let twiddle6im = unsafe { vmovq_n_f64(tw6.im) }; + let twiddle7re = unsafe { vmovq_n_f64(tw7.re) }; + let twiddle7im = unsafe { vmovq_n_f64(tw7.im) }; + let twiddle8re = unsafe { vmovq_n_f64(tw8.re) }; + let twiddle8im = unsafe { vmovq_n_f64(tw8.im) }; + let twiddle9re = unsafe { vmovq_n_f64(tw9.re) }; + let twiddle9im = unsafe { vmovq_n_f64(tw9.im) }; + let twiddle10re = unsafe { vmovq_n_f64(tw10.re) }; + let twiddle10im = unsafe { vmovq_n_f64(tw10.im) }; + let twiddle11re = unsafe { vmovq_n_f64(tw11.re) }; + let twiddle11im = unsafe { vmovq_n_f64(tw11.im) }; + let twiddle12re = unsafe { vmovq_n_f64(tw12.re) }; + let twiddle12im = unsafe { vmovq_n_f64(tw12.im) }; + let twiddle13re = unsafe { vmovq_n_f64(tw13.re) }; + let twiddle13im = unsafe { vmovq_n_f64(tw13.im) }; + let twiddle14re = unsafe { vmovq_n_f64(tw14.re) }; + let twiddle14im = unsafe { vmovq_n_f64(tw14.im) }; + let twiddle15re = unsafe { vmovq_n_f64(tw15.re) }; + let twiddle15im = unsafe { vmovq_n_f64(tw15.im) }; + + Self { + direction, + _phantom: std::marker::PhantomData, + rotate, + twiddle1re, + twiddle1im, + twiddle2re, + twiddle2im, + twiddle3re, + twiddle3im, + twiddle4re, + twiddle4im, + twiddle5re, + twiddle5im, + twiddle6re, + twiddle6im, + twiddle7re, + twiddle7im, + twiddle8re, + twiddle8im, + twiddle9re, + twiddle9im, + twiddle10re, + twiddle10im, + twiddle11re, + twiddle11im, + twiddle12re, + twiddle12im, + twiddle13re, + twiddle13im, + twiddle14re, + twiddle14im, + twiddle15re, + twiddle15im, + } + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_contiguous( + &self, + input: RawSlice>, + output: RawSliceMut>, + ) { + let values = read_complex_to_array!(input, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30}); + + let out = self.perform_fft_direct(values); + + write_complex_to_array!(out, output, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30}); + } + + #[inline(always)] + pub(crate) unsafe fn perform_fft_direct(&self, values: [float64x2_t; 31]) -> [float64x2_t; 31] { + let [x1p30, x1m30] = solo_fft2_f64(values[1], values[30]); + let [x2p29, x2m29] = solo_fft2_f64(values[2], values[29]); + let [x3p28, x3m28] = solo_fft2_f64(values[3], values[28]); + let [x4p27, x4m27] = solo_fft2_f64(values[4], values[27]); + let [x5p26, x5m26] = solo_fft2_f64(values[5], values[26]); + let [x6p25, x6m25] = solo_fft2_f64(values[6], values[25]); + let [x7p24, x7m24] = solo_fft2_f64(values[7], values[24]); + let [x8p23, x8m23] = 
solo_fft2_f64(values[8], values[23]); + let [x9p22, x9m22] = solo_fft2_f64(values[9], values[22]); + let [x10p21, x10m21] = solo_fft2_f64(values[10], values[21]); + let [x11p20, x11m20] = solo_fft2_f64(values[11], values[20]); + let [x12p19, x12m19] = solo_fft2_f64(values[12], values[19]); + let [x13p18, x13m18] = solo_fft2_f64(values[13], values[18]); + let [x14p17, x14m17] = solo_fft2_f64(values[14], values[17]); + let [x15p16, x15m16] = solo_fft2_f64(values[15], values[16]); + + let t_a1_1 = vmulq_f64(self.twiddle1re, x1p30); + let t_a1_2 = vmulq_f64(self.twiddle2re, x2p29); + let t_a1_3 = vmulq_f64(self.twiddle3re, x3p28); + let t_a1_4 = vmulq_f64(self.twiddle4re, x4p27); + let t_a1_5 = vmulq_f64(self.twiddle5re, x5p26); + let t_a1_6 = vmulq_f64(self.twiddle6re, x6p25); + let t_a1_7 = vmulq_f64(self.twiddle7re, x7p24); + let t_a1_8 = vmulq_f64(self.twiddle8re, x8p23); + let t_a1_9 = vmulq_f64(self.twiddle9re, x9p22); + let t_a1_10 = vmulq_f64(self.twiddle10re, x10p21); + let t_a1_11 = vmulq_f64(self.twiddle11re, x11p20); + let t_a1_12 = vmulq_f64(self.twiddle12re, x12p19); + let t_a1_13 = vmulq_f64(self.twiddle13re, x13p18); + let t_a1_14 = vmulq_f64(self.twiddle14re, x14p17); + let t_a1_15 = vmulq_f64(self.twiddle15re, x15p16); + let t_a2_1 = vmulq_f64(self.twiddle2re, x1p30); + let t_a2_2 = vmulq_f64(self.twiddle4re, x2p29); + let t_a2_3 = vmulq_f64(self.twiddle6re, x3p28); + let t_a2_4 = vmulq_f64(self.twiddle8re, x4p27); + let t_a2_5 = vmulq_f64(self.twiddle10re, x5p26); + let t_a2_6 = vmulq_f64(self.twiddle12re, x6p25); + let t_a2_7 = vmulq_f64(self.twiddle14re, x7p24); + let t_a2_8 = vmulq_f64(self.twiddle15re, x8p23); + let t_a2_9 = vmulq_f64(self.twiddle13re, x9p22); + let t_a2_10 = vmulq_f64(self.twiddle11re, x10p21); + let t_a2_11 = vmulq_f64(self.twiddle9re, x11p20); + let t_a2_12 = vmulq_f64(self.twiddle7re, x12p19); + let t_a2_13 = vmulq_f64(self.twiddle5re, x13p18); + let t_a2_14 = vmulq_f64(self.twiddle3re, x14p17); + let t_a2_15 = vmulq_f64(self.twiddle1re, x15p16); + let t_a3_1 = vmulq_f64(self.twiddle3re, x1p30); + let t_a3_2 = vmulq_f64(self.twiddle6re, x2p29); + let t_a3_3 = vmulq_f64(self.twiddle9re, x3p28); + let t_a3_4 = vmulq_f64(self.twiddle12re, x4p27); + let t_a3_5 = vmulq_f64(self.twiddle15re, x5p26); + let t_a3_6 = vmulq_f64(self.twiddle13re, x6p25); + let t_a3_7 = vmulq_f64(self.twiddle10re, x7p24); + let t_a3_8 = vmulq_f64(self.twiddle7re, x8p23); + let t_a3_9 = vmulq_f64(self.twiddle4re, x9p22); + let t_a3_10 = vmulq_f64(self.twiddle1re, x10p21); + let t_a3_11 = vmulq_f64(self.twiddle2re, x11p20); + let t_a3_12 = vmulq_f64(self.twiddle5re, x12p19); + let t_a3_13 = vmulq_f64(self.twiddle8re, x13p18); + let t_a3_14 = vmulq_f64(self.twiddle11re, x14p17); + let t_a3_15 = vmulq_f64(self.twiddle14re, x15p16); + let t_a4_1 = vmulq_f64(self.twiddle4re, x1p30); + let t_a4_2 = vmulq_f64(self.twiddle8re, x2p29); + let t_a4_3 = vmulq_f64(self.twiddle12re, x3p28); + let t_a4_4 = vmulq_f64(self.twiddle15re, x4p27); + let t_a4_5 = vmulq_f64(self.twiddle11re, x5p26); + let t_a4_6 = vmulq_f64(self.twiddle7re, x6p25); + let t_a4_7 = vmulq_f64(self.twiddle3re, x7p24); + let t_a4_8 = vmulq_f64(self.twiddle1re, x8p23); + let t_a4_9 = vmulq_f64(self.twiddle5re, x9p22); + let t_a4_10 = vmulq_f64(self.twiddle9re, x10p21); + let t_a4_11 = vmulq_f64(self.twiddle13re, x11p20); + let t_a4_12 = vmulq_f64(self.twiddle14re, x12p19); + let t_a4_13 = vmulq_f64(self.twiddle10re, x13p18); + let t_a4_14 = vmulq_f64(self.twiddle6re, x14p17); + let t_a4_15 = vmulq_f64(self.twiddle2re, 
x15p16); + let t_a5_1 = vmulq_f64(self.twiddle5re, x1p30); + let t_a5_2 = vmulq_f64(self.twiddle10re, x2p29); + let t_a5_3 = vmulq_f64(self.twiddle15re, x3p28); + let t_a5_4 = vmulq_f64(self.twiddle11re, x4p27); + let t_a5_5 = vmulq_f64(self.twiddle6re, x5p26); + let t_a5_6 = vmulq_f64(self.twiddle1re, x6p25); + let t_a5_7 = vmulq_f64(self.twiddle4re, x7p24); + let t_a5_8 = vmulq_f64(self.twiddle9re, x8p23); + let t_a5_9 = vmulq_f64(self.twiddle14re, x9p22); + let t_a5_10 = vmulq_f64(self.twiddle12re, x10p21); + let t_a5_11 = vmulq_f64(self.twiddle7re, x11p20); + let t_a5_12 = vmulq_f64(self.twiddle2re, x12p19); + let t_a5_13 = vmulq_f64(self.twiddle3re, x13p18); + let t_a5_14 = vmulq_f64(self.twiddle8re, x14p17); + let t_a5_15 = vmulq_f64(self.twiddle13re, x15p16); + let t_a6_1 = vmulq_f64(self.twiddle6re, x1p30); + let t_a6_2 = vmulq_f64(self.twiddle12re, x2p29); + let t_a6_3 = vmulq_f64(self.twiddle13re, x3p28); + let t_a6_4 = vmulq_f64(self.twiddle7re, x4p27); + let t_a6_5 = vmulq_f64(self.twiddle1re, x5p26); + let t_a6_6 = vmulq_f64(self.twiddle5re, x6p25); + let t_a6_7 = vmulq_f64(self.twiddle11re, x7p24); + let t_a6_8 = vmulq_f64(self.twiddle14re, x8p23); + let t_a6_9 = vmulq_f64(self.twiddle8re, x9p22); + let t_a6_10 = vmulq_f64(self.twiddle2re, x10p21); + let t_a6_11 = vmulq_f64(self.twiddle4re, x11p20); + let t_a6_12 = vmulq_f64(self.twiddle10re, x12p19); + let t_a6_13 = vmulq_f64(self.twiddle15re, x13p18); + let t_a6_14 = vmulq_f64(self.twiddle9re, x14p17); + let t_a6_15 = vmulq_f64(self.twiddle3re, x15p16); + let t_a7_1 = vmulq_f64(self.twiddle7re, x1p30); + let t_a7_2 = vmulq_f64(self.twiddle14re, x2p29); + let t_a7_3 = vmulq_f64(self.twiddle10re, x3p28); + let t_a7_4 = vmulq_f64(self.twiddle3re, x4p27); + let t_a7_5 = vmulq_f64(self.twiddle4re, x5p26); + let t_a7_6 = vmulq_f64(self.twiddle11re, x6p25); + let t_a7_7 = vmulq_f64(self.twiddle13re, x7p24); + let t_a7_8 = vmulq_f64(self.twiddle6re, x8p23); + let t_a7_9 = vmulq_f64(self.twiddle1re, x9p22); + let t_a7_10 = vmulq_f64(self.twiddle8re, x10p21); + let t_a7_11 = vmulq_f64(self.twiddle15re, x11p20); + let t_a7_12 = vmulq_f64(self.twiddle9re, x12p19); + let t_a7_13 = vmulq_f64(self.twiddle2re, x13p18); + let t_a7_14 = vmulq_f64(self.twiddle5re, x14p17); + let t_a7_15 = vmulq_f64(self.twiddle12re, x15p16); + let t_a8_1 = vmulq_f64(self.twiddle8re, x1p30); + let t_a8_2 = vmulq_f64(self.twiddle15re, x2p29); + let t_a8_3 = vmulq_f64(self.twiddle7re, x3p28); + let t_a8_4 = vmulq_f64(self.twiddle1re, x4p27); + let t_a8_5 = vmulq_f64(self.twiddle9re, x5p26); + let t_a8_6 = vmulq_f64(self.twiddle14re, x6p25); + let t_a8_7 = vmulq_f64(self.twiddle6re, x7p24); + let t_a8_8 = vmulq_f64(self.twiddle2re, x8p23); + let t_a8_9 = vmulq_f64(self.twiddle10re, x9p22); + let t_a8_10 = vmulq_f64(self.twiddle13re, x10p21); + let t_a8_11 = vmulq_f64(self.twiddle5re, x11p20); + let t_a8_12 = vmulq_f64(self.twiddle3re, x12p19); + let t_a8_13 = vmulq_f64(self.twiddle11re, x13p18); + let t_a8_14 = vmulq_f64(self.twiddle12re, x14p17); + let t_a8_15 = vmulq_f64(self.twiddle4re, x15p16); + let t_a9_1 = vmulq_f64(self.twiddle9re, x1p30); + let t_a9_2 = vmulq_f64(self.twiddle13re, x2p29); + let t_a9_3 = vmulq_f64(self.twiddle4re, x3p28); + let t_a9_4 = vmulq_f64(self.twiddle5re, x4p27); + let t_a9_5 = vmulq_f64(self.twiddle14re, x5p26); + let t_a9_6 = vmulq_f64(self.twiddle8re, x6p25); + let t_a9_7 = vmulq_f64(self.twiddle1re, x7p24); + let t_a9_8 = vmulq_f64(self.twiddle10re, x8p23); + let t_a9_9 = vmulq_f64(self.twiddle12re, x9p22); + let t_a9_10 = 
vmulq_f64(self.twiddle3re, x10p21); + let t_a9_11 = vmulq_f64(self.twiddle6re, x11p20); + let t_a9_12 = vmulq_f64(self.twiddle15re, x12p19); + let t_a9_13 = vmulq_f64(self.twiddle7re, x13p18); + let t_a9_14 = vmulq_f64(self.twiddle2re, x14p17); + let t_a9_15 = vmulq_f64(self.twiddle11re, x15p16); + let t_a10_1 = vmulq_f64(self.twiddle10re, x1p30); + let t_a10_2 = vmulq_f64(self.twiddle11re, x2p29); + let t_a10_3 = vmulq_f64(self.twiddle1re, x3p28); + let t_a10_4 = vmulq_f64(self.twiddle9re, x4p27); + let t_a10_5 = vmulq_f64(self.twiddle12re, x5p26); + let t_a10_6 = vmulq_f64(self.twiddle2re, x6p25); + let t_a10_7 = vmulq_f64(self.twiddle8re, x7p24); + let t_a10_8 = vmulq_f64(self.twiddle13re, x8p23); + let t_a10_9 = vmulq_f64(self.twiddle3re, x9p22); + let t_a10_10 = vmulq_f64(self.twiddle7re, x10p21); + let t_a10_11 = vmulq_f64(self.twiddle14re, x11p20); + let t_a10_12 = vmulq_f64(self.twiddle4re, x12p19); + let t_a10_13 = vmulq_f64(self.twiddle6re, x13p18); + let t_a10_14 = vmulq_f64(self.twiddle15re, x14p17); + let t_a10_15 = vmulq_f64(self.twiddle5re, x15p16); + let t_a11_1 = vmulq_f64(self.twiddle11re, x1p30); + let t_a11_2 = vmulq_f64(self.twiddle9re, x2p29); + let t_a11_3 = vmulq_f64(self.twiddle2re, x3p28); + let t_a11_4 = vmulq_f64(self.twiddle13re, x4p27); + let t_a11_5 = vmulq_f64(self.twiddle7re, x5p26); + let t_a11_6 = vmulq_f64(self.twiddle4re, x6p25); + let t_a11_7 = vmulq_f64(self.twiddle15re, x7p24); + let t_a11_8 = vmulq_f64(self.twiddle5re, x8p23); + let t_a11_9 = vmulq_f64(self.twiddle6re, x9p22); + let t_a11_10 = vmulq_f64(self.twiddle14re, x10p21); + let t_a11_11 = vmulq_f64(self.twiddle3re, x11p20); + let t_a11_12 = vmulq_f64(self.twiddle8re, x12p19); + let t_a11_13 = vmulq_f64(self.twiddle12re, x13p18); + let t_a11_14 = vmulq_f64(self.twiddle1re, x14p17); + let t_a11_15 = vmulq_f64(self.twiddle10re, x15p16); + let t_a12_1 = vmulq_f64(self.twiddle12re, x1p30); + let t_a12_2 = vmulq_f64(self.twiddle7re, x2p29); + let t_a12_3 = vmulq_f64(self.twiddle5re, x3p28); + let t_a12_4 = vmulq_f64(self.twiddle14re, x4p27); + let t_a12_5 = vmulq_f64(self.twiddle2re, x5p26); + let t_a12_6 = vmulq_f64(self.twiddle10re, x6p25); + let t_a12_7 = vmulq_f64(self.twiddle9re, x7p24); + let t_a12_8 = vmulq_f64(self.twiddle3re, x8p23); + let t_a12_9 = vmulq_f64(self.twiddle15re, x9p22); + let t_a12_10 = vmulq_f64(self.twiddle4re, x10p21); + let t_a12_11 = vmulq_f64(self.twiddle8re, x11p20); + let t_a12_12 = vmulq_f64(self.twiddle11re, x12p19); + let t_a12_13 = vmulq_f64(self.twiddle1re, x13p18); + let t_a12_14 = vmulq_f64(self.twiddle13re, x14p17); + let t_a12_15 = vmulq_f64(self.twiddle6re, x15p16); + let t_a13_1 = vmulq_f64(self.twiddle13re, x1p30); + let t_a13_2 = vmulq_f64(self.twiddle5re, x2p29); + let t_a13_3 = vmulq_f64(self.twiddle8re, x3p28); + let t_a13_4 = vmulq_f64(self.twiddle10re, x4p27); + let t_a13_5 = vmulq_f64(self.twiddle3re, x5p26); + let t_a13_6 = vmulq_f64(self.twiddle15re, x6p25); + let t_a13_7 = vmulq_f64(self.twiddle2re, x7p24); + let t_a13_8 = vmulq_f64(self.twiddle11re, x8p23); + let t_a13_9 = vmulq_f64(self.twiddle7re, x9p22); + let t_a13_10 = vmulq_f64(self.twiddle6re, x10p21); + let t_a13_11 = vmulq_f64(self.twiddle12re, x11p20); + let t_a13_12 = vmulq_f64(self.twiddle1re, x12p19); + let t_a13_13 = vmulq_f64(self.twiddle14re, x13p18); + let t_a13_14 = vmulq_f64(self.twiddle4re, x14p17); + let t_a13_15 = vmulq_f64(self.twiddle9re, x15p16); + let t_a14_1 = vmulq_f64(self.twiddle14re, x1p30); + let t_a14_2 = vmulq_f64(self.twiddle3re, x2p29); + let t_a14_3 = 
vmulq_f64(self.twiddle11re, x3p28); + let t_a14_4 = vmulq_f64(self.twiddle6re, x4p27); + let t_a14_5 = vmulq_f64(self.twiddle8re, x5p26); + let t_a14_6 = vmulq_f64(self.twiddle9re, x6p25); + let t_a14_7 = vmulq_f64(self.twiddle5re, x7p24); + let t_a14_8 = vmulq_f64(self.twiddle12re, x8p23); + let t_a14_9 = vmulq_f64(self.twiddle2re, x9p22); + let t_a14_10 = vmulq_f64(self.twiddle15re, x10p21); + let t_a14_11 = vmulq_f64(self.twiddle1re, x11p20); + let t_a14_12 = vmulq_f64(self.twiddle13re, x12p19); + let t_a14_13 = vmulq_f64(self.twiddle4re, x13p18); + let t_a14_14 = vmulq_f64(self.twiddle10re, x14p17); + let t_a14_15 = vmulq_f64(self.twiddle7re, x15p16); + let t_a15_1 = vmulq_f64(self.twiddle15re, x1p30); + let t_a15_2 = vmulq_f64(self.twiddle1re, x2p29); + let t_a15_3 = vmulq_f64(self.twiddle14re, x3p28); + let t_a15_4 = vmulq_f64(self.twiddle2re, x4p27); + let t_a15_5 = vmulq_f64(self.twiddle13re, x5p26); + let t_a15_6 = vmulq_f64(self.twiddle3re, x6p25); + let t_a15_7 = vmulq_f64(self.twiddle12re, x7p24); + let t_a15_8 = vmulq_f64(self.twiddle4re, x8p23); + let t_a15_9 = vmulq_f64(self.twiddle11re, x9p22); + let t_a15_10 = vmulq_f64(self.twiddle5re, x10p21); + let t_a15_11 = vmulq_f64(self.twiddle10re, x11p20); + let t_a15_12 = vmulq_f64(self.twiddle6re, x12p19); + let t_a15_13 = vmulq_f64(self.twiddle9re, x13p18); + let t_a15_14 = vmulq_f64(self.twiddle7re, x14p17); + let t_a15_15 = vmulq_f64(self.twiddle8re, x15p16); + + let t_b1_1 = vmulq_f64(self.twiddle1im, x1m30); + let t_b1_2 = vmulq_f64(self.twiddle2im, x2m29); + let t_b1_3 = vmulq_f64(self.twiddle3im, x3m28); + let t_b1_4 = vmulq_f64(self.twiddle4im, x4m27); + let t_b1_5 = vmulq_f64(self.twiddle5im, x5m26); + let t_b1_6 = vmulq_f64(self.twiddle6im, x6m25); + let t_b1_7 = vmulq_f64(self.twiddle7im, x7m24); + let t_b1_8 = vmulq_f64(self.twiddle8im, x8m23); + let t_b1_9 = vmulq_f64(self.twiddle9im, x9m22); + let t_b1_10 = vmulq_f64(self.twiddle10im, x10m21); + let t_b1_11 = vmulq_f64(self.twiddle11im, x11m20); + let t_b1_12 = vmulq_f64(self.twiddle12im, x12m19); + let t_b1_13 = vmulq_f64(self.twiddle13im, x13m18); + let t_b1_14 = vmulq_f64(self.twiddle14im, x14m17); + let t_b1_15 = vmulq_f64(self.twiddle15im, x15m16); + let t_b2_1 = vmulq_f64(self.twiddle2im, x1m30); + let t_b2_2 = vmulq_f64(self.twiddle4im, x2m29); + let t_b2_3 = vmulq_f64(self.twiddle6im, x3m28); + let t_b2_4 = vmulq_f64(self.twiddle8im, x4m27); + let t_b2_5 = vmulq_f64(self.twiddle10im, x5m26); + let t_b2_6 = vmulq_f64(self.twiddle12im, x6m25); + let t_b2_7 = vmulq_f64(self.twiddle14im, x7m24); + let t_b2_8 = vmulq_f64(self.twiddle15im, x8m23); + let t_b2_9 = vmulq_f64(self.twiddle13im, x9m22); + let t_b2_10 = vmulq_f64(self.twiddle11im, x10m21); + let t_b2_11 = vmulq_f64(self.twiddle9im, x11m20); + let t_b2_12 = vmulq_f64(self.twiddle7im, x12m19); + let t_b2_13 = vmulq_f64(self.twiddle5im, x13m18); + let t_b2_14 = vmulq_f64(self.twiddle3im, x14m17); + let t_b2_15 = vmulq_f64(self.twiddle1im, x15m16); + let t_b3_1 = vmulq_f64(self.twiddle3im, x1m30); + let t_b3_2 = vmulq_f64(self.twiddle6im, x2m29); + let t_b3_3 = vmulq_f64(self.twiddle9im, x3m28); + let t_b3_4 = vmulq_f64(self.twiddle12im, x4m27); + let t_b3_5 = vmulq_f64(self.twiddle15im, x5m26); + let t_b3_6 = vmulq_f64(self.twiddle13im, x6m25); + let t_b3_7 = vmulq_f64(self.twiddle10im, x7m24); + let t_b3_8 = vmulq_f64(self.twiddle7im, x8m23); + let t_b3_9 = vmulq_f64(self.twiddle4im, x9m22); + let t_b3_10 = vmulq_f64(self.twiddle1im, x10m21); + let t_b3_11 = vmulq_f64(self.twiddle2im, x11m20); + let 
t_b3_12 = vmulq_f64(self.twiddle5im, x12m19); + let t_b3_13 = vmulq_f64(self.twiddle8im, x13m18); + let t_b3_14 = vmulq_f64(self.twiddle11im, x14m17); + let t_b3_15 = vmulq_f64(self.twiddle14im, x15m16); + let t_b4_1 = vmulq_f64(self.twiddle4im, x1m30); + let t_b4_2 = vmulq_f64(self.twiddle8im, x2m29); + let t_b4_3 = vmulq_f64(self.twiddle12im, x3m28); + let t_b4_4 = vmulq_f64(self.twiddle15im, x4m27); + let t_b4_5 = vmulq_f64(self.twiddle11im, x5m26); + let t_b4_6 = vmulq_f64(self.twiddle7im, x6m25); + let t_b4_7 = vmulq_f64(self.twiddle3im, x7m24); + let t_b4_8 = vmulq_f64(self.twiddle1im, x8m23); + let t_b4_9 = vmulq_f64(self.twiddle5im, x9m22); + let t_b4_10 = vmulq_f64(self.twiddle9im, x10m21); + let t_b4_11 = vmulq_f64(self.twiddle13im, x11m20); + let t_b4_12 = vmulq_f64(self.twiddle14im, x12m19); + let t_b4_13 = vmulq_f64(self.twiddle10im, x13m18); + let t_b4_14 = vmulq_f64(self.twiddle6im, x14m17); + let t_b4_15 = vmulq_f64(self.twiddle2im, x15m16); + let t_b5_1 = vmulq_f64(self.twiddle5im, x1m30); + let t_b5_2 = vmulq_f64(self.twiddle10im, x2m29); + let t_b5_3 = vmulq_f64(self.twiddle15im, x3m28); + let t_b5_4 = vmulq_f64(self.twiddle11im, x4m27); + let t_b5_5 = vmulq_f64(self.twiddle6im, x5m26); + let t_b5_6 = vmulq_f64(self.twiddle1im, x6m25); + let t_b5_7 = vmulq_f64(self.twiddle4im, x7m24); + let t_b5_8 = vmulq_f64(self.twiddle9im, x8m23); + let t_b5_9 = vmulq_f64(self.twiddle14im, x9m22); + let t_b5_10 = vmulq_f64(self.twiddle12im, x10m21); + let t_b5_11 = vmulq_f64(self.twiddle7im, x11m20); + let t_b5_12 = vmulq_f64(self.twiddle2im, x12m19); + let t_b5_13 = vmulq_f64(self.twiddle3im, x13m18); + let t_b5_14 = vmulq_f64(self.twiddle8im, x14m17); + let t_b5_15 = vmulq_f64(self.twiddle13im, x15m16); + let t_b6_1 = vmulq_f64(self.twiddle6im, x1m30); + let t_b6_2 = vmulq_f64(self.twiddle12im, x2m29); + let t_b6_3 = vmulq_f64(self.twiddle13im, x3m28); + let t_b6_4 = vmulq_f64(self.twiddle7im, x4m27); + let t_b6_5 = vmulq_f64(self.twiddle1im, x5m26); + let t_b6_6 = vmulq_f64(self.twiddle5im, x6m25); + let t_b6_7 = vmulq_f64(self.twiddle11im, x7m24); + let t_b6_8 = vmulq_f64(self.twiddle14im, x8m23); + let t_b6_9 = vmulq_f64(self.twiddle8im, x9m22); + let t_b6_10 = vmulq_f64(self.twiddle2im, x10m21); + let t_b6_11 = vmulq_f64(self.twiddle4im, x11m20); + let t_b6_12 = vmulq_f64(self.twiddle10im, x12m19); + let t_b6_13 = vmulq_f64(self.twiddle15im, x13m18); + let t_b6_14 = vmulq_f64(self.twiddle9im, x14m17); + let t_b6_15 = vmulq_f64(self.twiddle3im, x15m16); + let t_b7_1 = vmulq_f64(self.twiddle7im, x1m30); + let t_b7_2 = vmulq_f64(self.twiddle14im, x2m29); + let t_b7_3 = vmulq_f64(self.twiddle10im, x3m28); + let t_b7_4 = vmulq_f64(self.twiddle3im, x4m27); + let t_b7_5 = vmulq_f64(self.twiddle4im, x5m26); + let t_b7_6 = vmulq_f64(self.twiddle11im, x6m25); + let t_b7_7 = vmulq_f64(self.twiddle13im, x7m24); + let t_b7_8 = vmulq_f64(self.twiddle6im, x8m23); + let t_b7_9 = vmulq_f64(self.twiddle1im, x9m22); + let t_b7_10 = vmulq_f64(self.twiddle8im, x10m21); + let t_b7_11 = vmulq_f64(self.twiddle15im, x11m20); + let t_b7_12 = vmulq_f64(self.twiddle9im, x12m19); + let t_b7_13 = vmulq_f64(self.twiddle2im, x13m18); + let t_b7_14 = vmulq_f64(self.twiddle5im, x14m17); + let t_b7_15 = vmulq_f64(self.twiddle12im, x15m16); + let t_b8_1 = vmulq_f64(self.twiddle8im, x1m30); + let t_b8_2 = vmulq_f64(self.twiddle15im, x2m29); + let t_b8_3 = vmulq_f64(self.twiddle7im, x3m28); + let t_b8_4 = vmulq_f64(self.twiddle1im, x4m27); + let t_b8_5 = vmulq_f64(self.twiddle9im, x5m26); + let t_b8_6 = 
vmulq_f64(self.twiddle14im, x6m25); + let t_b8_7 = vmulq_f64(self.twiddle6im, x7m24); + let t_b8_8 = vmulq_f64(self.twiddle2im, x8m23); + let t_b8_9 = vmulq_f64(self.twiddle10im, x9m22); + let t_b8_10 = vmulq_f64(self.twiddle13im, x10m21); + let t_b8_11 = vmulq_f64(self.twiddle5im, x11m20); + let t_b8_12 = vmulq_f64(self.twiddle3im, x12m19); + let t_b8_13 = vmulq_f64(self.twiddle11im, x13m18); + let t_b8_14 = vmulq_f64(self.twiddle12im, x14m17); + let t_b8_15 = vmulq_f64(self.twiddle4im, x15m16); + let t_b9_1 = vmulq_f64(self.twiddle9im, x1m30); + let t_b9_2 = vmulq_f64(self.twiddle13im, x2m29); + let t_b9_3 = vmulq_f64(self.twiddle4im, x3m28); + let t_b9_4 = vmulq_f64(self.twiddle5im, x4m27); + let t_b9_5 = vmulq_f64(self.twiddle14im, x5m26); + let t_b9_6 = vmulq_f64(self.twiddle8im, x6m25); + let t_b9_7 = vmulq_f64(self.twiddle1im, x7m24); + let t_b9_8 = vmulq_f64(self.twiddle10im, x8m23); + let t_b9_9 = vmulq_f64(self.twiddle12im, x9m22); + let t_b9_10 = vmulq_f64(self.twiddle3im, x10m21); + let t_b9_11 = vmulq_f64(self.twiddle6im, x11m20); + let t_b9_12 = vmulq_f64(self.twiddle15im, x12m19); + let t_b9_13 = vmulq_f64(self.twiddle7im, x13m18); + let t_b9_14 = vmulq_f64(self.twiddle2im, x14m17); + let t_b9_15 = vmulq_f64(self.twiddle11im, x15m16); + let t_b10_1 = vmulq_f64(self.twiddle10im, x1m30); + let t_b10_2 = vmulq_f64(self.twiddle11im, x2m29); + let t_b10_3 = vmulq_f64(self.twiddle1im, x3m28); + let t_b10_4 = vmulq_f64(self.twiddle9im, x4m27); + let t_b10_5 = vmulq_f64(self.twiddle12im, x5m26); + let t_b10_6 = vmulq_f64(self.twiddle2im, x6m25); + let t_b10_7 = vmulq_f64(self.twiddle8im, x7m24); + let t_b10_8 = vmulq_f64(self.twiddle13im, x8m23); + let t_b10_9 = vmulq_f64(self.twiddle3im, x9m22); + let t_b10_10 = vmulq_f64(self.twiddle7im, x10m21); + let t_b10_11 = vmulq_f64(self.twiddle14im, x11m20); + let t_b10_12 = vmulq_f64(self.twiddle4im, x12m19); + let t_b10_13 = vmulq_f64(self.twiddle6im, x13m18); + let t_b10_14 = vmulq_f64(self.twiddle15im, x14m17); + let t_b10_15 = vmulq_f64(self.twiddle5im, x15m16); + let t_b11_1 = vmulq_f64(self.twiddle11im, x1m30); + let t_b11_2 = vmulq_f64(self.twiddle9im, x2m29); + let t_b11_3 = vmulq_f64(self.twiddle2im, x3m28); + let t_b11_4 = vmulq_f64(self.twiddle13im, x4m27); + let t_b11_5 = vmulq_f64(self.twiddle7im, x5m26); + let t_b11_6 = vmulq_f64(self.twiddle4im, x6m25); + let t_b11_7 = vmulq_f64(self.twiddle15im, x7m24); + let t_b11_8 = vmulq_f64(self.twiddle5im, x8m23); + let t_b11_9 = vmulq_f64(self.twiddle6im, x9m22); + let t_b11_10 = vmulq_f64(self.twiddle14im, x10m21); + let t_b11_11 = vmulq_f64(self.twiddle3im, x11m20); + let t_b11_12 = vmulq_f64(self.twiddle8im, x12m19); + let t_b11_13 = vmulq_f64(self.twiddle12im, x13m18); + let t_b11_14 = vmulq_f64(self.twiddle1im, x14m17); + let t_b11_15 = vmulq_f64(self.twiddle10im, x15m16); + let t_b12_1 = vmulq_f64(self.twiddle12im, x1m30); + let t_b12_2 = vmulq_f64(self.twiddle7im, x2m29); + let t_b12_3 = vmulq_f64(self.twiddle5im, x3m28); + let t_b12_4 = vmulq_f64(self.twiddle14im, x4m27); + let t_b12_5 = vmulq_f64(self.twiddle2im, x5m26); + let t_b12_6 = vmulq_f64(self.twiddle10im, x6m25); + let t_b12_7 = vmulq_f64(self.twiddle9im, x7m24); + let t_b12_8 = vmulq_f64(self.twiddle3im, x8m23); + let t_b12_9 = vmulq_f64(self.twiddle15im, x9m22); + let t_b12_10 = vmulq_f64(self.twiddle4im, x10m21); + let t_b12_11 = vmulq_f64(self.twiddle8im, x11m20); + let t_b12_12 = vmulq_f64(self.twiddle11im, x12m19); + let t_b12_13 = vmulq_f64(self.twiddle1im, x13m18); + let t_b12_14 = 
vmulq_f64(self.twiddle13im, x14m17); + let t_b12_15 = vmulq_f64(self.twiddle6im, x15m16); + let t_b13_1 = vmulq_f64(self.twiddle13im, x1m30); + let t_b13_2 = vmulq_f64(self.twiddle5im, x2m29); + let t_b13_3 = vmulq_f64(self.twiddle8im, x3m28); + let t_b13_4 = vmulq_f64(self.twiddle10im, x4m27); + let t_b13_5 = vmulq_f64(self.twiddle3im, x5m26); + let t_b13_6 = vmulq_f64(self.twiddle15im, x6m25); + let t_b13_7 = vmulq_f64(self.twiddle2im, x7m24); + let t_b13_8 = vmulq_f64(self.twiddle11im, x8m23); + let t_b13_9 = vmulq_f64(self.twiddle7im, x9m22); + let t_b13_10 = vmulq_f64(self.twiddle6im, x10m21); + let t_b13_11 = vmulq_f64(self.twiddle12im, x11m20); + let t_b13_12 = vmulq_f64(self.twiddle1im, x12m19); + let t_b13_13 = vmulq_f64(self.twiddle14im, x13m18); + let t_b13_14 = vmulq_f64(self.twiddle4im, x14m17); + let t_b13_15 = vmulq_f64(self.twiddle9im, x15m16); + let t_b14_1 = vmulq_f64(self.twiddle14im, x1m30); + let t_b14_2 = vmulq_f64(self.twiddle3im, x2m29); + let t_b14_3 = vmulq_f64(self.twiddle11im, x3m28); + let t_b14_4 = vmulq_f64(self.twiddle6im, x4m27); + let t_b14_5 = vmulq_f64(self.twiddle8im, x5m26); + let t_b14_6 = vmulq_f64(self.twiddle9im, x6m25); + let t_b14_7 = vmulq_f64(self.twiddle5im, x7m24); + let t_b14_8 = vmulq_f64(self.twiddle12im, x8m23); + let t_b14_9 = vmulq_f64(self.twiddle2im, x9m22); + let t_b14_10 = vmulq_f64(self.twiddle15im, x10m21); + let t_b14_11 = vmulq_f64(self.twiddle1im, x11m20); + let t_b14_12 = vmulq_f64(self.twiddle13im, x12m19); + let t_b14_13 = vmulq_f64(self.twiddle4im, x13m18); + let t_b14_14 = vmulq_f64(self.twiddle10im, x14m17); + let t_b14_15 = vmulq_f64(self.twiddle7im, x15m16); + let t_b15_1 = vmulq_f64(self.twiddle15im, x1m30); + let t_b15_2 = vmulq_f64(self.twiddle1im, x2m29); + let t_b15_3 = vmulq_f64(self.twiddle14im, x3m28); + let t_b15_4 = vmulq_f64(self.twiddle2im, x4m27); + let t_b15_5 = vmulq_f64(self.twiddle13im, x5m26); + let t_b15_6 = vmulq_f64(self.twiddle3im, x6m25); + let t_b15_7 = vmulq_f64(self.twiddle12im, x7m24); + let t_b15_8 = vmulq_f64(self.twiddle4im, x8m23); + let t_b15_9 = vmulq_f64(self.twiddle11im, x9m22); + let t_b15_10 = vmulq_f64(self.twiddle5im, x10m21); + let t_b15_11 = vmulq_f64(self.twiddle10im, x11m20); + let t_b15_12 = vmulq_f64(self.twiddle6im, x12m19); + let t_b15_13 = vmulq_f64(self.twiddle9im, x13m18); + let t_b15_14 = vmulq_f64(self.twiddle7im, x14m17); + let t_b15_15 = vmulq_f64(self.twiddle8im, x15m16); + + let x0 = values[0]; + let t_a1 = calc_f64!(x0 + t_a1_1 + t_a1_2 + t_a1_3 + t_a1_4 + t_a1_5 + t_a1_6 + t_a1_7 + t_a1_8 + t_a1_9 + t_a1_10 + t_a1_11 + t_a1_12 + t_a1_13 + t_a1_14 + t_a1_15); + let t_a2 = calc_f64!(x0 + t_a2_1 + t_a2_2 + t_a2_3 + t_a2_4 + t_a2_5 + t_a2_6 + t_a2_7 + t_a2_8 + t_a2_9 + t_a2_10 + t_a2_11 + t_a2_12 + t_a2_13 + t_a2_14 + t_a2_15); + let t_a3 = calc_f64!(x0 + t_a3_1 + t_a3_2 + t_a3_3 + t_a3_4 + t_a3_5 + t_a3_6 + t_a3_7 + t_a3_8 + t_a3_9 + t_a3_10 + t_a3_11 + t_a3_12 + t_a3_13 + t_a3_14 + t_a3_15); + let t_a4 = calc_f64!(x0 + t_a4_1 + t_a4_2 + t_a4_3 + t_a4_4 + t_a4_5 + t_a4_6 + t_a4_7 + t_a4_8 + t_a4_9 + t_a4_10 + t_a4_11 + t_a4_12 + t_a4_13 + t_a4_14 + t_a4_15); + let t_a5 = calc_f64!(x0 + t_a5_1 + t_a5_2 + t_a5_3 + t_a5_4 + t_a5_5 + t_a5_6 + t_a5_7 + t_a5_8 + t_a5_9 + t_a5_10 + t_a5_11 + t_a5_12 + t_a5_13 + t_a5_14 + t_a5_15); + let t_a6 = calc_f64!(x0 + t_a6_1 + t_a6_2 + t_a6_3 + t_a6_4 + t_a6_5 + t_a6_6 + t_a6_7 + t_a6_8 + t_a6_9 + t_a6_10 + t_a6_11 + t_a6_12 + t_a6_13 + t_a6_14 + t_a6_15); + let t_a7 = calc_f64!(x0 + t_a7_1 + t_a7_2 + t_a7_3 + t_a7_4 + t_a7_5 + 
t_a7_6 + t_a7_7 + t_a7_8 + t_a7_9 + t_a7_10 + t_a7_11 + t_a7_12 + t_a7_13 + t_a7_14 + t_a7_15); + let t_a8 = calc_f64!(x0 + t_a8_1 + t_a8_2 + t_a8_3 + t_a8_4 + t_a8_5 + t_a8_6 + t_a8_7 + t_a8_8 + t_a8_9 + t_a8_10 + t_a8_11 + t_a8_12 + t_a8_13 + t_a8_14 + t_a8_15); + let t_a9 = calc_f64!(x0 + t_a9_1 + t_a9_2 + t_a9_3 + t_a9_4 + t_a9_5 + t_a9_6 + t_a9_7 + t_a9_8 + t_a9_9 + t_a9_10 + t_a9_11 + t_a9_12 + t_a9_13 + t_a9_14 + t_a9_15); + let t_a10 = calc_f64!(x0 + t_a10_1 + t_a10_2 + t_a10_3 + t_a10_4 + t_a10_5 + t_a10_6 + t_a10_7 + t_a10_8 + t_a10_9 + t_a10_10 + t_a10_11 + t_a10_12 + t_a10_13 + t_a10_14 + t_a10_15); + let t_a11 = calc_f64!(x0 + t_a11_1 + t_a11_2 + t_a11_3 + t_a11_4 + t_a11_5 + t_a11_6 + t_a11_7 + t_a11_8 + t_a11_9 + t_a11_10 + t_a11_11 + t_a11_12 + t_a11_13 + t_a11_14 + t_a11_15); + let t_a12 = calc_f64!(x0 + t_a12_1 + t_a12_2 + t_a12_3 + t_a12_4 + t_a12_5 + t_a12_6 + t_a12_7 + t_a12_8 + t_a12_9 + t_a12_10 + t_a12_11 + t_a12_12 + t_a12_13 + t_a12_14 + t_a12_15); + let t_a13 = calc_f64!(x0 + t_a13_1 + t_a13_2 + t_a13_3 + t_a13_4 + t_a13_5 + t_a13_6 + t_a13_7 + t_a13_8 + t_a13_9 + t_a13_10 + t_a13_11 + t_a13_12 + t_a13_13 + t_a13_14 + t_a13_15); + let t_a14 = calc_f64!(x0 + t_a14_1 + t_a14_2 + t_a14_3 + t_a14_4 + t_a14_5 + t_a14_6 + t_a14_7 + t_a14_8 + t_a14_9 + t_a14_10 + t_a14_11 + t_a14_12 + t_a14_13 + t_a14_14 + t_a14_15); + let t_a15 = calc_f64!(x0 + t_a15_1 + t_a15_2 + t_a15_3 + t_a15_4 + t_a15_5 + t_a15_6 + t_a15_7 + t_a15_8 + t_a15_9 + t_a15_10 + t_a15_11 + t_a15_12 + t_a15_13 + t_a15_14 + t_a15_15); + + let t_b1 = calc_f64!(t_b1_1 + t_b1_2 + t_b1_3 + t_b1_4 + t_b1_5 + t_b1_6 + t_b1_7 + t_b1_8 + t_b1_9 + t_b1_10 + t_b1_11 + t_b1_12 + t_b1_13 + t_b1_14 + t_b1_15); + let t_b2 = calc_f64!(t_b2_1 + t_b2_2 + t_b2_3 + t_b2_4 + t_b2_5 + t_b2_6 + t_b2_7 - t_b2_8 - t_b2_9 - t_b2_10 - t_b2_11 - t_b2_12 - t_b2_13 - t_b2_14 - t_b2_15); + let t_b3 = calc_f64!(t_b3_1 + t_b3_2 + t_b3_3 + t_b3_4 + t_b3_5 - t_b3_6 - t_b3_7 - t_b3_8 - t_b3_9 - t_b3_10 + t_b3_11 + t_b3_12 + t_b3_13 + t_b3_14 + t_b3_15); + let t_b4 = calc_f64!(t_b4_1 + t_b4_2 + t_b4_3 - t_b4_4 - t_b4_5 - t_b4_6 - t_b4_7 + t_b4_8 + t_b4_9 + t_b4_10 + t_b4_11 - t_b4_12 - t_b4_13 - t_b4_14 - t_b4_15); + let t_b5 = calc_f64!(t_b5_1 + t_b5_2 + t_b5_3 - t_b5_4 - t_b5_5 - t_b5_6 + t_b5_7 + t_b5_8 + t_b5_9 - t_b5_10 - t_b5_11 - t_b5_12 + t_b5_13 + t_b5_14 + t_b5_15); + let t_b6 = calc_f64!(t_b6_1 + t_b6_2 - t_b6_3 - t_b6_4 - t_b6_5 + t_b6_6 + t_b6_7 - t_b6_8 - t_b6_9 - t_b6_10 + t_b6_11 + t_b6_12 - t_b6_13 - t_b6_14 - t_b6_15); + let t_b7 = calc_f64!(t_b7_1 + t_b7_2 - t_b7_3 - t_b7_4 + t_b7_5 + t_b7_6 - t_b7_7 - t_b7_8 + t_b7_9 + t_b7_10 + t_b7_11 - t_b7_12 - t_b7_13 + t_b7_14 + t_b7_15); + let t_b8 = calc_f64!(t_b8_1 - t_b8_2 - t_b8_3 + t_b8_4 + t_b8_5 - t_b8_6 - t_b8_7 + t_b8_8 + t_b8_9 - t_b8_10 - t_b8_11 + t_b8_12 + t_b8_13 - t_b8_14 - t_b8_15); + let t_b9 = calc_f64!(t_b9_1 - t_b9_2 - t_b9_3 + t_b9_4 + t_b9_5 - t_b9_6 + t_b9_7 + t_b9_8 - t_b9_9 - t_b9_10 + t_b9_11 + t_b9_12 - t_b9_13 + t_b9_14 + t_b9_15); + let t_b10 = calc_f64!(t_b10_1 - t_b10_2 - t_b10_3 + t_b10_4 - t_b10_5 - t_b10_6 + t_b10_7 - t_b10_8 - t_b10_9 + t_b10_10 - t_b10_11 - t_b10_12 + t_b10_13 - t_b10_14 - t_b10_15); + let t_b11 = calc_f64!(t_b11_1 - t_b11_2 + t_b11_3 + t_b11_4 - t_b11_5 + t_b11_6 + t_b11_7 - t_b11_8 + t_b11_9 - t_b11_10 - t_b11_11 + t_b11_12 - t_b11_13 - t_b11_14 + t_b11_15); + let t_b12 = calc_f64!(t_b12_1 - t_b12_2 + t_b12_3 - t_b12_4 - t_b12_5 + t_b12_6 - t_b12_7 + t_b12_8 + t_b12_9 - t_b12_10 + t_b12_11 - t_b12_12 + t_b12_13 + t_b12_14 - 
t_b12_15); + let t_b13 = calc_f64!(t_b13_1 - t_b13_2 + t_b13_3 - t_b13_4 + t_b13_5 - t_b13_6 - t_b13_7 + t_b13_8 - t_b13_9 + t_b13_10 - t_b13_11 + t_b13_12 + t_b13_13 - t_b13_14 + t_b13_15); + let t_b14 = calc_f64!(t_b14_1 - t_b14_2 + t_b14_3 - t_b14_4 + t_b14_5 - t_b14_6 + t_b14_7 - t_b14_8 + t_b14_9 - t_b14_10 - t_b14_11 + t_b14_12 - t_b14_13 + t_b14_14 - t_b14_15); + let t_b15 = calc_f64!(t_b15_1 - t_b15_2 + t_b15_3 - t_b15_4 + t_b15_5 - t_b15_6 + t_b15_7 - t_b15_8 + t_b15_9 - t_b15_10 + t_b15_11 - t_b15_12 + t_b15_13 - t_b15_14 + t_b15_15); + + let t_b1_rot = self.rotate.rotate(t_b1); + let t_b2_rot = self.rotate.rotate(t_b2); + let t_b3_rot = self.rotate.rotate(t_b3); + let t_b4_rot = self.rotate.rotate(t_b4); + let t_b5_rot = self.rotate.rotate(t_b5); + let t_b6_rot = self.rotate.rotate(t_b6); + let t_b7_rot = self.rotate.rotate(t_b7); + let t_b8_rot = self.rotate.rotate(t_b8); + let t_b9_rot = self.rotate.rotate(t_b9); + let t_b10_rot = self.rotate.rotate(t_b10); + let t_b11_rot = self.rotate.rotate(t_b11); + let t_b12_rot = self.rotate.rotate(t_b12); + let t_b13_rot = self.rotate.rotate(t_b13); + let t_b14_rot = self.rotate.rotate(t_b14); + let t_b15_rot = self.rotate.rotate(t_b15); + + let y0 = calc_f64!(x0 + x1p30 + x2p29 + x3p28 + x4p27 + x5p26 + x6p25 + x7p24 + x8p23 + x9p22 + x10p21 + x11p20 + x12p19 + x13p18 + x14p17 + x15p16); + let [y1, y30] = solo_fft2_f64(t_a1, t_b1_rot); + let [y2, y29] = solo_fft2_f64(t_a2, t_b2_rot); + let [y3, y28] = solo_fft2_f64(t_a3, t_b3_rot); + let [y4, y27] = solo_fft2_f64(t_a4, t_b4_rot); + let [y5, y26] = solo_fft2_f64(t_a5, t_b5_rot); + let [y6, y25] = solo_fft2_f64(t_a6, t_b6_rot); + let [y7, y24] = solo_fft2_f64(t_a7, t_b7_rot); + let [y8, y23] = solo_fft2_f64(t_a8, t_b8_rot); + let [y9, y22] = solo_fft2_f64(t_a9, t_b9_rot); + let [y10, y21] = solo_fft2_f64(t_a10, t_b10_rot); + let [y11, y20] = solo_fft2_f64(t_a11, t_b11_rot); + let [y12, y19] = solo_fft2_f64(t_a12, t_b12_rot); + let [y13, y18] = solo_fft2_f64(t_a13, t_b13_rot); + let [y14, y17] = solo_fft2_f64(t_a14, t_b14_rot); + let [y15, y16] = solo_fft2_f64(t_a15, t_b15_rot); + [y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15, y16, y17, y18, y19, y20, y21, y22, y23, y24, y25, y26, y27, y28, y29, y30] + } +} + +// _____ _____ ____ _____ ____ +// |_ _| ____/ ___|_ _/ ___| +// | | | _| \___ \ | | \___ \ +// | | | |___ ___) || | ___) | +// |_| |_____|____/ |_| |____/ +// + +#[cfg(test)] +mod unit_tests { + use super::*; + use crate::test_utils::check_fft_algorithm; + + //the tests for all butterflies will be identical except for the identifiers used and size + //so it's ideal for a macro + macro_rules! 
test_butterfly_32_func { + ($test_name:ident, $struct_name:ident, $size:expr) => { + #[test] + fn $test_name() { + let butterfly = $struct_name::new(FftDirection::Forward); + check_fft_algorithm::(&butterfly, $size, FftDirection::Forward); + + let butterfly_direction = $struct_name::new(FftDirection::Inverse); + check_fft_algorithm::(&butterfly_direction, $size, FftDirection::Inverse); + } + }; + } + test_butterfly_32_func!(test_neonf32_butterfly7, NeonF32Butterfly7, 7); + test_butterfly_32_func!(test_neonf32_butterfly11, NeonF32Butterfly11, 11); + test_butterfly_32_func!(test_neonf32_butterfly13, NeonF32Butterfly13, 13); + test_butterfly_32_func!(test_neonf32_butterfly17, NeonF32Butterfly17, 17); + test_butterfly_32_func!(test_neonf32_butterfly19, NeonF32Butterfly19, 19); + test_butterfly_32_func!(test_neonf32_butterfly23, NeonF32Butterfly23, 23); + test_butterfly_32_func!(test_neonf32_butterfly29, NeonF32Butterfly29, 29); + test_butterfly_32_func!(test_neonf32_butterfly31, NeonF32Butterfly31, 31); + + //the tests for all butterflies will be identical except for the identifiers used and size + //so it's ideal for a macro + macro_rules! test_butterfly_64_func { + ($test_name:ident, $struct_name:ident, $size:expr) => { + #[test] + fn $test_name() { + let butterfly = $struct_name::new(FftDirection::Forward); + check_fft_algorithm::(&butterfly, $size, FftDirection::Forward); + + let butterfly_direction = $struct_name::new(FftDirection::Inverse); + check_fft_algorithm::(&butterfly_direction, $size, FftDirection::Inverse); + } + }; + } + test_butterfly_64_func!(test_neonf64_butterfly7, NeonF64Butterfly7, 7); + test_butterfly_64_func!(test_neonf64_butterfly11, NeonF64Butterfly11, 11); + test_butterfly_64_func!(test_neonf64_butterfly13, NeonF64Butterfly13, 13); + test_butterfly_64_func!(test_neonf64_butterfly17, NeonF64Butterfly17, 17); + test_butterfly_64_func!(test_neonf64_butterfly19, NeonF64Butterfly19, 19); + test_butterfly_64_func!(test_neonf64_butterfly23, NeonF64Butterfly23, 23); + test_butterfly_64_func!(test_neonf64_butterfly29, NeonF64Butterfly29, 29); + test_butterfly_64_func!(test_neonf64_butterfly31, NeonF64Butterfly31, 31); +} diff --git a/src/neon/neon_radix4.rs b/src/neon/neon_radix4.rs new file mode 100644 index 00000000..d6970764 --- /dev/null +++ b/src/neon/neon_radix4.rs @@ -0,0 +1,482 @@ +use num_complex::Complex; + +use core::arch::aarch64::*; + +use crate::algorithm::{bitreversed_transpose, reverse_bits}; +use crate::array_utils; +use crate::common::{fft_error_inplace, fft_error_outofplace}; +use crate::neon::neon_butterflies::{ + NeonF32Butterfly1, NeonF32Butterfly16, NeonF32Butterfly2, NeonF32Butterfly32, + NeonF32Butterfly4, NeonF32Butterfly8, +}; +use crate::neon::neon_butterflies::{ + NeonF64Butterfly1, NeonF64Butterfly16, NeonF64Butterfly2, NeonF64Butterfly32, + NeonF64Butterfly4, NeonF64Butterfly8, +}; +use crate::{common::FftNum, twiddles, FftDirection}; +use crate::{Direction, Fft, Length}; + +use super::neon_common::{assert_f32, assert_f64}; +use super::neon_utils::*; + +use super::neon_vector::{NeonArray, NeonArrayMut}; +use crate::array_utils::{RawSlice, RawSliceMut}; + +/// FFT algorithm optimized for power-of-two sizes, Neon accelerated version. +/// This is designed to be used via a Planner, and not created directly. 
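The doc comment above notes that this radix-4 is meant to be reached through a planner rather than constructed directly. For context, here is a minimal usage sketch (an editorial illustration, not part of the patch): it assumes an AArch64 target, a nightly compiler, and the `neon-nightly` feature; the length 4096 and the zeroed buffer are arbitrary, and `process` is the standard rustfft entry point.

```rust
use std::sync::Arc;

use rustfft::num_complex::Complex;
use rustfft::{Fft, FftPlannerNeon};

fn main() {
    // FftPlannerNeon::new() returns Err on builds without Neon support,
    // which is how the generic planner decides whether to use it.
    let mut planner = FftPlannerNeon::<f32>::new().unwrap();

    // For a power-of-two length the planner is expected to hand back a
    // Neon-accelerated power-of-two algorithm such as the one in this file.
    let fft: Arc<dyn Fft<f32>> = planner.plan_fft_forward(4096);

    let mut buffer = vec![Complex::<f32>::new(0.0, 0.0); 4096];
    fft.process(&mut buffer);
}
```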
+ +const USE_BUTTERFLY32_FROM: usize = 262144; // Use length 32 butterfly starting from this length + +enum Neon32Butterfly { + Len1(NeonF32Butterfly1), + Len2(NeonF32Butterfly2), + Len4(NeonF32Butterfly4), + Len8(NeonF32Butterfly8), + Len16(NeonF32Butterfly16), + Len32(NeonF32Butterfly32), +} + +enum Neon64Butterfly { + Len1(NeonF64Butterfly1), + Len2(NeonF64Butterfly2), + Len4(NeonF64Butterfly4), + Len8(NeonF64Butterfly8), + Len16(NeonF64Butterfly16), + Len32(NeonF64Butterfly32), +} + +pub struct Neon32Radix4 { + _phantom: std::marker::PhantomData, + twiddles: Box<[float32x4_t]>, + + base_fft: Neon32Butterfly, + base_len: usize, + + len: usize, + direction: FftDirection, + bf4: NeonF32Butterfly4, + + shuffle_map: Box<[usize]>, +} + +impl Neon32Radix4 { + /// Preallocates necessary arrays and precomputes necessary data to efficiently compute the power-of-two FFT + pub fn new(len: usize, direction: FftDirection) -> Self { + assert!( + len.is_power_of_two(), + "Radix4 algorithm requires a power-of-two input size. Got {}", + len + ); + assert_f32::(); + + // figure out which base length we're going to use + let num_bits = len.trailing_zeros(); + let (base_len, base_fft) = match num_bits { + 0 => ( + len, + Neon32Butterfly::Len1(NeonF32Butterfly1::new(direction)), + ), + 1 => ( + len, + Neon32Butterfly::Len2(NeonF32Butterfly2::new(direction)), + ), + 2 => ( + len, + Neon32Butterfly::Len4(NeonF32Butterfly4::new(direction)), + ), + 3 => ( + len, + Neon32Butterfly::Len8(NeonF32Butterfly8::new(direction)), + ), + _ => { + if num_bits % 2 == 1 { + if len < USE_BUTTERFLY32_FROM { + (8, Neon32Butterfly::Len8(NeonF32Butterfly8::new(direction))) + } else { + ( + 32, + Neon32Butterfly::Len32(NeonF32Butterfly32::new(direction)), + ) + } + } else { + ( + 16, + Neon32Butterfly::Len16(NeonF32Butterfly16::new(direction)), + ) + } + } + }; + + // precompute the twiddle factors this algorithm will use. 
+ // we're doing the same precomputation of twiddle factors as the mixed radix algorithm where width=4 and height=len/4 + // but mixed radix only does one step and then calls itself recusrively, and this algorithm does every layer all the way down + // so we're going to pack all the "layers" of twiddle factors into a single array, starting with the bottom layer and going up + let mut twiddle_stride = len / (base_len * 4); + let mut twiddle_factors = Vec::with_capacity(len * 2); + while twiddle_stride > 0 { + let num_rows = len / (twiddle_stride * 4); + for i in 0..num_rows / 2 { + for k in 1..4 { + let twiddle_a = twiddles::compute_twiddle::( + 2 * i * k * twiddle_stride, + len, + direction, + ); + let twiddle_b = twiddles::compute_twiddle::( + (2 * i + 1) * k * twiddle_stride, + len, + direction, + ); + let twiddles_packed = + unsafe { RawSlice::new(&[twiddle_a, twiddle_b]).load_complex(0) }; + twiddle_factors.push(twiddles_packed); + } + } + twiddle_stride >>= 2; + } + + // make a lookup table for the bit reverse shuffling + let rest_len = len / base_len; + let bitpairs = (rest_len.trailing_zeros() / 2) as usize; + let shuffle_map = (0..rest_len) + .map(|val| reverse_bits(val, bitpairs)) + .collect::>(); + + Self { + twiddles: twiddle_factors.into_boxed_slice(), + + base_fft, + base_len, + + len, + direction, + _phantom: std::marker::PhantomData, + bf4: NeonF32Butterfly4::::new(direction), + + shuffle_map: shuffle_map.into_boxed_slice(), + } + } + + //#[target_feature(enable = "neon")] + unsafe fn perform_fft_out_of_place( + &self, + signal: &[Complex], + spectrum: &mut [Complex], + _scratch: &mut [Complex], + ) { + // copy the data into the spectrum vector + if self.shuffle_map.len() < 4 { + spectrum.copy_from_slice(signal); + } else { + bitreversed_transpose(self.base_len, signal, spectrum, &self.shuffle_map); + } + + // Base-level FFTs + match &self.base_fft { + Neon32Butterfly::Len1(bf) => bf.perform_fft_butterfly_multi(spectrum).unwrap(), + Neon32Butterfly::Len2(bf) => bf.perform_fft_butterfly_multi(spectrum).unwrap(), + Neon32Butterfly::Len4(bf) => bf.perform_fft_butterfly_multi(spectrum).unwrap(), + Neon32Butterfly::Len8(bf) => bf.perform_fft_butterfly_multi(spectrum).unwrap(), + Neon32Butterfly::Len16(bf) => bf.perform_fft_butterfly_multi(spectrum).unwrap(), + Neon32Butterfly::Len32(bf) => bf.perform_fft_butterfly_multi(spectrum).unwrap(), + }; + + // cross-FFTs + let mut current_size = self.base_len * 4; + let mut layer_twiddles: &[float32x4_t] = &self.twiddles; + + while current_size <= signal.len() { + let num_rows = signal.len() / current_size; + + for i in 0..num_rows { + butterfly_4_32( + &mut spectrum[i * current_size..], + layer_twiddles, + current_size / 4, + &self.bf4, + ) + } + + //skip past all the twiddle factors used in this layer + let twiddle_offset = (current_size * 3) / 8; + layer_twiddles = &layer_twiddles[twiddle_offset..]; + + current_size *= 4; + } + } +} +boilerplate_fft_neon_oop!(Neon32Radix4, |this: &Neon32Radix4<_>| this.len); + +//#[target_feature(enable = "neon")] +unsafe fn butterfly_4_32( + data: &mut [Complex], + twiddles: &[float32x4_t], + num_ffts: usize, + bf4: &NeonF32Butterfly4, +) { + let mut idx = 0usize; + let input: RawSlice> = RawSlice::new_transmuted(data); + let output: RawSliceMut> = RawSliceMut::new_transmuted(data); + for tw in twiddles.chunks_exact(6).take(num_ffts / 4) { + let scratch0 = input.load_complex(idx); + let scratch0b = input.load_complex(idx + 2); + let mut scratch1 = input.load_complex(idx + 1 * num_ffts); + let mut 
scratch1b = input.load_complex(idx + 2 + 1 * num_ffts); + let mut scratch2 = input.load_complex(idx + 2 * num_ffts); + let mut scratch2b = input.load_complex(idx + 2 + 2 * num_ffts); + let mut scratch3 = input.load_complex(idx + 3 * num_ffts); + let mut scratch3b = input.load_complex(idx + 2 + 3 * num_ffts); + + scratch1 = mul_complex_f32(scratch1, tw[0]); + scratch2 = mul_complex_f32(scratch2, tw[1]); + scratch3 = mul_complex_f32(scratch3, tw[2]); + scratch1b = mul_complex_f32(scratch1b, tw[3]); + scratch2b = mul_complex_f32(scratch2b, tw[4]); + scratch3b = mul_complex_f32(scratch3b, tw[5]); + + let scratch = bf4.perform_parallel_fft_direct(scratch0, scratch1, scratch2, scratch3); + let scratchb = bf4.perform_parallel_fft_direct(scratch0b, scratch1b, scratch2b, scratch3b); + + output.store_complex(scratch[0], idx); + output.store_complex(scratchb[0], idx + 2); + output.store_complex(scratch[1], idx + 1 * num_ffts); + output.store_complex(scratchb[1], idx + 2 + 1 * num_ffts); + output.store_complex(scratch[2], idx + 2 * num_ffts); + output.store_complex(scratchb[2], idx + 2 + 2 * num_ffts); + output.store_complex(scratch[3], idx + 3 * num_ffts); + output.store_complex(scratchb[3], idx + 2 + 3 * num_ffts); + + idx += 4; + } +} + +pub struct Neon64Radix4 { + _phantom: std::marker::PhantomData, + twiddles: Box<[float64x2_t]>, + + base_fft: Neon64Butterfly, + base_len: usize, + + len: usize, + direction: FftDirection, + bf4: NeonF64Butterfly4, + + shuffle_map: Box<[usize]>, +} + +impl Neon64Radix4 { + /// Preallocates necessary arrays and precomputes necessary data to efficiently compute the power-of-two FFT + pub fn new(len: usize, direction: FftDirection) -> Self { + assert!( + len.is_power_of_two(), + "Radix4 algorithm requires a power-of-two input size. Got {}", + len + ); + + assert_f64::(); + + // figure out which base length we're going to use + let num_bits = len.trailing_zeros(); + let (base_len, base_fft) = match num_bits { + 0 => ( + len, + Neon64Butterfly::Len1(NeonF64Butterfly1::new(direction)), + ), + 1 => ( + len, + Neon64Butterfly::Len2(NeonF64Butterfly2::new(direction)), + ), + 2 => ( + len, + Neon64Butterfly::Len4(NeonF64Butterfly4::new(direction)), + ), + 3 => ( + len, + Neon64Butterfly::Len8(NeonF64Butterfly8::new(direction)), + ), + _ => { + if num_bits % 2 == 1 { + if len < USE_BUTTERFLY32_FROM { + (8, Neon64Butterfly::Len8(NeonF64Butterfly8::new(direction))) + } else { + ( + 32, + Neon64Butterfly::Len32(NeonF64Butterfly32::new(direction)), + ) + } + } else { + ( + 16, + Neon64Butterfly::Len16(NeonF64Butterfly16::new(direction)), + ) + } + } + }; + + // precompute the twiddle factors this algorithm will use. 
+ // we're doing the same precomputation of twiddle factors as the mixed radix algorithm where width=4 and height=len/4 + // but mixed radix only does one step and then calls itself recusrively, and this algorithm does every layer all the way down + // so we're going to pack all the "layers" of twiddle factors into a single array, starting with the bottom layer and going up + let mut twiddle_stride = len / (base_len * 4); + let mut twiddle_factors = Vec::with_capacity(len * 2); + while twiddle_stride > 0 { + let num_rows = len / (twiddle_stride * 4); + for i in 0..num_rows { + for k in 1..4 { + let twiddle = + twiddles::compute_twiddle::(i * k * twiddle_stride, len, direction); + let twiddle_packed = unsafe { RawSlice::new(&[twiddle]).load_complex(0) }; + twiddle_factors.push(twiddle_packed); + } + } + twiddle_stride >>= 2; + } + + // make a lookup table for the bit reverse shuffling + let rest_len = len / base_len; + let bitpairs = (rest_len.trailing_zeros() / 2) as usize; + let shuffle_map = (0..rest_len) + .map(|val| reverse_bits(val, bitpairs)) + .collect::>(); + + Self { + twiddles: twiddle_factors.into_boxed_slice(), + + base_fft, + base_len, + + len, + direction, + _phantom: std::marker::PhantomData, + bf4: NeonF64Butterfly4::::new(direction), + + shuffle_map: shuffle_map.into_boxed_slice(), + } + } + + //#[target_feature(enable = "neon")] + unsafe fn perform_fft_out_of_place( + &self, + signal: &[Complex], + spectrum: &mut [Complex], + _scratch: &mut [Complex], + ) { + // copy the data into the spectrum vector + if self.shuffle_map.len() < 4 { + spectrum.copy_from_slice(signal); + } else { + bitreversed_transpose(self.base_len, signal, spectrum, &self.shuffle_map); + } + + // Base-level FFTs + match &self.base_fft { + Neon64Butterfly::Len1(bf) => bf.perform_fft_butterfly_multi(spectrum).unwrap(), + Neon64Butterfly::Len2(bf) => bf.perform_fft_butterfly_multi(spectrum).unwrap(), + Neon64Butterfly::Len4(bf) => bf.perform_fft_butterfly_multi(spectrum).unwrap(), + Neon64Butterfly::Len8(bf) => bf.perform_fft_butterfly_multi(spectrum).unwrap(), + Neon64Butterfly::Len16(bf) => bf.perform_fft_butterfly_multi(spectrum).unwrap(), + Neon64Butterfly::Len32(bf) => bf.perform_fft_butterfly_multi(spectrum).unwrap(), + } + + // cross-FFTs + let mut current_size = self.base_len * 4; + let mut layer_twiddles: &[float64x2_t] = &self.twiddles; + + while current_size <= signal.len() { + let num_rows = signal.len() / current_size; + + for i in 0..num_rows { + butterfly_4_64( + &mut spectrum[i * current_size..], + layer_twiddles, + current_size / 4, + &self.bf4, + ) + } + + //skip past all the twiddle factors used in this layer + let twiddle_offset = (current_size * 3) / 4; + layer_twiddles = &layer_twiddles[twiddle_offset..]; + + current_size *= 4; + } + } +} +boilerplate_fft_neon_oop!(Neon64Radix4, |this: &Neon64Radix4<_>| this.len); + +//#[target_feature(enable = "neon")] +unsafe fn butterfly_4_64( + data: &mut [Complex], + twiddles: &[float64x2_t], + num_ffts: usize, + bf4: &NeonF64Butterfly4, +) { + let mut idx = 0usize; + let input: RawSlice> = RawSlice::new_transmuted(data); + let output: RawSliceMut> = RawSliceMut::new_transmuted(data); + for tw in twiddles.chunks_exact(6).take(num_ffts / 2) { + let scratch0 = input.load_complex(idx); + let scratch0b = input.load_complex(idx + 1); + let mut scratch1 = input.load_complex(idx + 1 * num_ffts); + let mut scratch1b = input.load_complex(idx + 1 + 1 * num_ffts); + let mut scratch2 = input.load_complex(idx + 2 * num_ffts); + let mut scratch2b = 
input.load_complex(idx + 1 + 2 * num_ffts); + let mut scratch3 = input.load_complex(idx + 3 * num_ffts); + let mut scratch3b = input.load_complex(idx + 1 + 3 * num_ffts); + + scratch1 = mul_complex_f64(scratch1, tw[0]); + scratch2 = mul_complex_f64(scratch2, tw[1]); + scratch3 = mul_complex_f64(scratch3, tw[2]); + scratch1b = mul_complex_f64(scratch1b, tw[3]); + scratch2b = mul_complex_f64(scratch2b, tw[4]); + scratch3b = mul_complex_f64(scratch3b, tw[5]); + + let scratch = bf4.perform_fft_direct(scratch0, scratch1, scratch2, scratch3); + let scratchb = bf4.perform_fft_direct(scratch0b, scratch1b, scratch2b, scratch3b); + + output.store_complex(scratch[0], idx); + output.store_complex(scratchb[0], idx + 1); + output.store_complex(scratch[1], idx + 1 * num_ffts); + output.store_complex(scratchb[1], idx + 1 + 1 * num_ffts); + output.store_complex(scratch[2], idx + 2 * num_ffts); + output.store_complex(scratchb[2], idx + 1 + 2 * num_ffts); + output.store_complex(scratch[3], idx + 3 * num_ffts); + output.store_complex(scratchb[3], idx + 1 + 3 * num_ffts); + + idx += 2; + } +} + +#[cfg(test)] +mod unit_tests { + use super::*; + use crate::test_utils::check_fft_algorithm; + + #[test] + fn test_neon_radix4_64() { + for pow in 4..12 { + let len = 1 << pow; + test_neon_radix4_64_with_length(len, FftDirection::Forward); + test_neon_radix4_64_with_length(len, FftDirection::Inverse); + } + } + + fn test_neon_radix4_64_with_length(len: usize, direction: FftDirection) { + let fft = Neon64Radix4::new(len, direction); + check_fft_algorithm::(&fft, len, direction); + } + + #[test] + fn test_neon_radix4_32() { + for pow in 0..12 { + let len = 1 << pow; + test_neon_radix4_32_with_length(len, FftDirection::Forward); + test_neon_radix4_32_with_length(len, FftDirection::Inverse); + } + } + + fn test_neon_radix4_32_with_length(len: usize, direction: FftDirection) { + let fft = Neon32Radix4::new(len, direction); + check_fft_algorithm::(&fft, len, direction); + } +} diff --git a/src/neon/neon_utils.rs b/src/neon/neon_utils.rs new file mode 100644 index 00000000..19b7b899 --- /dev/null +++ b/src/neon/neon_utils.rs @@ -0,0 +1,278 @@ +use core::arch::aarch64::*; + +// __ __ _ _ _________ _ _ _ +// | \/ | __ _| |_| |__ |___ /___ \| |__ (_) |_ +// | |\/| |/ _` | __| '_ \ _____ |_ \ __) | '_ \| | __| +// | | | | (_| | |_| | | | |_____| ___) / __/| |_) | | |_ +// |_| |_|\__,_|\__|_| |_| |____/_____|_.__/|_|\__| +// + +pub struct Rotate90F32 { + //sign_lo: float32x4_t, + sign_hi: float32x2_t, + sign_both: float32x4_t, +} + +impl Rotate90F32 { + pub fn new(positive: bool) -> Self { + // There doesn't seem to be any need for rotating just the first element, but let's keep the code just in case + //let sign_lo = unsafe { + // if positive { + // _mm_set_ps(0.0, 0.0, 0.0, -0.0) + // } + // else { + // _mm_set_ps(0.0, 0.0, -0.0, 0.0) + // } + //}; + let sign_hi = unsafe { + if positive { + vld1_f32([-0.0, 0.0].as_ptr()) + } else { + vld1_f32([0.0, -0.0].as_ptr()) + } + }; + let sign_both = unsafe { + if positive { + vld1q_f32([-0.0, 0.0, -0.0, 0.0].as_ptr()) + } else { + vld1q_f32([0.0, -0.0, 0.0, -0.0].as_ptr()) + } + }; + Self { + //sign_lo, + sign_hi, + sign_both, + } + } + + #[inline(always)] + pub unsafe fn rotate_hi(&self, values: float32x4_t) -> float32x4_t { + vcombine_f32( + vget_low_f32(values), + vreinterpret_f32_u32(veor_u32( + vrev64_u32(vreinterpret_u32_f32(vget_high_f32(values))), + vreinterpret_u32_f32(self.sign_hi), + )), + ) + } + + // There doesn't seem to be any need for rotating just the first element, but 
let's keep the code just in case + //#[inline(always)] + //pub unsafe fn rotate_lo(&self, values: __m128) -> __m128 { + // let temp = _mm_shuffle_ps(values, values, 0xE1); + // _mm_xor_ps(temp, self.sign_lo) + //} + + #[inline(always)] + pub unsafe fn rotate_both(&self, values: float32x4_t) -> float32x4_t { + let temp = vrev64q_f32(values); + vreinterpretq_f32_u32(veorq_u32( + vreinterpretq_u32_f32(temp), + vreinterpretq_u32_f32(self.sign_both), + )) + } +} + +// Pack low (1st) complex +// left: r1.re, r1.im, r2.re, r2.im +// right: l1.re, l1.im, l2.re, l2.im +// --> r1.re, r1.im, l1.re, l1.im +#[inline(always)] +pub unsafe fn extract_lo_lo_f32(left: float32x4_t, right: float32x4_t) -> float32x4_t { + //_mm_shuffle_ps(left, right, 0x44) + vreinterpretq_f32_f64(vtrn1q_f64( + vreinterpretq_f64_f32(left), + vreinterpretq_f64_f32(right), + )) +} + +// Pack high (2nd) complex +// left: r1.re, r1.im, r2.re, r2.im +// right: l1.re, l1.im, l2.re, l2.im +// --> r2.re, r2.im, l2.re, l2.im +#[inline(always)] +pub unsafe fn extract_hi_hi_f32(left: float32x4_t, right: float32x4_t) -> float32x4_t { + vreinterpretq_f32_f64(vtrn2q_f64( + vreinterpretq_f64_f32(left), + vreinterpretq_f64_f32(right), + )) +} + +// Pack low (1st) and high (2nd) complex +// left: r1.re, r1.im, r2.re, r2.im +// right: l1.re, l1.im, l2.re, l2.im +// --> r1.re, r1.im, l2.re, l2.im +#[inline(always)] +pub unsafe fn extract_lo_hi_f32(left: float32x4_t, right: float32x4_t) -> float32x4_t { + vcombine_f32(vget_low_f32(left), vget_high_f32(right)) +} + +// Pack high (2nd) and low (1st) complex +// left: r1.re, r1.im, r2.re, r2.im +// right: l1.re, l1.im, l2.re, l2.im +// --> r2.re, r2.im, l1.re, l1.im +#[inline(always)] +pub unsafe fn extract_hi_lo_f32(left: float32x4_t, right: float32x4_t) -> float32x4_t { + vcombine_f32(vget_high_f32(left), vget_low_f32(right)) +} + +// Reverse complex +// values: a.re, a.im, b.re, b.im +// --> b.re, b.im, a.re, a.im +#[inline(always)] +pub unsafe fn reverse_complex_elements_f32(values: float32x4_t) -> float32x4_t { + vcombine_f32(vget_high_f32(values), vget_low_f32(values)) +} + +// Reverse complex and then negate hi complex +// values: a.re, a.im, b.re, b.im +// --> b.re, b.im, -a.re, -a.im +#[inline(always)] +pub unsafe fn reverse_complex_and_negate_hi_f32(values: float32x4_t) -> float32x4_t { + vcombine_f32(vget_high_f32(values), vneg_f32(vget_low_f32(values))) +} + +// Invert sign of high (2nd) complex +// values: a.re, a.im, b.re, b.im +// --> a.re, a.im, -b.re, -b.im +//#[inline(always)] +//pub unsafe fn negate_hi_f32(values: float32x4_t) -> float32x4_t { +// vcombine_f32(vget_low_f32(values), vneg_f32(vget_high_f32(values))) +//} + +// Duplicate low (1st) complex +// values: a.re, a.im, b.re, b.im +// --> a.re, a.im, a.re, a.im +#[inline(always)] +pub unsafe fn duplicate_lo_f32(values: float32x4_t) -> float32x4_t { + vreinterpretq_f32_f64(vtrn1q_f64( + vreinterpretq_f64_f32(values), + vreinterpretq_f64_f32(values), + )) +} + +// Duplicate high (2nd) complex +// values: a.re, a.im, b.re, b.im +// --> b.re, b.im, b.re, b.im +#[inline(always)] +pub unsafe fn duplicate_hi_f32(values: float32x4_t) -> float32x4_t { + vreinterpretq_f32_f64(vtrn2q_f64( + vreinterpretq_f64_f32(values), + vreinterpretq_f64_f32(values), + )) +} + +// transpose a 2x2 complex matrix given as [x0, x1], [x2, x3] +// result is [x0, x2], [x1, x3] +#[inline(always)] +pub unsafe fn transpose_complex_2x2_f32(left: float32x4_t, right: float32x4_t) -> [float32x4_t; 2] { + let temp02 = extract_lo_lo_f32(left, right); + let temp13 
= extract_hi_hi_f32(left, right); + [temp02, temp13] +} + +// Complex multiplication. +// Each input contains two complex values, which are multiplied in parallel. +#[inline(always)] +pub unsafe fn mul_complex_f32(left: float32x4_t, right: float32x4_t) -> float32x4_t { + // ARMv8.2-A introduced vcmulq_f32 and vcmlaq_f32 for complex multiplication, these intrinsics are not yet available. + let temp1 = vtrn1q_f32(right, right); + let temp2 = vtrn2q_f32(right, vnegq_f32(right)); + let temp3 = vmulq_f32(temp2, left); + let temp4 = vrev64q_f32(temp3); + vfmaq_f32(temp4, temp1, left) +} + +// __ __ _ _ __ _ _ _ _ _ +// | \/ | __ _| |_| |__ / /_ | || | | |__ (_) |_ +// | |\/| |/ _` | __| '_ \ _____ | '_ \| || |_| '_ \| | __| +// | | | | (_| | |_| | | | |_____| | (_) |__ _| |_) | | |_ +// |_| |_|\__,_|\__|_| |_| \___/ |_| |_.__/|_|\__| +// + +pub(crate) struct Rotate90F64 { + sign: float64x2_t, +} + +impl Rotate90F64 { + pub fn new(positive: bool) -> Self { + let sign = unsafe { + if positive { + vld1q_f64([-0.0, 0.0].as_ptr()) + } else { + vld1q_f64([0.0, -0.0].as_ptr()) + } + }; + Self { sign } + } + + #[inline(always)] + pub unsafe fn rotate(&self, values: float64x2_t) -> float64x2_t { + let temp = vcombine_f64(vget_high_f64(values), vget_low_f64(values)); + vreinterpretq_f64_u64(veorq_u64( + vreinterpretq_u64_f64(temp), + vreinterpretq_u64_f64(self.sign), + )) + } +} + +#[inline(always)] +pub unsafe fn mul_complex_f64(left: float64x2_t, right: float64x2_t) -> float64x2_t { + // ARMv8.2-A introduced vcmulq_f64 and vcmlaq_f64 for complex multiplication, these intrinsics are not yet available. + let temp = vcombine_f64(vneg_f64(vget_high_f64(left)), vget_low_f64(left)); + let sum = vmulq_laneq_f64::<0>(left, right); + vfmaq_laneq_f64::<1>(sum, temp, right) +} + +#[cfg(test)] +mod unit_tests { + use super::*; + use num_complex::Complex; + + #[test] + fn test_mul_complex_f64() { + unsafe { + let right = vld1q_f64([1.0, 2.0].as_ptr()); + let left = vld1q_f64([5.0, 7.0].as_ptr()); + let res = mul_complex_f64(left, right); + let expected = vld1q_f64([1.0 * 5.0 - 2.0 * 7.0, 1.0 * 7.0 + 2.0 * 5.0].as_ptr()); + assert_eq!( + std::mem::transmute::>(res), + std::mem::transmute::>(expected) + ); + } + } + + #[test] + fn test_mul_complex_f32() { + unsafe { + let val1 = Complex::::new(1.0, 2.5); + let val2 = Complex::::new(3.2, 4.75); + let val3 = Complex::::new(5.75, 6.25); + let val4 = Complex::::new(7.4, 8.5); + + let nbr2 = vld1q_f32([val3, val4].as_ptr() as *const f32); + let nbr1 = vld1q_f32([val1, val2].as_ptr() as *const f32); + let res = mul_complex_f32(nbr1, nbr2); + let res = std::mem::transmute::; 2]>(res); + let expected = [val1 * val3, val2 * val4]; + assert_eq!(res, expected); + } + } + + #[test] + fn test_pack() { + unsafe { + let nbr2 = vld1q_f32([5.0, 6.0, 7.0, 8.0].as_ptr()); + let nbr1 = vld1q_f32([1.0, 2.0, 3.0, 4.0].as_ptr()); + let first = extract_lo_lo_f32(nbr1, nbr2); + let second = extract_hi_hi_f32(nbr1, nbr2); + let first = std::mem::transmute::; 2]>(first); + let second = std::mem::transmute::; 2]>(second); + let first_expected = [Complex::new(1.0, 2.0), Complex::new(5.0, 6.0)]; + let second_expected = [Complex::new(3.0, 4.0), Complex::new(7.0, 8.0)]; + assert_eq!(first, first_expected); + assert_eq!(second, second_expected); + } + } +} diff --git a/src/neon/neon_vector.rs b/src/neon/neon_vector.rs new file mode 100644 index 00000000..a3fa0ae7 --- /dev/null +++ b/src/neon/neon_vector.rs @@ -0,0 +1,311 @@ +use core::arch::aarch64::*; +use num_complex::Complex; + +use 
crate::array_utils::{RawSlice, RawSliceMut}; + +// Read these indexes from a NeonArray and build an array of simd vectors. +// Takes a name of a vector to read from, and a list of indexes to read. +// This statement: +// ``` +// let values = read_complex_to_array!(input, {0, 1, 2, 3}); +// ``` +// is equivalent to: +// ``` +// let values = [ +// input.load_complex(0), +// input.load_complex(1), +// input.load_complex(2), +// input.load_complex(3), +// ]; +// ``` +macro_rules! read_complex_to_array { + ($input:ident, { $($idx:literal),* }) => { + [ + $( + $input.load_complex($idx), + )* + ] + } +} + +// Read these indexes from a NeonArray and build an array of partially filled simd vectors. +// Takes a name of a vector to read from, and a list of indexes to read. +// This statement: +// ``` +// let values = read_partial1_complex_to_array!(input, {0, 1, 2, 3}); +// ``` +// is equivalent to: +// ``` +// let values = [ +// input.load1_complex(0), +// input.load1_complex(1), +// input.load1_complex(2), +// input.load1_complex(3), +// ]; +// ``` +macro_rules! read_partial1_complex_to_array { + ($input:ident, { $($idx:literal),* }) => { + [ + $( + $input.load1_complex($idx), + )* + ] + } +} + +// Write these indexes of an array of simd vectors to the same indexes of a NeonArray. +// Takes a name of a vector to read from, one to write to, and a list of indexes. +// This statement: +// ``` +// let values = write_complex_to_array!(input, output, {0, 1, 2, 3}); +// ``` +// is equivalent to: +// ``` +// let values = [ +// output.store_complex(input[0], 0), +// output.store_complex(input[1], 1), +// output.store_complex(input[2], 2), +// output.store_complex(input[3], 3), +// ]; +// ``` +macro_rules! write_complex_to_array { + ($input:ident, $output:ident, { $($idx:literal),* }) => { + $( + $output.store_complex($input[$idx], $idx); + )* + } +} + +// Write the low half of these indexes of an array of simd vectors to the same indexes of a NeonArray. +// Takes a name of a vector to read from, one to write to, and a list of indexes. +// This statement: +// ``` +// let values = write_partial_lo_complex_to_array!(input, output, {0, 1, 2, 3}); +// ``` +// is equivalent to: +// ``` +// let values = [ +// output.store_partial_lo_complex(input[0], 0), +// output.store_partial_lo_complex(input[1], 1), +// output.store_partial_lo_complex(input[2], 2), +// output.store_partial_lo_complex(input[3], 3), +// ]; +// ``` +macro_rules! write_partial_lo_complex_to_array { + ($input:ident, $output:ident, { $($idx:literal),* }) => { + $( + $output.store_partial_lo_complex($input[$idx], $idx); + )* + } +} + +// Write these indexes of an array of simd vectors to the same indexes, multiplied by a stride, of a NeonArray. +// Takes a name of a vector to read from, one to write to, an integer stride, and a list of indexes. +// This statement: +// ``` +// let values = write_complex_to_array_strided!(input, output, 2, {0, 1, 2, 3}); +// ``` +// is equivalent to: +// ``` +// let values = [ +// output.store_complex(input[0], 0), +// output.store_complex(input[1], 2), +// output.store_complex(input[2], 4), +// output.store_complex(input[3], 6), +// ]; +// ``` +macro_rules! write_complex_to_array_strided { + ($input:ident, $output:ident, $stride:literal, { $($idx:literal),* }) => { + $( + $output.store_complex($input[$idx], $idx*$stride); + )* + } +} + +// A trait to handle reading from an array of complex floats into Neon vectors.
+// Neon works with 128-bit vectors, meaning a vector can hold two complex f32, +// or a single complex f64. +pub trait NeonArray { + type VectorType; + const COMPLEX_PER_VECTOR: usize; + // Load complex numbers from the array to fill a Neon vector. + unsafe fn load_complex(&self, index: usize) -> Self::VectorType; + // Load a single complex number from the array into a Neon vector, setting the unused elements to zero. + unsafe fn load_partial1_complex(&self, index: usize) -> Self::VectorType; + // Load a single complex number from the array, and copy it to all elements of a Neon vector. + unsafe fn load1_complex(&self, index: usize) -> Self::VectorType; +} + +impl NeonArray for RawSlice> { + type VectorType = float32x4_t; + const COMPLEX_PER_VECTOR: usize = 2; + + #[inline(always)] + unsafe fn load_complex(&self, index: usize) -> Self::VectorType { + debug_assert!(self.len() >= index + Self::COMPLEX_PER_VECTOR); + vld1q_f32(self.as_ptr().add(index) as *const f32) + } + + #[inline(always)] + unsafe fn load_partial1_complex(&self, index: usize) -> Self::VectorType { + debug_assert!(self.len() >= index + 1); + let temp = vmovq_n_f32(0.0); + vreinterpretq_f32_u64(vld1q_lane_u64::<0>( + self.as_ptr().add(index) as *const u64, + vreinterpretq_u64_f32(temp), + )) + } + + #[inline(always)] + unsafe fn load1_complex(&self, index: usize) -> Self::VectorType { + debug_assert!(self.len() >= index + 1); + vreinterpretq_f32_u64(vld1q_dup_u64(self.as_ptr().add(index) as *const u64)) + } +} + +impl NeonArray for RawSlice> { + type VectorType = float64x2_t; + const COMPLEX_PER_VECTOR: usize = 1; + + #[inline(always)] + unsafe fn load_complex(&self, index: usize) -> Self::VectorType { + debug_assert!(self.len() >= index + Self::COMPLEX_PER_VECTOR); + vld1q_f64(self.as_ptr().add(index) as *const f64) + } + + #[inline(always)] + unsafe fn load_partial1_complex(&self, _index: usize) -> Self::VectorType { + unimplemented!("Impossible to do a partial load of complex f64's"); + } + + #[inline(always)] + unsafe fn load1_complex(&self, _index: usize) -> Self::VectorType { + unimplemented!("Impossible to do a partial load of complex f64's"); + } +} + +// A trait to handle writing to an array of complex floats from Neon vectors. +// Neon works with 128-bit vectors, meaning a vector can hold two complex f32, +// or a single complex f64. +pub trait NeonArrayMut { + type VectorType; + const COMPLEX_PER_VECTOR: usize; + // Store all complex numbers from a Neon vector to the array. + unsafe fn store_complex(&self, vector: Self::VectorType, index: usize); + // Store the low complex number from a Neon vector to the array. + unsafe fn store_partial_lo_complex(&self, vector: Self::VectorType, index: usize); + // Store the high complex number from a Neon vector to the array. 
+ unsafe fn store_partial_hi_complex(&self, vector: Self::VectorType, index: usize); +} + +impl NeonArrayMut for RawSliceMut> { + type VectorType = float32x4_t; + const COMPLEX_PER_VECTOR: usize = 2; + + #[inline(always)] + unsafe fn store_complex(&self, vector: Self::VectorType, index: usize) { + debug_assert!(self.len() >= index + Self::COMPLEX_PER_VECTOR); + vst1q_f32(self.as_mut_ptr().add(index) as *mut f32, vector); + } + + #[inline(always)] + unsafe fn store_partial_hi_complex(&self, vector: Self::VectorType, index: usize) { + debug_assert!(self.len() >= index + 1); + let high = vget_high_f32(vector); + vst1_f32(self.as_mut_ptr().add(index) as *mut f32, high); + } + + #[inline(always)] + unsafe fn store_partial_lo_complex(&self, vector: Self::VectorType, index: usize) { + debug_assert!(self.len() >= index + 1); + let low = vget_low_f32(vector); + vst1_f32(self.as_mut_ptr().add(index) as *mut f32, low); + } +} + +impl NeonArrayMut for RawSliceMut> { + type VectorType = float64x2_t; + const COMPLEX_PER_VECTOR: usize = 1; + + #[inline(always)] + unsafe fn store_complex(&self, vector: Self::VectorType, index: usize) { + debug_assert!(self.len() >= index + Self::COMPLEX_PER_VECTOR); + vst1q_f64(self.as_mut_ptr().add(index) as *mut f64, vector); + } + + #[inline(always)] + unsafe fn store_partial_hi_complex(&self, _vector: Self::VectorType, _index: usize) { + unimplemented!("Impossible to do a partial store of complex f64's"); + } + #[inline(always)] + unsafe fn store_partial_lo_complex(&self, _vector: Self::VectorType, _index: usize) { + unimplemented!("Impossible to do a partial store of complex f64's"); + } +} + +#[cfg(test)] +mod unit_tests { + use super::*; + + use num_complex::Complex; + + use crate::array_utils::{RawSlice, RawSliceMut}; + + #[test] + fn test_load_f64() { + unsafe { + let val1: Complex = Complex::new(1.0, 2.0); + let val2: Complex = Complex::new(3.0, 4.0); + let val3: Complex = Complex::new(5.0, 6.0); + let val4: Complex = Complex::new(7.0, 8.0); + let values = vec![val1, val2, val3, val4]; + let slice = RawSlice::new(&values); + let load1 = slice.load_complex(0); + let load2 = slice.load_complex(1); + let load3 = slice.load_complex(2); + let load4 = slice.load_complex(3); + assert_eq!( + val1, + std::mem::transmute::>(load1) + ); + assert_eq!( + val2, + std::mem::transmute::>(load2) + ); + assert_eq!( + val3, + std::mem::transmute::>(load3) + ); + assert_eq!( + val4, + std::mem::transmute::>(load4) + ); + } + } + + #[test] + fn test_store_f64() { + unsafe { + let val1: Complex = Complex::new(1.0, 2.0); + let val2: Complex = Complex::new(3.0, 4.0); + let val3: Complex = Complex::new(5.0, 6.0); + let val4: Complex = Complex::new(7.0, 8.0); + + let nbr1 = vld1q_f64(&val1 as *const _ as *const f64); + let nbr2 = vld1q_f64(&val2 as *const _ as *const f64); + let nbr3 = vld1q_f64(&val3 as *const _ as *const f64); + let nbr4 = vld1q_f64(&val4 as *const _ as *const f64); + + let mut values: Vec> = vec![Complex::new(0.0, 0.0); 4]; + let slice: RawSliceMut> = RawSliceMut::new_transmuted(&mut values); + slice.store_complex(nbr1, 0); + slice.store_complex(nbr2, 1); + slice.store_complex(nbr3, 2); + slice.store_complex(nbr4, 3); + assert_eq!(val1, values[0]); + assert_eq!(val2, values[1]); + assert_eq!(val3, values[2]); + assert_eq!(val4, values[3]); + } + } +} diff --git a/src/plan.rs b/src/plan.rs index 040b7029..0b8d249c 100644 --- a/src/plan.rs +++ b/src/plan.rs @@ -9,6 +9,7 @@ use crate::algorithm::*; use crate::Fft; use crate::FftPlannerAvx; +use crate::FftPlannerNeon; use 
crate::FftPlannerSse; use crate::math_utils::{PrimeFactor, PrimeFactors}; @@ -17,6 +18,7 @@ enum ChosenFftPlanner { Scalar(FftPlannerScalar), Avx(FftPlannerAvx), Sse(FftPlannerSse), + Neon(FftPlannerNeon), // todo: If we add NEON, avx-512 etc support, add more enum variants for them here } @@ -69,6 +71,10 @@ impl FftPlanner { Self { chosen_planner: ChosenFftPlanner::Sse(sse_planner), } + } else if let Ok(neon_planner) = FftPlannerNeon::new() { + Self { + chosen_planner: ChosenFftPlanner::Neon(neon_planner), + } } else { Self { chosen_planner: ChosenFftPlanner::Scalar(FftPlannerScalar::new()), @@ -86,6 +92,7 @@ impl FftPlanner { ChosenFftPlanner::Scalar(scalar_planner) => scalar_planner.plan_fft(len, direction), ChosenFftPlanner::Avx(avx_planner) => avx_planner.plan_fft(len, direction), ChosenFftPlanner::Sse(sse_planner) => sse_planner.plan_fft(len, direction), + ChosenFftPlanner::Neon(neon_planner) => neon_planner.plan_fft(len, direction), } } diff --git a/tools/p2comparison_neon.py b/tools/p2comparison_neon.py new file mode 100644 index 00000000..6879902f --- /dev/null +++ b/tools/p2comparison_neon.py @@ -0,0 +1,59 @@ +import sys +import math +from matplotlib import pyplot as plt + + +with open(sys.argv[1]) as f: + lines = f.readlines() + +results = {"f32": {"scalar": {}, "neon": {}}, "f64": {"scalar": {}, "neon": {}}} + +for line in lines: + if line.startswith("test ") and not line.startswith("test result"): + name, result = line.split("... bench:") + name = name.split()[1] + _, length, ftype, algo = name.split("_") + value = float(result.strip().split(" ")[0].replace(",", "")) + results[ftype][algo][float(length)] = value + +lengths = sorted(list(results["f32"]["scalar"].keys())) + + +scalar_32 = [] +neon_32 = [] +for l in lengths: + sc32 = results["f32"]["scalar"][l] + nn32 = results["f32"]["neon"][l] + scalar_32.append(100.0) + neon_32.append(100.0 * sc32/nn32) + +scalar_64 = [] +neon_64 = [] +for l in lengths: + sc64 = results["f64"]["scalar"][l] + nn64 = results["f64"]["neon"][l] + scalar_64.append(100.0) + neon_64.append(100.0 * sc64/nn64) + +lengths = [math.log(l, 2) for l in lengths] + +plt.figure() +plt.plot(lengths, scalar_64, lengths, neon_64) +plt.title("f64") +plt.ylabel("relative speed, %") +plt.xlabel("log2(length)") +plt.xticks(list(range(4,23))) +plt.grid() +plt.legend(["scalar", "neon"]) + +plt.figure() +plt.plot(lengths, scalar_32, lengths, neon_32) +plt.title("f32") +plt.ylabel("relative speed, %") +plt.xlabel("log2(length)") +plt.legend(["scalar", "neon"]) +plt.xticks(list(range(4,23))) +plt.grid() +plt.show() + +
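To round out the `plan.rs` change above with a user-facing illustration (an editorial sketch, not part of the patch): callers keep using the generic `FftPlanner`, which per that hunk now tries the Neon planner after AVX and SSE and before the scalar fallback, so existing code picks up Neon acceleration on AArch64 builds with `neon-nightly` enabled. The length 1024 is arbitrary.

```rust
use rustfft::{num_complex::Complex, FftPlanner};

fn main() {
    // The generic planner dispatches internally: AVX, then SSE on x86_64,
    // then Neon on aarch64 (when compiled with the neon-nightly feature),
    // and finally the scalar planner if none of those are available.
    let mut planner = FftPlanner::<f32>::new();
    let fft = planner.plan_fft_forward(1024);

    let mut buffer = vec![Complex::new(0.0f32, 0.0); 1024];
    fft.process(&mut buffer);
}
```

Judging from its parsing code, `tools/p2comparison_neon.py` takes a saved `cargo bench` log as its only argument, expects benchmark names ending in `_<length>_<f32|f64>_<scalar|neon>`, and plots Neon throughput relative to the scalar baseline.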