Delete some unused kernels

MegEngine · Oct 10, 2023 · 405d866 · 405d866
1 parent 673ce0e
commit 405d866
Show file tree

Hide file tree

Showing 3 changed files with 0 additions and 432 deletions.
diff --git a/src/kern/optimized/arm/kernel.cpp b/src/kern/optimized/arm/kernel.cpp
@@ -204,47 +204,6 @@ TaskSet llm_matmul_compute_int4_float(
     return TaskSet{{task1, M}, {task2, N}};
 }
 
-TaskSet llm_matmul_compute_int4_float_packed(
-        float* dst, const void* src0, const float* bias, const float* src1, uint32_t M,
-        uint32_t N, uint32_t K, void* workspace, uint32_t size) {
-    //! src0 is the quantized weights, stored in blocks of 32 values where each
-    //! block shares one scale; src1 is the feature map. src0 layout is {N, K},
-    //! src1 layout is {M, K}, and dst is {M, N}
-    INFER_ASSERT(sizeof(float) * K <= size, "workspace is not enough.");
-    uint32_t weight_q40_stride =
-            K * dtype_in_byte(DType::Int4) / dtype_block_size(DType::Int4);
-    uint32_t weight_q80_stride =
-            K * dtype_in_byte(DType::Int8) / dtype_block_size(DType::Int8);
-    //! quantize the input with quantize_row_q8_0 and store it in workspace;
-    //! because the input is smaller than the weights, quantizing the input
-    //! will reduce the memory traffic
-    auto task1 = [=](const TaskId& id) {
-        for (uint32_t m = id.start; m < id.end; m++) {
-            BlockQ80* q_src1 =
-                    (BlockQ80*)(static_cast<uint8_t*>(workspace) + m * weight_q80_stride);
-            quantize_row_q8_0(src1 + m * K, q_src1, K);
-        }
-    };
-    int8_t* q_src = static_cast<int8_t*>(workspace);
-    auto task2 = [=](const TaskId& id) {
-        for (uint32_t n = id.start; n < id.end; n++) {
-            const void* q_weight =
-                    static_cast<const uint8_t*>(src0) + n * 8 * weight_q40_stride;
-            for (uint32_t m = 0; m < M; m++) {
-                int8_t* src = q_src + m * weight_q80_stride;
-                float* dst_ptr = dst + m * N + n * 8;
-                const float* bias_ptr = bias ? bias + n * 8 : nullptr;
-#if defined(__ARM_FEATURE_DOTPROD)
-                vec_vec_dot_q40_with_q80_packed(K, q_weight, src, dst_ptr, bias_ptr);
-                // vec_vec_dot_q40_with_q80_packed_asm(K, q_weight, src, dst_ptr,
-                // bias_ptr);
-#endif  // __ARM_FEATURE_DOTPROD: when it is undefined no kernel is called in this loop
-            }
-        }
-    };
-    return TaskSet{{task1, M}, {task2, N / 8}};  // NOTE(review): each task2 id covers 8 outputs; N / 8 truncates, so N is presumably a multiple of 8 - confirm
-}
-
 size_t llm_matmul_get_workspace_float(uint32_t, uint32_t M, uint32_t N, uint32_t K) {
     return sizeof(float) * K * M;  //! bytes for M * K floats; N and the unnamed first argument are not used
 }

diff --git a/src/kern/optimized/arm/kernel.h b/src/kern/optimized/arm/kernel.h
@@ -50,7 +50,6 @@ PartialImplementKernel(
 PartialImplementKernel(RmsNormFloat, llm_rms_norm_compute_float);
 PartialImplementKernel(EmbeddingGetInt4Float, llm_embedding_get_int4_float);
 PartialImplementKernel(MatmulInt4Float, llm_matmul_compute_int4_float);
-PartialImplementKernel(MatmulInt4FloatPacked, llm_matmul_compute_int4_float_packed);
 PartialImplementKernel(
         MatmulWithHeadStrideFloat, llm_matmul_compute_with_head_stride_float);
 PartialImplementKernel(HeadBatchedMatmulFloat, llm_head_batched_matmul_compute_float);