Channel Performance Patterns
Understanding channel bottlenecks, when to use channels vs alternatives like atomics and mutexes, and optimization patterns for concurrent Go programs.
Introduction
Channels are a powerful abstraction for goroutine communication, but they are not the fastest synchronization primitive. Go developers often reach for channels first, even in scenarios where simpler primitives would perform better. This article explores when channels are appropriate, when they're not, and how to optimize channel-based code.
The Performance Cost of Channels
Channels carry real overhead. Let's benchmark them against the alternatives:
package main
import (
"sync"
"sync/atomic"
"testing"
)
func BenchmarkChannelSendReceive(b *testing.B) {
ch := make(chan int, 1)
b.ResetTimer()
for i := 0; i < b.N; i++ {
ch <- 42
<-ch
}
}
func BenchmarkMutexLockUnlock(b *testing.B) {
var mu sync.Mutex
var value int
b.ResetTimer()
for i := 0; i < b.N; i++ {
mu.Lock()
value = 42
mu.Unlock()
mu.Lock()
_ = value
mu.Unlock()
}
}
func BenchmarkAtomicAddLoad(b *testing.B) {
var value atomic.Int64
b.ResetTimer()
for i := 0; i < b.N; i++ {
value.Add(1)
_ = value.Load()
}
}
func BenchmarkWaitGroupAdd(b *testing.B) {
var wg sync.WaitGroup
b.ResetTimer()
for i := 0; i < b.N; i++ {
wg.Add(1)
wg.Done()
}
}
Typical Results (ns/op):
- ChannelSendReceive: 500-800 ns/op
- MutexLockUnlock: 30-50 ns/op
- AtomicAddLoad: 5-10 ns/op
- WaitGroupAdd: 20-30 ns/op
Channels are 50-100x slower than atomic operations. This matters in hot paths.
When Channels Are Appropriate
Channels excel in specific scenarios:
1. Fan-Out/Fan-In Pattern
Distributing work across multiple goroutines and collecting results:
func FanOut(items []Item, numWorkers int) []Result {
itemChan := make(chan Item, len(items))
resultChan := make(chan Result, len(items))
// Start workers
var wg sync.WaitGroup
for i := 0; i < numWorkers; i++ {
wg.Add(1)
go func() {
defer wg.Done()
for item := range itemChan {
resultChan <- processItem(item)
}
}()
}
// Feed items
go func() {
for _, item := range items {
itemChan <- item
}
close(itemChan)
}()
// Collect results
go func() {
wg.Wait()
close(resultChan)
}()
results := make([]Result, 0, len(items))
for result := range resultChan {
results = append(results, result)
}
return results
}
Channels are ideal here because:
- They naturally coordinate multiple goroutines
- Closing a channel broadcasts to all receivers
- The work distribution pattern is clear and maintainable
2. Pipeline Stages
Chaining processing stages:
func Pipeline(input []int) {
// Stage 1: generate numbers
nums := make(chan int, 10)
go func() {
for _, n := range input {
nums <- n
}
close(nums)
}()
// Stage 2: square numbers
squares := make(chan int, 10)
go func() {
for n := range nums {
squares <- n * n
}
close(squares)
}()
// Stage 3: filter even
evens := make(chan int, 10)
go func() {
for n := range squares {
if n%2 == 0 {
evens <- n
}
}
close(evens)
}()
// Consume results
for n := range evens {
fmt.Println(n)
}
}
3. Signaling and Synchronization
One-time signals or broadcast events:
// Done signal to stop all goroutines
done := make(chan struct{})
go func() {
for {
select {
case <-done:
return
default:
doWork()
}
}
}()
// ... later
close(done) // Broadcast stop to all goroutines
4. Select-Based Multiplexing
Waiting on multiple channels:
results := make(chan Result, 1)
errors := make(chan error, 1)
timeout := time.After(5 * time.Second)
select {
case r := <-results:
fmt.Println("Got result:", r)
case err := <-errors:
fmt.Println("Got error:", err)
case <-timeout:
fmt.Println("Timed out")
}
When NOT to Use Channels
Case 1: Simple Counters
// Incorrect: using channel for counter
countCh := make(chan int)
go func() {
count := 0
for range countCh {
count++
}
}()
// Much slower than atomic
// Better: use atomic.Int64
var count atomic.Int64
count.Add(1)
Benchmark:
func BenchmarkChannelCounter(b *testing.B) {
ch := make(chan struct{})
go func() {
for range ch {
}
}()
b.ResetTimer()
for i := 0; i < b.N; i++ {
ch <- struct{}{}
}
close(ch)
}
func BenchmarkAtomicCounter(b *testing.B) {
var count atomic.Int64
b.ResetTimer()
for i := 0; i < b.N; i++ {
count.Add(1)
}
}
Results: Atomic is 50-100x faster.
Case 2: Protecting Shared State
// Over-engineered with channels
stateCh := make(chan State)
go func() {
state := State{}
for newState := range stateCh {
state = newState
}
}()
// Better: use sync.Mutex
var mu sync.Mutex
var state State
func UpdateState(s State) {
mu.Lock()
state = s
mu.Unlock()
}
Case 3: One-Time Initialization
// Inefficient with channel
var initialized bool
var initCh = make(chan bool, 1)
func Init() {
if !initialized { // also racy: two goroutines can both pass this check
// expensive setup
initialized = true
initCh <- true
}
}
// Better: use sync.Once
var once sync.Once
var instance *Expensive
func GetInstance() *Expensive {
once.Do(func() {
instance = createExpensive()
})
return instance
}
Buffered vs Unbuffered Channels
The choice between buffered and unbuffered channels affects goroutine blocking patterns:
// Unbuffered: sender blocks until receiver ready
unbuffered := make(chan int)
unbuffered <- 42 // Blocks until someone receives
// Buffered: sender blocks only when buffer full
buffered := make(chan int, 10)
buffered <- 42 // Doesn't block while the buffer has room
Benchmark comparing throughput:
func BenchmarkUnbufferedChannel(b *testing.B) {
ch := make(chan int)
go func() {
for range ch {
}
}()
b.ResetTimer()
for i := 0; i < b.N; i++ {
ch <- i
}
close(ch)
}
func BenchmarkBufferedChannel(b *testing.B) {
ch := make(chan int, 1000)
go func() {
for range ch {
}
}()
b.ResetTimer()
for i := 0; i < b.N; i++ {
ch <- i
}
close(ch)
}
Results: Buffered channels have higher throughput (fewer context switches), but unbuffered channels give tighter synchronization.
Choose buffering based on your pattern: unbuffered for tight synchronization, buffered for decoupling sender and receiver.
Try-Send and Try-Receive Patterns
The Go compiler lowers a two-case select with a default clause to dedicated runtime fast paths (selectnbsend and selectnbrecv), making these patterns cheaper than a general select:
// Try-send: send if non-blocking, skip otherwise
select {
case ch <- value:
fmt.Println("Sent successfully")
default:
fmt.Println("Channel full, skipped")
}
// Try-receive: receive if available, skip otherwise
select {
case v := <-ch:
fmt.Println("Received:", v)
default:
fmt.Println("Channel empty, skipped")
}The compiler recognizes these patterns and optimizes them more than regular select statements.
Benchmark:
func BenchmarkTrySend(b *testing.B) {
ch := make(chan int, 1)
b.ResetTimer()
for i := 0; i < b.N; i++ {
select {
case ch <- i:
default:
}
}
}
func BenchmarkSend(b *testing.B) {
ch := make(chan int, 1)
b.ResetTimer()
for i := 0; i < b.N; i++ {
ch <- i
}
}
Even with the fast path, try-send is slower than a plain send; use it only when you actually need non-blocking behavior.
Channel Value Types: Value vs Pointer
What you send through a channel matters:
type Packet struct {
Header [64]byte
Payload [4096]byte
}
// Sends copy of entire Packet (4160 bytes each send)
largeChan := make(chan Packet, 10)
largeChan <- packet
// Sends only pointer (8 bytes each send)
ptrChan := make(chan *Packet, 10)
ptrChan <- &packet
Benchmark:
type SmallValue struct {
A int64
B int64
}
type LargeValue struct {
Data [1000]int64
}
func BenchmarkChannelSmallValue(b *testing.B) {
ch := make(chan SmallValue, 100)
go func() {
for range ch {
}
}()
b.ResetTimer()
for i := 0; i < b.N; i++ {
ch <- SmallValue{A: 1, B: 2}
}
close(ch)
}
func BenchmarkChannelLargeValue(b *testing.B) {
ch := make(chan LargeValue, 100)
go func() {
for range ch {
}
}()
b.ResetTimer()
for i := 0; i < b.N; i++ {
ch <- LargeValue{}
}
close(ch)
}
func BenchmarkChannelPointer(b *testing.B) {
ch := make(chan *LargeValue, 100)
v := &LargeValue{}
go func() {
for range ch {
}
}()
b.ResetTimer()
for i := 0; i < b.N; i++ {
ch <- v
}
close(ch)
}
Results: The pointer channel is much faster than the large-value channel.
Send pointers through channels for large types. For small types, value vs pointer is less critical.
Consolidating Multiple Channels
Sometimes you need to create goroutines dynamically and coordinate them:
// Instead of one channel per worker:
ch1 := make(chan Result)
ch2 := make(chan Result)
ch3 := make(chan Result)
select {
case r := <-ch1:
handleResult(r)
case r := <-ch2:
handleResult(r)
case r := <-ch3:
handleResult(r)
}
// Better: one shared results channel fed by all workers
jobs := make(chan Job)
results := make(chan Result, 10)
var wg sync.WaitGroup
for i := 0; i < numWorkers; i++ {
wg.Add(1)
go func() {
defer wg.Done()
for job := range jobs {
results <- process(job)
}
}()
}
// Feed jobs, then close results once every worker has finished
go func() {
for _, job := range allJobs { // allJobs: the work to distribute
jobs <- job
}
close(jobs)
wg.Wait()
close(results)
}()
for result := range results {
handleResult(result)
}
This reduces the number of channels to track and the number of case clauses in select statements.
Closing Channels as Broadcast
Closing a channel is a cheap broadcast: a single close wakes every receiver blocked on the channel.
// Broadcast: close channel to signal all receivers
done := make(chan struct{})
for i := 0; i < 1000; i++ {
go func() {
<-done // All goroutines wake up when done closes
}()
}
close(done) // Single operation wakes all 1000 goroutines
This is efficient because:
- No data is sent
- All receivers wake up simultaneously
- No per-receiver overhead
Compare with sending:
// Inefficient: must send 1000 times
for i := 0; i < 1000; i++ {
signal <- struct{}{}
}
// Efficient: close once
close(signal)
Practical Example: Rate Limiter
A practical use case where channels shine—rate limiting:
type RateLimiter struct {
tokens chan struct{}
ticker *time.Ticker
}
func NewRateLimiter(rps int) *RateLimiter {
rl := &RateLimiter{
tokens: make(chan struct{}, rps),
ticker: time.NewTicker(time.Second / time.Duration(rps)),
}
go func() {
for range rl.ticker.C {
select {
case rl.tokens <- struct{}{}:
default:
// Token not taken, skip
}
}
}()
return rl
}
func (rl *RateLimiter) Wait() {
<-rl.tokens
}
func (rl *RateLimiter) Stop() {
rl.ticker.Stop()
// Don't close(rl.tokens) here: the refill goroutine may still be
// sending on it, and a send on a closed channel panics.
}
Channels are perfect here because:
- Natural representation of available permits
- Works well with goroutines making requests
- Clear semantics
sync.Pool vs Channel-Based Pooling
For object pooling, sync.Pool is faster than channel-based approaches:
// Channel-based pool (slower)
type ChannelPool struct {
buffers chan *bytes.Buffer
}
func NewChannelPool(size int) *ChannelPool {
return &ChannelPool{
buffers: make(chan *bytes.Buffer, size),
}
}
func (cp *ChannelPool) Get() *bytes.Buffer {
select {
case buf := <-cp.buffers:
return buf
default:
return &bytes.Buffer{}
}
}
func (cp *ChannelPool) Put(buf *bytes.Buffer) {
buf.Reset()
select {
case cp.buffers <- buf:
default:
}
}
// sync.Pool (faster)
var bufferPool = sync.Pool{
New: func() interface{} {
return &bytes.Buffer{}
},
}
func GetBuffer() *bytes.Buffer {
return bufferPool.Get().(*bytes.Buffer)
}
func PutBuffer(buf *bytes.Buffer) {
buf.Reset()
bufferPool.Put(buf)
}
Benchmark:
func BenchmarkChannelPool(b *testing.B) {
pool := NewChannelPool(1000)
b.ResetTimer()
for i := 0; i < b.N; i++ {
buf := pool.Get()
buf.WriteString("test")
pool.Put(buf)
}
}
func BenchmarkSyncPool(b *testing.B) {
b.ResetTimer()
for i := 0; i < b.N; i++ {
buf := GetBuffer()
buf.WriteString("test")
PutBuffer(buf)
}
}
Results: sync.Pool is 3-5x faster due to per-CPU pooling.
Use sync.Pool for object pooling, not channels.
Summary and Recommendations
- Use channels for coordination: fan-out/fan-in, pipelines, signaling, multiplexing.
- Avoid channels for counters and shared state: use atomic.Int64 or sync.Mutex.
- Use sync.Once for one-time initialization, not channels.
- Use sync.Pool for object pooling, not channel-based pools.
- Send pointers for large types: this avoids copying the value on every send.
- Use buffered channels judiciously: they increase throughput but can mask coordination issues.
- Use try-send/try-receive sparingly: they carry overhead; reach for them only when you need non-blocking behavior.
- Exploit channel closing for broadcast: a single close wakes every receiver.
- Profile before optimizing: channels are slow relative to atomics, but if they're not in the hot path, it doesn't matter.
- Prefer clarity over micro-optimization: channels are powerful for goroutine coordination. Use them where they make the code clearer, even if other primitives are theoretically faster.
The key insight: channels are about coordination and simplicity, not raw performance. When you need lightweight synchronization, atomics and mutexes are faster. When you need clear communication patterns between multiple goroutines, channels are often the right choice despite being slower.