// Package balanced implements an allocator that can sort allocations
// based on multiple metrics, where metrics may be an arbitrary way to
// partition a set of peers.
//
// For example, when allocating by ["tag:region", "disk"], the resulting peer
// candidate order will be balanced between regions and ordered by the weight
// of the disk metric.
package balanced

import (
	"context"
	"fmt"
	"sort"

	api "github.com/ipfs-cluster/ipfs-cluster/api"
	logging "github.com/ipfs/go-log/v2"
	peer "github.com/libp2p/go-libp2p/core/peer"
	rpc "github.com/libp2p/go-libp2p-gorpc"
)

var logger = logging.Logger("allocator")

// Allocator is an allocator that partitions metrics and orders
// the final list of allocations by selecting from each partition in turn.
type Allocator struct {
	config    *Config
	rpcClient *rpc.Client
}

// New returns an initialized Allocator.
func New(cfg *Config) (*Allocator, error) {
	err := cfg.Validate()
	if err != nil {
		return nil, err
	}

	return &Allocator{
		config: cfg,
	}, nil
}

// SetClient provides us with an rpc.Client which allows
// contacting other components in the cluster.
func (a *Allocator) SetClient(c *rpc.Client) {
	a.rpcClient = c
}

// Shutdown is called on cluster shutdown. We just invalidate
// any metrics from this point.
func (a *Allocator) Shutdown(ctx context.Context) error {
	a.rpcClient = nil
	return nil
}

type partitionedMetric struct {
	metricName       string
	curChoosingIndex int
	noMore           bool
	partitions       []*partition // they are in order of their values
}

type partition struct {
	value            string
	weight           int64
	aggregatedWeight int64
	peers            map[peer.ID]bool   // the bool tracks whether the peer has been picked already out of the partition when doing the final sort.
	sub              *partitionedMetric // all peers in sub-partitions will have the same value for this metric
}

// Returns a partitionedMetric which has partitions and subpartitions based
// on the metrics and values given by the "by" slice. The partitions
// are ordered based on the cumulative weight.
func partitionMetrics(set api.MetricsSet, by []string) *partitionedMetric {
	rootMetric := by[0]
	pnedMetric := &partitionedMetric{
		metricName: rootMetric,
		partitions: partitionValues(set[rootMetric]),
	}

	// For sorting based on weight (more to less)
	lessF := func(i, j int) bool {
		wi := pnedMetric.partitions[i].weight
		wj := pnedMetric.partitions[j].weight

		// if weight is equal, sort by aggregated weight of
		// all sub-partitions.
		if wi == wj {
			awi := pnedMetric.partitions[i].aggregatedWeight
			awj := pnedMetric.partitions[j].aggregatedWeight
			// If subpartitions weigh the same, do strict order
			// based on value string
			if awi == awj {
				return pnedMetric.partitions[i].value < pnedMetric.partitions[j].value
			}
			return awj < awi
		}
		// Descending!
		return wj < wi
	}

	if len(by) == 1 { // we are done
		sort.Slice(pnedMetric.partitions, lessF)
		return pnedMetric
	}

	// process sub-partitions
	for _, partition := range pnedMetric.partitions {
		filteredSet := make(api.MetricsSet)
		for k, v := range set {
			if k == rootMetric { // not needed anymore
				continue
			}
			for _, m := range v {
				// only leave metrics for peers in current partition
				if _, ok := partition.peers[m.Peer]; ok {
					filteredSet[k] = append(filteredSet[k], m)
				}
			}
		}

		partition.sub = partitionMetrics(filteredSet, by[1:])

		// Add the aggregated weight of the subpartitions
		for _, subp := range partition.sub.partitions {
			partition.aggregatedWeight += subp.aggregatedWeight
		}
	}
	sort.Slice(pnedMetric.partitions, lessF)
	return pnedMetric
}

func partitionValues(metrics []api.Metric) []*partition {
	partitions := []*partition{}

	if len(metrics) <= 0 {
		return partitions
	}

	// We group peers with the same value in the same partition.
	partitionsByValue := make(map[string]*partition)

	for _, m := range metrics {
		// Sometimes two metrics have the same value / weight, but we
		// still want to put them in different partitions. Otherwise
		// their weights get added and they form a bucket and
		// therefore they are not selected in order: 3 peers with
		// freespace=100 and one peer with freespace=200 would result
		// in one of the peers with freespace 100 being chosen first
		// because the partition's weight is 300.
		//
		// We are going to call these metrics (like free-space),
		// non-partitionable metrics. This is going to be the default
		// (for backwards compat reasons).
		//
		// The informers must set the Partitionable field accordingly
		// when two metrics with the same value must be grouped in the
		// same partition.
		//
		// Note: aggregatedWeight is the same as weight here (sum of
		// weight of all metrics in partitions), and gets updated
		// later in partitionMetrics with the aggregated weight of
		// sub-partitions.
		if !m.Partitionable {
			partitions = append(partitions, &partition{
				value:            m.Value,
				weight:           m.GetWeight(),
				aggregatedWeight: m.GetWeight(),
				peers: map[peer.ID]bool{
					m.Peer: false,
				},
			})
			continue
		}

		// In any other case, we partition by value.
		if p, ok := partitionsByValue[m.Value]; ok {
			p.peers[m.Peer] = false
			p.weight += m.GetWeight()
			p.aggregatedWeight += m.GetWeight()
		} else {
			partitionsByValue[m.Value] = &partition{
				value:            m.Value,
				weight:           m.GetWeight(),
				aggregatedWeight: m.GetWeight(),
				peers: map[peer.ID]bool{
					m.Peer: false,
				},
			}
		}
	}
	for _, p := range partitionsByValue {
		partitions = append(partitions, p)
	}
	return partitions
}
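
// examplePartitionValues is an illustrative sketch (not part of the upstream
// allocator): it shows the effect of the Partitionable flag described in the
// comment above. Peer IDs, metric names and weights are made up, and it
// assumes api.Metric exposes Name, Peer, Value, Weight and Partitionable
// fields as in the ipfs-cluster api package.
func examplePartitionValues() []*partition {
	// Non-partitionable metrics (the freespace case above): every metric
	// becomes its own partition, so three peers with freespace=100 never
	// outweigh a single peer with freespace=200.
	freespace := []api.Metric{
		{Name: "freespace", Peer: "pA", Value: "100", Weight: 100},
		{Name: "freespace", Peer: "pB", Value: "100", Weight: 100},
		{Name: "freespace", Peer: "pC", Value: "100", Weight: 100},
		{Name: "freespace", Peer: "pD", Value: "200", Weight: 200},
	}

	// Partitionable metrics (e.g. a region tag): peers with the same value
	// are grouped into a single partition whose weight is the sum.
	regions := []api.Metric{
		{Name: "tag:region", Peer: "pA", Value: "eu", Weight: 1, Partitionable: true},
		{Name: "tag:region", Peer: "pB", Value: "eu", Weight: 1, Partitionable: true},
	}
	_ = partitionValues(regions) // a single "eu" partition holding pA and pB

	// Four separate partitions with weights 100, 100, 100 and 200.
	return partitionValues(freespace)
}
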
// Returns a list of peers sorted by never choosing twice from the same
// partition if there is some other partition to choose from.
func (pnedm *partitionedMetric) sortedPeers() []peer.ID {
	peers := []peer.ID{}
	for {
		peer := pnedm.chooseNext()
		if peer == "" { // This means we are done.
			break
		}
		peers = append(peers, peer)
	}
	return peers
}

func (pnedm *partitionedMetric) chooseNext() peer.ID {
	lenp := len(pnedm.partitions)
	if lenp == 0 {
		return ""
	}

	if pnedm.noMore {
		return ""
	}

	var peer peer.ID

	curPartition := pnedm.partitions[pnedm.curChoosingIndex]
	done := 0
	for {
		if curPartition.sub != nil {
			// Choose something from the sub-partitionedMetric
			peer = curPartition.sub.chooseNext()
		} else {
			// We are a bottom partition. Choose one of our peers.
			for pid, used := range curPartition.peers {
				if !used {
					peer = pid
					curPartition.peers[pid] = true // mark as used
					break
				}
			}
		}
		// look in next partition next time
		pnedm.curChoosingIndex = (pnedm.curChoosingIndex + 1) % lenp
		curPartition = pnedm.partitions[pnedm.curChoosingIndex]
		done++

		if peer != "" {
			break
		}

		// no peer and we have looked in as many partitions as we have
		if done == lenp {
			pnedm.noMore = true
			break
		}
	}

	return peer
}
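
// exampleSortedPeers is an illustrative sketch (not part of the upstream
// allocator) of the package-level example: partitioning by
// ["tag:region", "freespace"] and draining the result with sortedPeers().
// The peers, regions and weights are made up, and it assumes api.Metric
// exposes Name, Peer, Value, Weight and Partitionable fields as in the
// ipfs-cluster api package.
func exampleSortedPeers() []peer.ID {
	set := api.MetricsSet{
		"tag:region": {
			{Name: "tag:region", Peer: "p1", Value: "eu", Weight: 1, Partitionable: true},
			{Name: "tag:region", Peer: "p2", Value: "eu", Weight: 1, Partitionable: true},
			{Name: "tag:region", Peer: "p3", Value: "us", Weight: 1, Partitionable: true},
		},
		"freespace": {
			{Name: "freespace", Peer: "p1", Value: "100", Weight: 100},
			{Name: "freespace", Peer: "p2", Value: "200", Weight: 200},
			{Name: "freespace", Peer: "p3", Value: "50", Weight: 50},
		},
	}

	// Regions are alternated and, within each region, peers come out by
	// descending free space: p2 (eu), then p3 (us), then p1 (eu).
	return partitionMetrics(set, []string{"tag:region", "freespace"}).sortedPeers()
}
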
// Allocate produces a sorted list of cluster peer IDs based on different
// metrics provided for those peer IDs.
// It works as follows:
//
// - First, it buckets each peer's metrics based on the AllocateBy list. The
// metric name must match the bucket name, otherwise they are put at the end.
// - Second, based on the AllocateBy order, it orders the first bucket and
// groups peers by ordered value.
// - Third, it selects metrics on the second bucket for the highest-priority
// peers of the first bucket and orders their metrics. Then for the peers in
// second position etc.
// - It repeats the process until there are no more buckets to sort.
// - Finally, it returns the list of peers in the resulting order, with peers
// from the priority set placed before the candidate peers.
func (a *Allocator) Allocate(
	ctx context.Context,
	c api.Cid,
	current, candidates, priority api.MetricsSet,
) ([]peer.ID, error) {

	// For the allocation to work well, there have to be metrics of all
	// the types for all the peers. There cannot be a metric of one type
	// for a peer that does not appear in the other types.
	//
	// Removing such occurrences is done in allocate.go, before the
	// allocator is called.
	//
	// Otherwise, the sorting might be funny.

	candidatePartition := partitionMetrics(candidates, a.config.AllocateBy)
	priorityPartition := partitionMetrics(priority, a.config.AllocateBy)

	logger.Debugf("Balanced allocator partitions:\n%s\n", printPartition(candidatePartition, 0))
	// fmt.Println(printPartition(candidatePartition, 0))

	first := priorityPartition.sortedPeers()
	last := candidatePartition.sortedPeers()

	return append(first, last...), nil
}
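
// exampleAllocate is an illustrative sketch (not part of the upstream
// allocator) of how Allocate is driven. The Config literal only sets
// AllocateBy and the Allocator is built directly rather than through
// New/Validate; the CID and the "current" metrics are zero values, which
// Allocate does not inspect when ordering candidates. All names here are
// for illustration only.
func exampleAllocate(ctx context.Context, candidates, priority api.MetricsSet) ([]peer.ID, error) {
	alloc := &Allocator{
		config: &Config{AllocateBy: []string{"tag:region", "freespace"}},
	}
	return alloc.Allocate(ctx, api.Cid{}, api.MetricsSet{}, candidates, priority)
}
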
// Metrics returns the names of the metrics that have been registered
// with this allocator.
func (a *Allocator) Metrics() []string {
	return a.config.AllocateBy
}

// printPartition returns a human-readable dump of a partitionedMetric tree,
// used for debug logging.
func printPartition(m *partitionedMetric, ind int) string {
	str := ""
	indent := func() {
		for i := 0; i < ind+2; i++ {
			str += " "
		}
	}

	for _, p := range m.partitions {
		indent()
		str += fmt.Sprintf(" | %s:%s - %d - [", m.metricName, p.value, p.weight)
		for p, u := range p.peers {
			str += fmt.Sprintf("%s|%t, ", p, u)
		}
		str += "]\n"
		if p.sub != nil {
			str += printPartition(p.sub, ind+2)
		}
	}
	return str
}