import numpy as np
import torch


class TorchServerOptimizer(object):
"""This is a abstract server optimizer class
Args:
mode (string): mode of gradient aggregation policy
args (distionary): Variable arguments for fedscale runtime config. defaults to the setup in arg_parser.py
device (string): Runtime device type
sample_seed (int): Random seed
"""
def __init__(self, mode, args, device, sample_seed=233):
self.mode = mode
self.args = args
self.device = device
if mode == 'fed-yogi':
from fedscale.utils.optimizer.yogi import YoGi
self.gradient_controller = YoGi(
eta=args.yogi_eta, tau=args.yogi_tau, beta=args.yogi_beta, beta2=args.yogi_beta2)
    def update_round_gradient(self, last_model, current_model, target_model):
        """Update the global model according to the configured aggregation policy.

        Args:
            last_model (list of tensor weight): A list of global model weights from the last round.
            current_model (list of tensor weight): A list of global model weights from this round.
            target_model (PyTorch or TensorFlow nn module): Aggregated model.

        """
if self.mode == 'fed-yogi':
"""
"Adaptive Federated Optimizations",
Sashank J. Reddi, Zachary Charles, Manzil Zaheer, Zachary Garrett, Keith Rush, Jakub Konecný, Sanjiv Kumar, H. Brendan McMahan,
ICLR 2021.
"""
last_model = [x.to(device=self.device) for x in last_model]
current_model = [x.to(device=self.device) for x in current_model]
diff_weight = self.gradient_controller.update(
[pb-pa for pa, pb in zip(last_model, current_model)])
for idx, param in enumerate(target_model.parameters()):
param.data = last_model[idx] + diff_weight[idx]
elif self.mode == 'q-fedavg':
"""
"Fair Resource Allocation in Federated Learning", Tian Li, Maziar Sanjabi, Ahmad Beirami, Virginia Smith, ICLR 2020.
"""
learning_rate, qfedq = self.args.learning_rate, self.args.qfed_q
Deltas, hs = None, 0.
last_model = [x.to(device=self.device) for x in last_model]
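            # Note: self.client_training_results is not set in __init__; the
            # aggregator is expected to attach per-client results (each carrying
            # 'update_weight' and 'moving_loss') before this method is called.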
for result in self.client_training_results:
# plug in the weight updates into the gradient
grads = [(u - torch.from_numpy(v).to(device=self.device)) * 1.0 /
learning_rate for u, v in zip(last_model, result['update_weight'])]
loss = result['moving_loss']
if Deltas is None:
Deltas = [np.float_power(
loss+1e-10, qfedq) * grad for grad in grads]
else:
for idx in range(len(Deltas)):
Deltas[idx] += np.float_power(loss +
1e-10, qfedq) * grads[idx]
                # estimation of the local Lipschitz constant
hs += (qfedq * np.float_power(loss+1e-10, (qfedq-1)) * torch.sum(torch.stack([torch.square(
grad).sum() for grad in grads])) + (1.0/learning_rate) * np.float_power(loss+1e-10, qfedq))
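            # q-FedAvg aggregate step (Li et al., 2020):
            #   w_{t+1} = w_t - (sum_k Delta_k) / (sum_k h_k)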
# update global model
for idx, param in enumerate(target_model.parameters()):
param.data = last_model[idx] - Deltas[idx]/(hs+1e-10)
else:
# The default optimizer, FedAvg, has been applied in aggregator.py on the fly
pass
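

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustration only, not part of FedScale itself): the
# hyperparameter values and the toy model below are hypothetical stand-ins;
# in FedScale, `args` is populated by arg_parser.py and the weight lists come
# from the aggregator.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import argparse

    import torch.nn as nn

    args = argparse.Namespace(
        yogi_eta=3e-3, yogi_tau=1e-8, yogi_beta=0.9, yogi_beta2=0.99)
    target_model = nn.Linear(4, 2)

    server_opt = TorchServerOptimizer('fed-yogi', args, device='cpu')

    # Pretend these weight lists came from the previous and the current round.
    last_model = [p.data.clone() for p in target_model.parameters()]
    current_model = [w + 0.01 * torch.randn_like(w) for w in last_model]

    # Applies the FedYogi update to target_model's parameters in place.
    server_opt.update_round_gradient(last_model, current_model, target_model)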