Source code for toil.lib.ec2nodes

# Copyright (C) 2015-2021 Regents of the University of California
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import datetime
import json
import logging
import os
import re
import textwrap
from typing import Any, Dict, List, Tuple, Union

import requests

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Directory containing this module; the generated EC2 list file and the
# temporary AWS index JSON are read from and written to this directory.
dirname = os.path.dirname(__file__)


# Maps each EC2 region code (e.g. 'us-west-2') to the human-readable location
# name. The location name is what gets passed as ``region`` to
# fetchEC2InstanceDict(), where it is compared against each product's
# 'location' attribute in the AWS billing JSON.
EC2Regions = {'us-west-1': 'US West (N. California)',
              'us-west-2': 'US West (Oregon)',
              'us-east-1': 'US East (N. Virginia)',
              'us-east-2': 'US East (Ohio)',
              'us-gov-west-1': 'AWS GovCloud (US)',
              'ca-central-1': 'Canada (Central)',
              'ap-northeast-1': 'Asia Pacific (Tokyo)',
              'ap-northeast-2': 'Asia Pacific (Seoul)',
              'ap-northeast-3': 'Asia Pacific (Osaka-Local)',
              'ap-southeast-1': 'Asia Pacific (Singapore)',
              'ap-southeast-2': 'Asia Pacific (Sydney)',
              'ap-south-1': 'Asia Pacific (Mumbai)',
              'eu-west-1': 'EU (Ireland)',
              'eu-west-2': 'EU (London)',
              'eu-west-3': 'EU (Paris)',
              'eu-central-1': 'EU (Frankfurt)',
              'sa-east-1': 'South America (Sao Paulo)'}


class InstanceType:
    """
    Description of a single EC2 instance type.

    Two instances compare equal when all six attributes are equal, and equal
    instances hash equally, so instances can be stored in sets and used as
    dict keys. The attributes are writable; do not mutate an instance while
    it is a member of a set or a key of a dict.
    """

    __slots__ = ('name', 'cores', 'memory', 'disks', 'disk_capacity', 'architecture')

    def __init__(self, name: str, cores: int, memory: float, disks: float, disk_capacity: float, architecture: str):
        """
        :param name: The API name of the instance type.
        :param cores: The number of cores.
        :param memory: RAM in GiB.
        :param disks: The number of ephemeral (aka 'instance store') volumes.
        :param disk_capacity: The capacity of each ephemeral volume in GiB.
        :param architecture: The architecture of the instance type. Can be
            either amd64 or arm64.
        """
        self.name = name  # the API name of the instance type
        self.cores = cores  # the number of cores
        self.memory = memory  # RAM in GiB
        self.disks = disks  # the number of ephemeral (aka 'instance store') volumes
        self.disk_capacity = disk_capacity  # the capacity of each ephemeral volume in GiB
        self.architecture = architecture  # the architecture of the instance type. Can be either amd64 or arm64

    def __str__(self) -> str:
        """Return a multi-line, human-readable summary of the instance type."""
        return ("Type: {}\n"
                "Cores: {}\n"
                "Disks: {}\n"
                "Memory: {}\n"
                "Disk Capacity: {}\n"
                "Architecture: {}\n"
                "".format(
                    self.name,
                    self.cores,
                    self.disks,
                    self.memory,
                    self.disk_capacity,
                    self.architecture))

    def __eq__(self, other: object) -> bool:
        """Return True if ``other`` is an InstanceType with identical attributes."""
        if not isinstance(other, InstanceType):
            return NotImplemented
        return (self.name == other.name and
                self.cores == other.cores and
                self.memory == other.memory and
                self.disks == other.disks and
                self.disk_capacity == other.disk_capacity and
                self.architecture == other.architecture)

    def __hash__(self) -> int:
        """
        Hash over the same attributes that __eq__ compares.

        Defining __eq__ alone would implicitly set __hash__ to None and make
        instances unhashable.
        """
        return hash((self.name, self.cores, self.memory, self.disks,
                     self.disk_capacity, self.architecture))
def isNumber(s: str) -> bool:
    """
    Report whether a unicode string (commas are ignored) represents a number.

    A string counts as a number if, once commas are removed, either float()
    accepts it or it is a single character with a unicode numeric value.

    :param s: Any unicode string.
    :return: True if s represents a number, False otherwise.
    """
    import unicodedata

    candidate = s.replace(',', '')

    # First attempt: anything float() can parse.
    try:
        float(candidate)
    except ValueError:
        pass
    else:
        return True

    # Second attempt: a single unicode numeric character. numeric() raises
    # TypeError unless given exactly one character, and ValueError for a
    # character that has no numeric value.
    try:
        unicodedata.numeric(candidate)
    except (TypeError, ValueError):
        return False
    return True
def _parseStorageNumber(token: str) -> Union[float, None]:
    """Return ``token`` (commas allowed) as a float, or None if float() rejects it."""
    try:
        return float(token.replace(',', ''))
    except ValueError:
        return None


def parseStorage(storageData: str) -> Union[List[int], Tuple[Union[int, float], float]]:
    """
    Parses EC2 JSON storage param string into a number.

    Examples:
        "2 x 160 SSD"
        "3 x 2000 HDD"
        "EBS only"
        "1 x 410"
        "8 x 1.9 NVMe SSD"
        "900 GB NVMe SSD"

    :param str storageData: EC2 JSON storage param string.
    :return: Two numbers representing: (# of disks), and (disk_capacity in GiB of each disk).
        "EBS only" yields the list [0, 0]; every other recognised form yields a tuple.
    :raises RuntimeError: If the string matches none of the known formats,
        including strings that are too short or whose numeric fields do not
        parse.
    """
    if storageData == "EBS only":
        return [0, 0]

    specs = storageData.strip().split()

    # "<count> x <capacity> [...]". Check the length first so that a
    # truncated string is reported as a format error, not an IndexError.
    if len(specs) >= 3 and specs[1] == 'x':
        count = _parseStorageNumber(specs[0])
        capacity = _parseStorageNumber(specs[2])
        if count is not None and capacity is not None:
            return count, capacity
    # "<capacity> GB NVMe SSD": a single disk.
    elif len(specs) >= 4 and specs[1:4] == ['GB', 'NVMe', 'SSD']:
        capacity = _parseStorageNumber(specs[0])
        if capacity is not None:
            return 1, capacity

    raise RuntimeError('EC2 JSON format has likely changed. Error parsing disk specs.')
def parseMemory(memAttribute: str) -> float:
    """
    Returns EC2 'memory' string as a float.

    Format should always be '#' GiB (example: '244 GiB' or '1,952 GiB').
    Amazon loves to put commas in their numbers, so we have to accommodate that.
    If the syntax ever changes, this will raise.

    :param memAttribute: EC2 JSON memory param string.
    :return: A float representing memory in GiB.
    :raises RuntimeError: If the unit is missing or is not 'GiB', or if the
        amount is not a number.
    """
    mem = memAttribute.replace(',', '').split()
    # Require both an amount and the 'GiB' unit; a string with fewer than two
    # fields is a format change, not something to surface as an IndexError.
    if len(mem) < 2 or mem[1] != 'GiB':
        raise RuntimeError('EC2 JSON format has likely changed. Error parsing memory.')
    try:
        return float(mem[0])
    except ValueError as e:
        raise RuntimeError('EC2 JSON format has likely changed. Error parsing memory.') from e
def fetchEC2Index(filename: str) -> None:
    """
    Download the AWS EC2 billing index JSON and save it to a file.

    The index is fetched from the AWS pricing API, parsed, and written back
    out re-serialized with a 4-space indent.

    See: https://aws.amazon.com/blogs/aws/new-aws-price-list-api/

    :param filename: Path of the file to write the JSON to.
    :return: Nothing. Writes the downloaded JSON to ``filename``.
    :raises RuntimeError: If the HTTP response is not OK.
    """
    print('Downloading ~1Gb AWS billing file to parse for information.\n')

    response = requests.get('https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/AmazonEC2/current/index.json')
    if not response.ok:
        raise RuntimeError(f'Error: {response} :: {response.text}')

    with open(filename, 'w') as f:
        billingIndex = json.loads(response.text)
        f.write(json.dumps(billingIndex, indent=4))
    print('Download completed successfully!\n')
def fetchEC2InstanceDict(awsBillingJson: Dict[str, Any], region: str) -> Dict[str, InstanceType]:
    """
    Takes the AWS billing JSON and returns a dict of InstanceType objects
    representing the EC2 instance types offered in one region.

    Only shared-tenancy Linux products with the plain "RunInstances" operation
    and a "BoxUsage:<instance type>" usage type are considered.

    :param awsBillingJson: The parsed AWS billing JSON; its 'products' mapping
        is scanned, reading each product's 'attributes'.
    :param region: The location name to select, compared against each
        product's 'location' attribute (e.g. 'US West (Oregon)').
    :return: A dict mapping instance type name (example: 't2.micro') to the
        InstanceType object describing it.
    :raises RuntimeError: If two identical instance types are found for the
        region.
    """
    ec2InstanceList = []
    for product in awsBillingJson['products'].values():
        attributes = product['attributes']
        # NOTES:
        #
        # 3 tenant types: 'Host' (always $0.00; just a template?)
        #                 'Dedicated' (toil does not support; these are pricier)
        #                 'Shared' (AWS default and what toil uses)
        #
        # The same instance can appear with multiple "operation" values;
        # "RunInstances" is normal
        # "RunInstances:<code>" is e.g. Linux with MS SQL Server installed.
        if not (attributes.get('location') == region and
                attributes.get('tenancy') == 'Shared' and
                attributes.get('operatingSystem') == 'Linux' and
                attributes.get('operation') == 'RunInstances'):
            continue

        # A record without a usage type or an instance type cannot be a
        # normal on-demand instance entry; skip it instead of crashing.
        instanceType = attributes.get('instanceType')
        usageType = attributes.get('usagetype')
        if instanceType is None or usageType is None:
            continue

        # not reserved or unused
        if not usageType.endswith('BoxUsage:' + instanceType):
            continue

        disks, disk_capacity = parseStorage(attributes["storage"])

        # Determines whether the instance type is from an ARM or AMD family
        # ARM instance names include a digit followed by a 'g' before the instance size
        architecture = 'arm64' if re.search(r".*\dg.*\..*", instanceType) else 'amd64'

        # NOTE(review): 'vcpu' is passed through exactly as it appears in the
        # JSON, with no int() conversion - confirm its type against callers.
        instance = InstanceType(name=instanceType,
                                cores=attributes["vcpu"],
                                memory=parseMemory(attributes["memory"]),
                                disks=disks,
                                disk_capacity=disk_capacity,
                                architecture=architecture)
        if instance in ec2InstanceList:
            raise RuntimeError('EC2 JSON format has likely changed. '
                               'Duplicate instance {} found.'.format(instance))
        ec2InstanceList.append(instance)

    print('Finished for ' + str(region) + '. ' + str(len(ec2InstanceList)) + ' added.')
    return {instance.name: instance for instance in ec2InstanceList}
def _renderGeneratedEC2Lists(instanceTypes: List["InstanceType"],
                             instancesByRegion: Dict[str, List[str]]) -> str:
    """Render everything in generatedEC2Lists.py that follows the license header and import."""
    parts = ["# {num} Instance Types. Generated {date}.\n".format(
        num=str(len(instanceTypes)), date=str(datetime.datetime.now()))]

    # The dict of all instance types, sorted by name.
    parts.append("E2Instances = {\n")
    for i in sorted(instanceTypes, key=lambda x: x.name):
        parts.append(" '{name}': InstanceType(name='{name}', cores={cores}, memory={memory}, disks={disks}, disk_capacity={disk_capacity}, architecture='{architecture}'),"
                     "\n".format(name=i.name, cores=i.cores, memory=i.memory, disks=i.disks, disk_capacity=i.disk_capacity, architecture=i.architecture))
    parts.append('}\n\n')

    # The dict of instance type names per region. Joining the entries puts
    # separators only between items, so nothing has to be trimmed afterwards.
    parts.append('regionDict = {\n')
    regionLines = []
    for regionName, instanceList in instancesByRegion.items():
        quotedNames = ', '.join(f"'{instance}'" for instance in sorted(instanceList))
        regionLines.append(f" '{regionName}': [{quotedNames}]")
    parts.append(',\n'.join(regionLines))
    parts.append('}\n')

    # Key for fetching InstanceType objects by region.
    parts.append('\nec2InstancesByRegion = {region: [E2Instances[i] for i in instances] for region, instances in regionDict.items()}\n')
    return ''.join(parts)


def updateStaticEC2Instances() -> None:
    """
    Generates a new python file of fetchable EC2 Instances by region with current specs.

    Takes a few (~3+) minutes to run (you'll need decent internet).

    The AWS billing JSON is downloaded to 'index.json' next to this module
    (or reused if already present), the new list is written to a temporary
    file, and the temporary file then replaces 'generatedEC2Lists.py' in a
    single step. 'index.json' is deleted at the end.

    :return: Nothing. Writes a new 'generatedEC2Lists.py' file.
    :raises RuntimeError: If 'generatedEC2Lists.py' does not already exist.
    """
    print("Updating Toil's EC2 lists to the most current version from AWS's bulk API.\n"
          "This may take a while, depending on your internet connection (~1Gb file).\n")

    origFile = os.path.join(dirname, 'generatedEC2Lists.py')  # original
    if not os.path.exists(origFile):
        raise RuntimeError(f"Path {origFile} does not exist.")
    # use a temporary file until all info is fetched
    genFile = os.path.join(dirname, 'generatedEC2Lists_tmp.py')  # temp
    if os.path.exists(genFile):
        os.remove(genFile)

    # filepath to store the aws json request (will be cleaned up)
    # this is done because AWS changes their json format from time to time
    # and debugging is faster with the file stored locally
    awsJsonIndex = os.path.join(dirname, 'index.json')

    if not os.path.exists(awsJsonIndex):
        fetchEC2Index(filename=awsJsonIndex)
    else:
        print('Reusing previously downloaded json @: ' + awsJsonIndex)

    with open(awsJsonIndex) as f:
        awsProductDict = json.load(f)

    # Collect the distinct instance types across all regions, and the names
    # of the instance types available in each region.
    currentEC2List = []
    instancesByRegion: Dict[str, List[str]] = {}
    for regionNickname in EC2Regions:
        currentEC2Dict = fetchEC2InstanceDict(awsProductDict, region=EC2Regions[regionNickname])
        for instanceName, instanceTypeObj in currentEC2Dict.items():
            if instanceTypeObj not in currentEC2List:
                currentEC2List.append(instanceTypeObj)
            instancesByRegion.setdefault(regionNickname, []).append(instanceName)

    # provenance note, copyright and imports; [1:] drops the leading newline
    header = textwrap.dedent('''
        # !!! AUTOGENERATED FILE !!!
        # Update with: src/toil/utils/toilUpdateEC2Instances.py
        #
        # Copyright (C) 2015-{year} UCSC Computational Genomics Lab
        #
        # Licensed under the Apache License, Version 2.0 (the "License");
        # you may not use this file except in compliance with the License.
        # You may obtain a copy of the License at
        #
        #     http://www.apache.org/licenses/LICENSE-2.0
        #
        # Unless required by applicable law or agreed to in writing, software
        # distributed under the License is distributed on an "AS IS" BASIS,
        # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
        # See the License for the specific language governing permissions and
        # limitations under the License.
        from toil.lib.ec2nodes import InstanceType\n\n\n''').format(year=datetime.date.today().strftime("%Y"))[1:]

    with open(genFile, 'w') as f:
        f.write(header)
        f.write(_renderGeneratedEC2Lists(currentEC2List, instancesByRegion))

    # Swap the new list in over the original in one step. Deleting the
    # original first and renaming afterwards would leave no
    # generatedEC2Lists.py at all if the rename failed.
    os.replace(genFile, origFile)

    # delete the aws billing json file
    if os.path.exists(awsJsonIndex):
        os.remove(awsJsonIndex)