# parser.py

import datetime
import os
import pandas as pd
import csv
import sys
from cloudscale.distributed_jmeter.scripts.visualization.SLO import SLO
from cloudscale.distributed_jmeter.scripts.visualization.converters import Converters


class Parse:
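    """Parse JMeter result CSVs and autoscaling event logs into the
    aggregated/merged files used by the visualization scripts."""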
    
    def __init__(self):
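        # SLO maps each monitored URL to its maximum allowed response time
        # (same unit as the recorded response times).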
        self.max_time = SLO

    def does_violate_slo(self, url, estimated_time, response_code):
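        """Return True if the response time exceeds the SLO for `url` or the
        response code is not 200, False otherwise.  When no SLO is defined for
        `url`, a message is printed and None (falsy) is returned."""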
        try:
            return not (int(estimated_time) < self.max_time[url] and response_code == 200)
        except KeyError:
            print "There's no SLO for %s" % url


    def get_instances_lifetime(self, as_file):
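        """Read the autoscaling event CSV (instance_id,start_time,end_time,action)
        and return a list of dicts with 'id', 'start_time' and 'end_time'.

        The end_time of a 'launch' event marks the start of an instance's life
        and the start_time of a 'terminate' event marks its end.  Lifetimes are
        clamped to the time range of self.data, so to_dataframe() must have
        been called first."""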
        instance_ids = []
        instances = []

        with open(as_file) as fp:
            next(fp) # skip the header
            for line in fp:
                line = line.rstrip('\n')  # strip the trailing newline
                instance_id, start_time, end_time, action = line.split(',')

                if action == 'launch' and instance_id not in instance_ids:
                    instance = {}
                    instance['id'] = instance_id
                    instance['start_time'] = datetime.datetime.strptime(end_time, "%Y-%m-%d %H:%M:%S")
                    instances.append(instance)
                    instance_ids.append(instance_id)

                if action == 'terminate' and instance_id in instance_ids:
                    for instance in instances:
                        if instance['id'] == instance_id and 'end_time' not in instance:
                            instance['end_time'] = datetime.datetime.strptime(start_time, "%Y-%m-%d %H:%M:%S")
                            break
        # Clamp lifetimes to the time range covered by self.data.
        min_date = self.data['date'].min().to_datetime()
        max_date = self.data['date'].max().to_datetime()
        for instance in instances:
            if instance['start_time'] < min_date:
                instance['start_time'] = min_date

            if 'end_time' not in instance:
                instance['end_time'] = max_date
        return instances

    def delete_records_that_violates_slo(self, output_file, input_file):
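        """Write only the records from input_file that violate their SLO to
        output_file (despite the name, nothing is deleted from input_file)."""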
        converters = Converters()
        with open(output_file, 'w') as slo_fp:
            slo_fp.write('"date","response_time","url","response_code","status","autoscalable","instance_id"\n')
            with open(input_file) as fp:
                next(fp) # skip header
                for line in fp:
                    timestamp, estimated_time, url, response_code, status, attr1, attr2  = line.split(",")
                    response_code = converters.response_code_converter(response_code)
                    if self.does_violate_slo(url, estimated_time, response_code):
                        slo_fp.write('%s,%s,%s,%s,%s,%s,%s' % (timestamp, estimated_time, url, response_code, status, attr1, attr2))


    def slo_agg_1minute(self, output_file, file):
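        """Aggregate the parsed records into one-minute buckets (epoch ms
        truncated to the minute) and write per-bucket counts of SLO-violating
        requests ('num') and of all requests ('num_all_requests')."""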
        converters = Converters()
        with open(output_file, 'w') as slo_fp:
            with open(file) as fp:
                next(fp) # skip header
                timestamps = {}
                for line in fp:
                    timestamp, estimated_time, url, response_code, status, attr1, attr2  = line.split(",")
                    #df = datetime.datetime.strptime(datetime.datetime.fromtimestamp(float(timestamp)/1000).strftime("%Y-%m-%d %H:%M"), "%Y-%m-%d %H:%M") # remove microseconds from time
                    response_code = converters.response_code_converter(response_code)
                    new_timestamp = int(float(timestamp)/60000)*60000
                    if new_timestamp not in timestamps:
                        timestamps[new_timestamp] = {'num': 0, 'num_all_requests': 0}

                    timestamps[new_timestamp]['num_all_requests'] += 1

                    if self.does_violate_slo(url, estimated_time, response_code):
                        timestamps[new_timestamp]['num'] += 1


            slo_fp.write('"date","num","num_all_requests"\n')
            for timestamp in timestamps.keys():
                slo_fp.write('%s,%s,%s\n' % (timestamp, timestamps[timestamp]['num'],timestamps[timestamp]['num_all_requests']))

    def slo_agg_seconds(self, parsed_file, output_file, seconds):
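        """Like slo_agg_1minute, but with a caller-supplied window of `seconds`.
        Records are sorted by timestamp first; the output dates are offsets in
        milliseconds relative to the earliest record."""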
        print "Seconds: %s" % seconds
        converters = Converters()
        with open(output_file, 'w') as slo_fp:
            with open(parsed_file) as fp:
                next(fp) # skip header
                parsed_file_data = csv.reader(fp)
                sorted_data = sorted(parsed_file_data, key = lambda row: int(row[0]))

                timestamps = {}
                ref_timestamp, _, _, _, _, _, _  = sorted_data[0]
                ref_timestamp = (int(ref_timestamp)/1000)
                timestamps[ref_timestamp] = {'num' : 0, 'num_all_requests': 0}

                min_date = sys.maxint
                for line in sorted_data:
                    timestamp, estimated_time, url, response_code, status, attr1, attr2 = line
                    timestamp = (int(timestamp)/1000)
                    if timestamp < min_date:
                        min_date = timestamp
                    response_code = converters.response_code_converter(response_code)

                    time_delta = datetime.datetime.fromtimestamp(timestamp) - datetime.datetime.fromtimestamp(ref_timestamp)
                    # print "time_delta: %s" % time_delta.seconds
                    # print "time_delta: %s" % datetime.datetime.fromtimestamp(timestamp)
                    if time_delta.total_seconds() >= seconds:
                        # print "new ref timestamp: %s" % datetime.datetime.fromtimestamp(timestamp)
                        ref_timestamp = timestamp
                        if ref_timestamp not in timestamps:
                            timestamps[ref_timestamp] = {'num': 0, 'num_all_requests': 0}

                    timestamps[ref_timestamp]['num_all_requests'] += 1

                    if self.does_violate_slo(url, estimated_time, response_code):
                        timestamps[ref_timestamp]['num'] += 1

                print min_date
                slo_fp.write('"date","num","num_all_requests"\n')
                slo_fp.write('%s,%s,%s\n' % (0, 0, 0))
                for timestamp in sorted(timestamps):
                    timestamp_subtract = (timestamp - min_date) + seconds
                    slo_fp.write('%s,%s,%s\n' % (timestamp_subtract*1000, timestamps[timestamp]['num'], timestamps[timestamp]['num_all_requests']))

    def _find_min_date(self, data):
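        """Return the smallest timestamp (first column) found in `data`."""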
        min_date = sys.maxint
        for row in data:
            if int(row[0]) < min_date:
                min_date = int(row[0])

        return min_date

    def merge(self, output_file, as_file, file):
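        """Merge parsed request records with the instance lifetimes from as_file:
        every record that falls within an instance's lifetime is written to
        output_file together with the instance id and a per-instance y offset
        (-5, -10, ...) used for plotting."""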

        response_time_stack = []
        epoch = datetime.datetime(1970,1,1)
        with open(output_file, 'w') as m_fp:
            m_fp.write('"date","response_time","url","response_code","status","attr1","attr2","instance_id","y"\n')
            if os.path.exists(as_file):
                instances = self.get_instances_lifetime(as_file)
                i = -5
                for instance in instances:
                    instance_id = instance['id']
                    with open(file) as fp:
                        next(fp) # skip the header
                        already_got_max_response_time = False
                        for line in fp:
                            # if already_got_max_response_time is False: # do this only the first time in loop
                            #     start_time = instance['start_time']
                            #     end_time = instance['end_time'] if instance.has_key('end_time') and instance['end_time'] < self.data['date'].max().to_datetime() else self.get_end_time(instance_id, instances)
                            #     max_response_time_to_as_dt = self.data_indexed.between_time(start_time, end_time).max()['response_time']
                            #     already_got_max_response_time = True

                            timestamp, estimated_time, url, response_code, status, attr1, attr2  = line.split(",")
                            dt = datetime.datetime.fromtimestamp(int(timestamp)/1000.0)
                            if dt >= instance['start_time'] and dt <= instance['end_time']:
                                m_fp.write('%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (timestamp, estimated_time, url, response_code, status, attr1, attr2[:-1], instance_id, i))
                    i-=5

    def timestamp_to_datetime_file(self):
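        """Write self.file (expected to be set by the caller) as a tab-separated
        .tab file with a header row and a column-type row, converting each
        timestamp to an HH:MM string."""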
        with open('files/response-times-over-time.trans.tab', 'w') as fp_out:
            fp_out.write('timestamp\tdate\tresponse_time\turl\tresponse_code\tstatus\n')
            fp_out.write('c\td\tc\ts\td\td\n')
            with open(self.file) as fp:
                next(fp) # skip the header
                for line in fp:
                    ts, curr_dt, response_time, url, response_code, status = self.parse_line(line)
                    fp_out.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (ts, curr_dt.strftime('%H:%M'), response_time, url, response_code, status))

    def get_end_time(self, instance_id, instances):
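        """Return the start_time of the instance launched after instance_id, or
        the latest date in self.data if it was the last one."""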
        i = 0
        for instance in instances:
            if instance['id'] == instance_id:
                break
            i+=1
        try:
            end_time = instances[i+1]['start_time']
            return end_time
        except IndexError:
            return self.data['date'].max().to_datetime()

    def parse_line(self, line):
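        """Split a raw result line and return (timestamp, datetime,
        response_time, url, response_code, status)."""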
        ts, response_time, url, response_code, status, _, _ = line.split(",")
        dt = datetime.datetime.fromtimestamp(int(ts)/1000.0)
        rc = self.parse_response_code(response_code)

        return ts, dt, int(response_time), str(url), rc, str(status)

    def parse_response_code(self, rc):
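        """Return the response code as an int, or 500 if it cannot be parsed."""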
        try:
            return int(rc)
        except (ValueError, TypeError):
            return 500

    def parse(self, output_file, file):
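        """Normalize the raw results in `file` into output_file, converting the
        response code and URL; rows whose URL the converter does not recognize
        (url_converter returns None) are dropped."""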
        with open(file) as fp:
            next(fp) # skip the header
            with open(output_file, "w") as parsed_fp:
                parsed_fp.write("date,response_time,url,response_code,status,attr1,attr2\n")
                converters = Converters()
                for line in fp:
                    timestamp, estimated_time, url, response_code, status, attr1, attr2  = line.split(",")
                    response_code = converters.response_code_converter(response_code)
                    url = converters.url_converter(url)
                    if url is not None:
                        parsed_fp.write('%s,%s,%s,%s,%s,%s,%s' % (timestamp, estimated_time, url, response_code, status, attr1, attr2))

    def to_dataframe(self, file, as_file):
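        """Load the parsed CSV into pandas DataFrames: self.data_indexed
        (indexed by date) and self.data, both with converted dates, response
        codes and URLs.  The as_file argument is currently unused."""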


        print "Parsing " + file
        converters = Converters()
        self.data_indexed = pd.read_csv(file, index_col='date', converters={
            'date' : converters.timestamp_converter,
            'response_code' : converters.response_code_converter,
            'url' : converters.url_converter
        })

        self.data = pd.read_csv(file, converters={
            'date' : converters.timestamp_converter,
            'response_code' : converters.response_code_converter,
            'url' : converters.url_converter
        })


        # print "Parsing " + as_file
        # self.autoscalability_data = pd.read_csv(as_file, converters={
        #     'start_time' : converters.datetime_to_timestamp,
        #     'end_time' : converters.datetime_to_timestamp,
        #     'action' : converters.action_to_number
        # })
        return

    def merge_autoscaling(self, file1, file2):
        pass
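
# A minimal usage sketch (the file paths below are hypothetical examples,
# not part of this module): parse() normalizes the raw results,
# to_dataframe() loads them into pandas, and the remaining methods derive
# the aggregated/merged files from the parsed CSV.
#
#     p = Parse()
#     p.parse('files/parsed.csv', 'files/response-times-over-time.csv')
#     p.to_dataframe('files/parsed.csv', 'files/autoscaling.csv')
#     p.merge('files/merged.csv', 'files/autoscaling.csv', 'files/parsed.csv')
#     p.slo_agg_seconds('files/parsed.csv', 'files/slo-agg.csv', 60)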