



/*

2019年网约车&巡游车原始数据介绍：
driver_id: 驾驶员的唯一标识，示例："AAAFSD"
deptime: 行程开始时间，示例："2019/6/17 7:18:51"
desttime: 行程结束时间, 示例："2019/6/17 7:48:43"
fare: 行程金额（元）
distance: 行程里程（公里）


结构化处理步骤如下：
step1: 根据开始、结束时间生成行程时长
step2: 将行程信息折算到与行程有交集的小时当中:
	如果行程完整在某小时内，行程的全部收入、时长、里程都计入这个小时
	如果行程跨小时，时长按照行程与每个小时交集的时长计入各个小时；收入和里程按照该交集时长占行程总时长的比例分摊到各个小时
step3: 生成每个行程在每个小时所计入的信息后，对每个驾驶员每小时的行程信息进行加总

*/




program data_structured_2019
	// Restructure the trip-level data in memory into driver-by-hour totals.
	//
	// Expects these variables in memory:
	//   driver_id  driver identifier
	//   deptime    trip start, string such as "2019/6/17 7:18:51"
	//   desttime   trip end, same string layout
	//   fare       trip fare
	//   distance   trip distance
	//
	// Leaves one row per driver_id x hour_variable ("YYYYMMDDHH") holding the
	// summed hour_income, hour_duration (hours) and hour_distance.

	// step 1: trip duration and gap to the driver's next trip

	* Parse the timestamp strings into Stata %tc values (milliseconds since
	* 01jan1960 00:00:00). They must be stored as double: the default float
	* type cannot hold millisecond datetimes exactly and would round 2019
	* timestamps by up to about a minute.
	gen double start_datetime = clock(deptime, "YMDhms")
	gen double end_datetime = clock(desttime, "YMDhms")
	format start_datetime end_datetime %tc

	* Trip duration in hours (3,600,000 ms per hour)
	gen double trip_duration = (end_datetime - start_datetime) / 3600000

	* Gap in hours between the end of a trip and the start of the same
	* driver's next trip; left missing for each driver's last trip.
	bysort driver_id (start_datetime): gen double trip_gap = (start_datetime[_n + 1] - end_datetime) / 3600000 if _n < _N

	* Drop trips with inconsistent timing: a trip that overlaps the driver's
	* next trip, a non-positive duration, or an unparseable timestamp.
	* (A missing trip_gap is not < 0 in Stata, so last trips are kept.)
	drop if trip_gap < 0
	drop if trip_duration <= 0 | missing(trip_duration)

	// step 2: split every trip across the clock hours it touches

	* Hour counters since 01jan1960 00:00:00, the %tc origin
	gen long start_hour_id = floor(start_datetime / 3600000)
	gen long end_hour_id = floor(end_datetime / 3600000)

	* One copy of the trip per touched hour. The temporary trip id is fixed
	* before expanding so the hour index is counted within a single trip.
	tempvar trip_id
	gen long `trip_id' = _n
	gen long expand_hour = end_hour_id - start_hour_id + 1
	expand expand_hour
	bysort `trip_id': gen long hour_index = _n - 1

	* Part of the trip inside each hour: the first slice starts at the trip
	* start, later slices at the hour boundary; the last slice ends at the
	* trip end, earlier slices at the next hour boundary.
	gen double hour_start = max(start_datetime, (start_hour_id + hour_index) * 3600000)
	gen double hour_end = min((start_hour_id + hour_index + 1) * 3600000, end_datetime)
	format hour_start hour_end %tc

	* Duration inside the hour, and fare/distance prorated by its share of
	* the whole trip duration
	gen double hour_duration = (hour_end - hour_start) / 3600000
	gen double hour_income = fare * (hour_duration / trip_duration)
	gen double hour_distance = distance * (hour_duration / trip_duration)

	// step 3: aggregate to driver x hour

	* Hour key as a 10-character string, YYYYMMDDHH
	gen hour_variable = string(hour_start, "%tcCCYYNNDDHH")

	* Sum the slices of every driver within every hour, then discard hours
	* with no recorded time or no income.
	collapse (sum) hour_income hour_duration hour_distance, by(driver_id hour_variable)
	drop if hour_duration == 0
	drop if hour_income == 0

end 





program data_flag_2019
	// Flag extreme driver-hour observations in the data in memory.
	//
	// Expects hour_income, hour_duration and hour_distance. Creates:
	//   hour_wage      hour_income / hour_duration
	//   hour_velocity  hour_distance / hour_duration
	//   flag           1 if any of the four checked variables lies above its
	//                  99.99th percentile, 0 otherwise
	// Only the upper tail is flagged; the other requested percentiles are
	// just displayed by -return list- for inspection.

	gen double hour_wage = hour_income / hour_duration
	gen double hour_velocity = hour_distance / hour_duration
	gen byte flag = 0

	foreach var in hour_income hour_distance hour_wage hour_velocity {
		* r(r8) is the 8th requested percentile, i.e. the 99.99th
		_pctile `var', p(0.01, 0.1, 0.5, 1, 99, 99.5, 99.9, 99.99)
		return list
		* Stata treats missing as larger than any number, so missing values
		* are excluded explicitly rather than being flagged as extreme.
		replace flag = 1 if `var' > r(r8) & !missing(`var')
	}

end




program merge_hour_data
	// Build a balanced driver x hour panel and attach work and weather data.
	//
	// Argument:
	//   filename  path, without the ".dta" extension, of a dataset holding
	//             driver_id, hour_variable ("YYYYMMDDHH" string) and wangyue
	//             (presumably a ride-hailing indicator; confirm with the
	//             data owner), plus the hourly work variables.
	//
	// Reads "weather.dta" from the working directory (keyed by hour_variable,
	// with a rain variable) and writes the helper files "id.dta" and
	// "hour_variable.dta" there. The resulting panel is left in memory.
	args filename

	* Distinct drivers. -use- takes the option clear (not replace) to discard
	* the data currently in memory.
	use "`filename'.dta", clear
	duplicates drop driver_id, force
	keep driver_id wangyue
	save "id.dta", replace

	* Distinct hours
	use "`filename'.dta", clear
	duplicates drop hour_variable, force
	keep hour_variable
	save "hour_variable.dta", replace

	* Every driver paired with every hour
	use "id.dta", clear
	cross using "hour_variable.dta"

	* Attach the driver's work record for the hour where one exists
	merge 1:1 driver_id hour_variable using "`filename'.dta"
	drop if _merge == 2
	rename _merge _merge_work

	* Attach weather and keep only hours that have a weather record
	merge m:1 hour_variable using "weather.dta"
	keep if _merge == 3
	rename _merge _merge_weather

	* Work indicator: the driver-hour matched a work record
	gen byte whether_work = _merge_work == 3

	* Rain indicator; left missing when rain is missing, because a missing
	* value would otherwise compare as greater than 0 and count as rain
	gen byte whether_rain = rain > 0 if !missing(rain)

	* Hour of day from the last two characters of the key
	gen byte hour = real(substr(hour_variable, -2, 2))

	* Day of week (0 = Sunday ... 6 = Saturday) and a Monday-Friday indicator
	gen week = dow(date(substr(hour_variable, 1, 8), "YMD"))
	gen byte weekday = week != 0 & week != 6

end
