---------------------------------------------------------------------------------------------------------------------------------------------------
      name:  <unnamed>
       log:  /Users/yangyanjun/Desktop/exchangerate/提交版本/logs/data_clean_log.txt
  log type:  text
 opened on:  26 Jul 2025, 14:20:55

.   do "$dofiles/data clean.do"

. 
. * ================================================
. *-  data clean
. * ================================================
. 
.   display "开始数据清洗..."
开始数据清洗...

. 
.   
. **#A.HS编码版本转换数据准备及识别需要删除的大宗商品
. 
. /*------------------------------------------------------------------------------
>  * A.HS编码版本转换数据准备及识别需要删除的大宗商品
>  * 目的：将不同年份使用的HS编码统一转换为1996年版本，并识别大宗商品
>  *------------------------------------------------------------------------------*/
. 
.   *------------------A.1 HS编码版本转换数据准备-----------------*     
. 
.   import excel "$R/HS、SITC、BEC各版本转换/HS2002 to HS1996 - Correlation and conversion tables.xls", sheet("Conversion Table") firstrow clear
(2 vars, 5,321 obs)

.   drop in 1
(1 observation deleted)

.   drop if From ==""
(97 observations deleted)

.   rename From hs6 

.   rename To   hs6_1996

.   save "$D/hs2002-2006 to hs1996.dta", replace
file /Users/yangyanjun/Desktop/exchangerate/提交版本/derived data/hs2002-2006 to hs1996.dta saved

. 
.   import excel "$R/HS、SITC、BEC各版本转换/HS 2007 to HS 1996 Correlation and conversion tables.xls", sheet("Conversion Tables") firstrow clear
(2 vars, 5,053 obs)

.   drop in 1
(1 observation deleted)

.   drop if From ==""
(0 observations deleted)

.   rename From hs6 

.   rename To   hs6_1996

.   save "$D/hs2007-2011 to hs1996.dta", replace
file /Users/yangyanjun/Desktop/exchangerate/提交版本/derived data/hs2007-2011 to hs1996.dta saved

. 
.   import excel "$R/HS、SITC、BEC各版本转换/HS 2012 to HS 1996 Correlation and conversion tables.xls", sheet("Sheet1") firstrow clear
(2 vars, 5,206 obs)

.   rename HS2012 hs6 

.   rename HS1996 hs6_1996

.   save "$D/hs2012-2013 to hs1996.dta", replace
file /Users/yangyanjun/Desktop/exchangerate/提交版本/derived data/hs2012-2013 to hs1996.dta saved

. 
.   *------------------A.2 识别需要删除的大宗商品-----------------*
. 
.   import delimited "/$R/Concordance_H1_to_I3的副本/JobID-19_Concordance_H1_to_I3.CSV", stringcols(1 3) clear
(encoding automatically selected: ISO-8859-1)
(4 vars, 5,113 obs)

.   drop hs1996productdescription isicrevision3productdescription

.   rename hs1996productcode hs6 

.   rename isicrevision3productcode isic

.   save "$D/hs1996 to ISIC_ver3.dta", replace
file /Users/yangyanjun/Desktop/exchangerate/提交版本/derived data/hs1996 to ISIC_ver3.dta saved

. 
.   import delimited "/$R/Concordance_H1_to_I3的副本/JobID-19_Concordance_H1_to_I3.CSV", stringcols(1 3) clear
(encoding automatically selected: ISO-8859-1)
(4 vars, 5,113 obs)

.   rename hs1996productcode hs6 

.   rename isicrevision3productcode isic

.   gen isic2=substr(isic,1,2)

.   gen Bulk_commodity = .
(5,113 missing values generated)

.   replace Bulk_commodity = 1 if isic2=="23" | isic2=="27"
(379 real changes made)

.   drop hs1996productdescription isicrevision3productdescription

.   drop if Bulk_commodity == .
(4,734 observations deleted)

.   drop isic isic2

.   save "$D/需要删除的大宗商品代码.dta", replace
file /Users/yangyanjun/Desktop/exchangerate/提交版本/derived data/需要删除的大宗商品代码.dta saved

. 
. **#B.主数据集处理与HS编码统一
. 
. /*------------------------------------------------------------------------------
>  * B.主数据集处理与HS编码统一
>  *------------------------------------------------------------------------------*/
. 
.   *------------------B.1主数据集加载与基础处理-----------------*
. 
.   use "$R/imp2000-2013.dta", clear

. 
.   egen firm =group(party_id)

.   label var firm "重新编码party_id"

.   order year company party_id firm

. 
.   merge m:1 origin_id year using "$R/汇率及ppi数据.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                        98,943
        from master                    98,757  (_merge==1)
        from using                        186  (_merge==2)

    Matched                        39,893,070  (_merge==3)
    -----------------------------------------

.   drop if _merge==2
(186 observations deleted)

.   drop _merge

.   label variable hs_id "8位产品代码"

. 
.   gen hs6=substr(hs_id,1,6)           

.   label var hs6 "6位产品hs代码"

.   gen hs4=substr(hs_id,1,4)

.   label var hs4 "4位产品hs代码"

.   gen hs2=substr(hs_id,1,2)

.   label var hs2 "2位产品hs代码"

. 
.   drop  if regexm(hs_id,"[^0-9.]")   //将有问题的hs码的数据删除掉
(78 observations deleted)

.   merge m:1 year using "$R/人民币兑美元汇率.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                             8
        from master                         0  (_merge==1)
        from using                          8  (_merge==2)

    Matched                        39,991,749  (_merge==3)
    -----------------------------------------

.   drop if _merge==2                   //删除多余年份数据
(8 observations deleted)

.   gen valuermb = value * e2

.   drop e2 e1_欧元区 eurozone _merge

.   order year party_id firm hs_id hs6 hs4 hs2 value valuermb

. 
.   *------------------B.2 数据筛选-----------------*
. 
.   drop if 国家 == "中国"         
(2,624,269 observations deleted)

.   drop if origin_id == .   
(62 observations deleted)

. 
.   *------------------B.3 HS编码统一为1996年版本-----------------*
. 
.   merge m:1 hs6 using "$D/hs2002-2006 to hs1996.dta"
(variable hs6 was str6, now str7 to accommodate using data's values)

    Result                      Number of obs
    -----------------------------------------
    Not matched                     1,554,427
        from master                 1,554,381  (_merge==1)
        from using                         46  (_merge==2)

    Matched                        35,813,037  (_merge==3)
    -----------------------------------------

.   drop if _merge==2     
(46 observations deleted)

.   replace hs6_1996 = hs6 if _merge==1       
(1,554,381 real changes made)

.   replace hs6_1996 ="." if year>2006 | year < 2002
(25,150,420 real changes made)

.   replace hs6 =  hs6_1996 if  hs6 != hs6_1996 & year==2002 
(101,323 real changes made)

.   replace hs6 =  hs6_1996 if  hs6 != hs6_1996 & year==2003 
(112,532 real changes made)

.   replace hs6 =  hs6_1996 if  hs6 != hs6_1996 & year==2004 
(127,100 real changes made)

.   replace hs6 =  hs6_1996 if  hs6 != hs6_1996 & year==2005 
(128,023 real changes made)

.   replace hs6 =  hs6_1996 if  hs6 != hs6_1996 & year==2006
(155,455 real changes made)

.   drop hs6_1996 _merge

. 
.   merge m:1 hs6 using "$D/hs2007-2011 to hs1996.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                     1,457,585
        from master                 1,457,503  (_merge==1)
        from using                         82  (_merge==2)

    Matched                        35,909,915  (_merge==3)
    -----------------------------------------

.   drop if _merge==2     
(82 observations deleted)

.   replace hs6_1996 = hs6 if _merge==1       
(1,457,503 real changes made)

.   replace hs6_1996 ="." if year>2011 | year < 2007
(22,545,030 real changes made)

.   replace hs6 =  hs6_1996 if  hs6 != hs6_1996 & year==2007 
(253,548 real changes made)

.   replace hs6 =  hs6_1996 if  hs6 != hs6_1996 & year==2008 
(265,952 real changes made)

.   replace hs6 =  hs6_1996 if  hs6 != hs6_1996 & year==2009 
(274,365 real changes made)

.   replace hs6 =  hs6_1996 if  hs6 != hs6_1996 & year==2010 
(308,541 real changes made)

.   replace hs6 =  hs6_1996 if  hs6 != hs6_1996 & year==2011
(323,756 real changes made)

.   drop hs6_1996 _merge

. 
.   merge m:1 hs6 using "$D/hs2012-2013 to hs1996.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                     2,884,359
        from master                 2,884,185  (_merge==1)
        from using                        174  (_merge==2)

    Matched                        34,483,233  (_merge==3)
    -----------------------------------------

.   drop if _merge==2     
(174 observations deleted)

.   replace hs6_1996 = hs6 if _merge==1       
(2,884,185 real changes made)

.   replace hs6_1996 ="." if year < 2012
(30,636,399 real changes made)

.   replace hs6 =  hs6_1996 if  hs6 != hs6_1996 & year==2012 
(341,988 real changes made)

.   replace hs6 =  hs6_1996 if  hs6 != hs6_1996 & year==2013
(346,054 real changes made)

.   drop hs6_1996 _merge

. 
.   *------------------B.4 删除大宗商品-----------------*
. 
.   merge m:1 hs6 using "$D/需要删除的大宗商品代码.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                    35,997,460
        from master                35,997,456  (_merge==1)
        from using                          4  (_merge==2)

    Matched                         1,369,962  (_merge==3)
    -----------------------------------------

.   drop if _merge==2     
(4 observations deleted)

.   drop if _merge==3        
(1,369,962 observations deleted)

.   drop Bulk_commodity _merge

. 
.   drop hs4 hs2

.   gen hs4=substr(hs6,1,4)

.   label var hs4 "4位产品hs代码"

.   gen hs2=substr(hs6,1,2)

.   label var hs2 "2位产品hs代码"

.   order year party_id firm hs_id hs6 hs4 hs2

.   sort year firm

. 
.   save "$D/离散度指标构建数据.dta", replace      //统一编码并删除大宗商品后的数据
file /Users/yangyanjun/Desktop/exchangerate/提交版本/derived
    data/离散度指标构建数据.dta saved

. 
. **#C.集中度指标构建与回归数据准备
. /*------------------------------------------------------------------------------
>  * C.集中度指标构建与回归数据准备
>  * 目的：构建赫芬达尔指数并准备回归分析所需的面板数据
>  *------------------------------------------------------------------------------*/
. 
.   *------------------C.1 构建集中度指标所用数据-基于hs6位编码-----------------*
. 
.   use "$D/离散度指标构建数据.dta", clear 

.   bys firm year: egen double totalvaluermb=sum(valuermb)       //使用企业总进口额数
> 据计算行业的份额

.   label var totalvaluermb "企业每年人民币计价进口额"

.   duplicates drop year hs6 firm,force 

Duplicates in terms of year hs6 firm

(13,709,682 observations deleted)

.   hhi5 totalvaluermb,   by (hs6 year) percentage prefix(hhi5)  //percentage表示将得
> 到的系数扩大1万倍，prefix生成新变量 

.   collapse (mean) hhi5_totalvaluermb, by(year hs6)

.   rename hhi5_totalvaluermb    hhi_hs6

.   label var hhi_hs6 "基于hs6进口额计算"

.   save "$W/赫芬达尔指数_hs6.dta", replace
file /Users/yangyanjun/Desktop/exchangerate/提交版本/working
    data/赫芬达尔指数_hs6.dta saved

. 
.   *------------------C.2 构建集中度指标所用数据-基于hs4位编码-----------------* 
. 
.   use "$D/离散度指标构建数据.dta", clear 

.   bys firm year: egen double totalvaluermb=sum(valuermb)       //使用企业总进口额数
> 据计算行业的份额

.   label var totalvaluermb "企业每年人民币计价进口额"

.   duplicates drop year hs4 firm,force 

Duplicates in terms of year hs4 firm

(20,123,152 observations deleted)

.   hhi5 totalvaluermb,   by (hs4 year) percentage prefix(hhi5)  //percentage表示将得
> 到的系数扩大1万倍，prefix生成新变量

.   collapse (mean) hhi5_totalvaluermb, by(year hs4)

.   rename hhi5_totalvaluermb    hhi_hs4

.   label var hhi_hs4 "基于hs4进口额计算"

.   save "$W/赫芬达尔指数_hs4.dta", replace
file /Users/yangyanjun/Desktop/exchangerate/提交版本/working
    data/赫芬达尔指数_hs4.dta saved

. 
. **#D.生成回归所需要的变量
. 
. /*------------------------------------------------------------------------------
>  * D.生成回归所需要的变量
>  *------------------------------------------------------------------------------*/
. 
.   use "$D/离散度指标构建数据.dta", clear 

.   bys firm year hs4 origin_id: egen totalvalue4rmb=sum(valuermb)       

.   label var totalvalue4rmb "4位hs编码人民币计价进口额"

.   bys firm year hs4 origin_id: egen totalquantity4 =sum(quantity)

.   label var totalquantity4  "根据4位hs码加总的货物数量"

. 
.   duplicates drop firm year hs4 origin_id,force                     

Duplicates in terms of firm year hs4 origin_id

(11,004,989 observations deleted)

. 
.   drop  value valuermb quantity exp_imp                             

.   rename totalvalue4rmb value

.   rename totalquantity4  quantity

. 
.   gen       price = value / quantity
(626,945 missing values generated)

.   label var price "以人民币计价的商品单价"

.   gen       ln_price = log(price)
(626,945 missing values generated)

.   label var ln_price "产品价格对数"

.   gen       ln_ppi   = log(ppi)
(258,582 missing values generated)

.   label var ln_ppi   "ppi对数"

.   gen       ln_e     = log(e)
(68,196 missing values generated)

.   label var ln_e     "汇率对数"

. 
.   drop if   price  ==.
(626,945 observations deleted)

.   drop if   ppi    ==.
(250,360 observations deleted)

.   drop if   e      ==.
(701 observations deleted)

. 
.   egen double firm_origin_id_hs4 = group(firm origin_id hs4)

.   xtset  firm_origin_id_hs4 year

Panel variable: firm_origin_id_hs4 (unbalanced)
 Time variable: year, 2000 to 2013, but with gaps
         Delta: 1 unit

.   gen d_lnprice= d.ln_price
(13,317,745 missing values generated)

.   gen d_lnppi  = d.ln_ppi
(13,317,745 missing values generated)

.   gen d_lne    = d.ln_e
(13,317,745 missing values generated)

.   label var d_lnprice "对数价格的差分值(相邻两期)"

.   label var d_lnppi   "对数ppi的差分值(相邻两期)"

.   label var d_lne     "对数汇率的差分值(相邻两期)"

. 
.   drop ln_price ln_ppi ln_e firm_origin_id_hs4

.   drop if d_lnprice ==.  
(13,317,745 observations deleted)

. 
.   merge m:1 year hs6 using "$W/赫芬达尔指数_hs6.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                         7,825
        from master                         0  (_merge==1)
        from using                      7,825  (_merge==2)

    Matched                        10,796,716  (_merge==3)
    -----------------------------------------

.   rename _merge _merge1

.   merge m:1 year hs4 using "$W/赫芬达尔指数_hs4.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                         9,544
        from master                     7,825  (_merge==1)
        from using                      1,719  (_merge==2)

    Matched                        10,796,716  (_merge==3)
    -----------------------------------------

.   drop if firm ==.
(9,544 observations deleted)

.   drop _merge _merge1

.   
.   egen  double firm_origin_id_hs4 = group(firm origin_id hs4)

.   xtset firm_origin_id_hs4 year

Panel variable: firm_origin_id_hs4 (unbalanced)
 Time variable: year, 2001 to 2013, but with gaps
         Delta: 1 unit

.   destring hs_id hs6 hs4 hs2,replace
hs_id: all characters numeric; replaced as long
hs6: all characters numeric; replaced as long
hs4: all characters numeric; replaced as int
hs2: all characters numeric; replaced as byte

. 
.   replace hhi_hs4=hhi_hs4/10000       //将其变为原倍数
(10,796,716 real changes made)

.   replace hhi_hs6=hhi_hs6/10000
(10,796,716 real changes made)

. 
.   save "$W/final_data.dta", replace     
file /Users/yangyanjun/Desktop/exchangerate/提交版本/working data/final_data.dta
    saved

. 
.  *---------------------------------------------------------------------------------
> -----------------*/
. 
. 
. **#E.区分内销型企业和外贸型企业数据清洗
. 
. /*------------------------------------------------------------------------------
>  * E.区分内销型企业和外贸型企业数据清洗
>  * 目的：区分内销型企业和外贸型企业数据清洗
>  *------------------------------------------------------------------------------*/
. 
.   use "$R/hg2000-2013.dta",clear

.   keep if exp_imp == "0"    //挑选出口的企业
(39,726,026 observations deleted)

.   drop origin_id hs_id value quantity exp_imp year company

.   gen exp = "1" 

.   duplicates drop party_id, force

Duplicates in terms of party_id

(92,804,220 observations deleted)

.   save "$D/有出口的企业.dta", replace     
file /Users/yangyanjun/Desktop/exchangerate/提交版本/derived data/有出口的企业.dta
    saved

.   
.   use "$R/hg2000-2013.dta",clear

.   keep if exp_imp == "1"    //挑选进口的企业
(93,324,008 observations deleted)

.   drop origin_id hs_id value quantity exp_imp year company

.   gen imp = "1" 

.   duplicates drop party_id, force

Duplicates in terms of party_id

(39,302,370 observations deleted)

.   save "$D/有进口的企业.dta", replace     
file /Users/yangyanjun/Desktop/exchangerate/提交版本/derived data/有进口的企业.dta
    saved

.   
.   use "$R/hg2000-2013.dta",clear

.   merge m:1 party_id using "$D/有出口的企业.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                     3,026,855
        from master                 3,026,855  (_merge==1)
        from using                          0  (_merge==2)

    Matched                       130,023,179  (_merge==3)
    -----------------------------------------

.   drop _merge

.   merge m:1 party_id using "$D/有进口的企业.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                    21,387,282
        from master                21,387,282  (_merge==1)
        from using                          0  (_merge==2)

    Matched                       111,662,752  (_merge==3)
    -----------------------------------------

.   drop _merge

.   gen     types = "0" if exp == "1"     & imp == "1"         //即进口也出口的企业  
>             
(24,414,137 missing values generated)

.   replace types = "1" if missing(types) & imp == "1"   //纯进口企业
(3,026,855 real changes made)

.   replace types = "2" if missing(types) & exp == "1"   //纯出口企业        
(21,387,282 real changes made)

.   label var types "=0：即进口也出口，=1:纯进口，=2:纯出口"       

. 
.   duplicates drop party_id, force

Duplicates in terms of party_id

(132,390,475 observations deleted)

.   drop year origin_id hs_id value quantity exp_imp

.   save "$D/企业代码及类型.dta", replace  
file /Users/yangyanjun/Desktop/exchangerate/提交版本/derived
    data/企业代码及类型.dta saved

. 
. 
. **#F.附录II数据清洗
. 
. /*------------------------------------------------------------------------------
>  * F.附录II数据清洗
>  * 考虑国际竞争数据清洗
>  *------------------------------------------------------------------------------*/
. 
.   *------------------F1.合并原始数据-----------------*
.   /* 创建一个空的主数据集 */
.   clear

.   gen t = .

.   gen k = .

.   gen i = .

.   gen j = .

.   gen v = .

.   gen q = .

.   gen year = .

.   save "$R/combined_data.dta", replace
(dataset contains 0 observations)
file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/combined_data.dta
    saved

.   clear

. 
.   /* 循环处理2000-2013年的数据 */
.   forvalues y = 2000/2013 {
  2.     /* 显示进度信息 */
.     display "Processing year `y'..."
  3.     
.     import delimited "$R/BACI_HS96_V202501/BACI_HS96_Y`y'_V202501.csv", clear
  4.     gen year = `y'
  5.     save "$R/temp_`y'.dta", replace
  6.   
.     /* 追加到主数据集 */
.     use "$R/combined_data.dta", clear
  7.     append using "$R/temp_`y'.dta"
  8.     save "$R/combined_data.dta", replace
  9.     erase "$R/temp_`y'.dta"
 10.   }
Processing year 2000...
(encoding automatically selected: ISO-8859-2)
(6 vars, 7,178,886 obs)
(file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/temp_2000.dta not
    found)
file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/temp_2000.dta saved
(variable k was float, now double to accommodate using data's values)
file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/combined_data.dta
    saved
Processing year 2001...
(encoding automatically selected: ISO-8859-2)
(6 vars, 7,487,445 obs)
(file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/temp_2001.dta not
    found)
file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/temp_2001.dta saved
file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/combined_data.dta
    saved
Processing year 2002...
(encoding automatically selected: ISO-8859-2)
(6 vars, 7,764,286 obs)
(file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/temp_2002.dta not
    found)
file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/temp_2002.dta saved
file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/combined_data.dta
    saved
Processing year 2003...
(encoding automatically selected: ISO-8859-2)
(6 vars, 8,030,461 obs)
(file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/temp_2003.dta not
    found)
file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/temp_2003.dta saved
file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/combined_data.dta
    saved
Processing year 2004...
(encoding automatically selected: ISO-8859-2)
(6 vars, 8,421,357 obs)
(file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/temp_2004.dta not
    found)
file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/temp_2004.dta saved
file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/combined_data.dta
    saved
Processing year 2005...
(encoding automatically selected: ISO-8859-2)
(6 vars, 8,753,746 obs)
(file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/temp_2005.dta not
    found)
file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/temp_2005.dta saved
file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/combined_data.dta saved
Processing year 2006...
(encoding automatically selected: ISO-8859-2)
(6 vars, 9,105,753 obs)
(file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/temp_2006.dta not found)
file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/temp_2006.dta saved
file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/combined_data.dta saved
Processing year 2007...
(encoding automatically selected: ISO-8859-2)
(6 vars, 9,279,758 obs)
(file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/temp_2007.dta not found)
file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/temp_2007.dta saved
file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/combined_data.dta saved
Processing year 2008...
(encoding automatically selected: ISO-8859-2)
(6 vars, 9,476,841 obs)
(file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/temp_2008.dta not found)
file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/temp_2008.dta saved
file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/combined_data.dta saved
Processing year 2009...
(encoding automatically selected: ISO-8859-2)
(6 vars, 9,354,828 obs)
(file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/temp_2009.dta not found)
file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/temp_2009.dta saved
file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/combined_data.dta saved
Processing year 2010...
(encoding automatically selected: ISO-8859-2)
(6 vars, 9,611,705 obs)
(file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/temp_2010.dta not found)
file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/temp_2010.dta saved
file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/combined_data.dta saved
Processing year 2011...
(encoding automatically selected: ISO-8859-2)
(6 vars, 9,784,574 obs)
(file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/temp_2011.dta not found)
file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/temp_2011.dta saved
file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/combined_data.dta saved
Processing year 2012...
(encoding automatically selected: ISO-8859-2)
(6 vars, 10,083,252 obs)
(file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/temp_2012.dta not found)
file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/temp_2012.dta saved
file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/combined_data.dta saved
Processing year 2013...
(encoding automatically selected: ISO-8859-2)
(6 vars, 10,248,305 obs)
(file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/temp_2013.dta not found)
file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/temp_2013.dta saved
file /Users/yangyanjun/Desktop/exchangerate/提交版本/raw data/combined_data.dta saved

. 
.   *------------------F2.指标构建-----------------*
.   clear

.   use "$R/combined_data.dta", clear 

. 
.   label variable t "Year"

.   label variable k "Product code" 

.   label variable i "Exporter country code"

.   label variable j "Importer country code"

.   label variable v "Trade value"

.   label variable q "Trade quantity"

. 
.   duplicates report i j k t v q

Duplicates in terms of i j k t v q

--------------------------------------
   Copies | Observations       Surplus
----------+---------------------------
        1 |    124581197             0
--------------------------------------

.   duplicates drop i j k t v q, force    // 如果存在完全相同的记录，删除重复项

Duplicates in terms of i j k t v q

(0 observations are duplicates)

. 
.     /* 构建全球竞争指标 */
.   * 第1步：计算每个国家j对产品k在t年的总进口额
.   bysort j k t: egen total_import_jkt = sum(v)

.   
.   by j k t: keep if _n == 1
(113,212,401 observations deleted)

.   keep j k t total_import_jkt

.   
.   * 第2步：计算产品k在t年的全球总进口额
.   bysort k t: egen global_import_kt = sum(total_import_jkt)

. 
.   * 第3步：计算每个国家j在产品k全球市场中的进口份额
.   gen market_share_jkt = total_import_jkt / global_import_kt

.   
.   bysort k t: egen sum_shares = sum(market_share_jkt)

.   summarize sum_shares

    Variable |        Obs        Mean    Std. dev.       Min        Max
-------------+---------------------------------------------------------
  sum_shares | 11,368,796           1    2.24e-08   .9999999          1

. 
.   * 第4步：计算HHI指数(按产品k和年份t)
.   gen squared_share = market_share_jkt^2

.   bysort k t: egen HHI_kt = sum(squared_share)

. 
.   * 保留每个产品-年份组合的唯一观察值
.   bysort k t: keep if _n == 1
(11,297,429 observations deleted)

.   keep k t HHI_kt

. 
.   sum HHI_kt

    Variable |        Obs        Mean    Std. dev.       Min        Max
-------------+---------------------------------------------------------
      HHI_kt |     71,367    .1031326    .1129468   .0154476          1

.   rename t year 

.   rename k hs6

. 
.   save "$D/国际竞争hhi指标.dta", replace
file /Users/yangyanjun/Desktop/exchangerate/提交版本/derived data/国际竞争hhi指标.dta saved

. 
.   *------------------F3. 构建替代指标-----------------*
.   clear

.   use "$R/combined_data.dta", clear 

. 
.   label variable t "Year"

.   label variable k "Product code" 

.   label variable i "Exporter country code"

.   label variable j "Importer country code"

.   label variable v "Trade value"

.   label variable q "Trade quantity"

. 
.   gen hs4 = floor(k / 100)

. 
.   bysort j hs4 t: egen total_import_jkt = sum(v)

.   
.   by j hs4 t: keep if _n == 1
(121,351,046 observations deleted)

.   keep j hs4 t total_import_jkt

. 
.   bysort hs4 t: egen global_import_kt = sum(total_import_jkt)

. 
.   * 第3步：计算每个国家j在产品k全球市场中的进口份额
.   gen market_share_jkt = total_import_jkt / global_import_kt

.   
.   bysort hs4 t: egen sum_shares = sum(market_share_jkt)

.   summarize sum_shares

    Variable |        Obs        Mean    Std. dev.       Min        Max
-------------+---------------------------------------------------------
  sum_shares |  3,230,151           1    2.24e-08   .9999999          1

. 
.   * 第4步：计算HHI指数(按产品k和年份t)
.   gen squared_share = market_share_jkt^2

.   bysort hs4 t: egen HHI_kt = sum(squared_share)

. 
.   * 保留每个产品-年份组合的唯一观察值
.   bysort hs4 t: keep if _n == 1
(3,212,783 observations deleted)

.   keep hs4 t HHI_kt

.   sum HHI_kt    

    Variable |        Obs        Mean    Std. dev.       Min        Max
-------------+---------------------------------------------------------
      HHI_kt |     17,368    .0818603    .0694909   .0152643          1

.   rename t year 

. 
.   save "$D/国际竞争hhi指标_hs4.dta", replace
file /Users/yangyanjun/Desktop/exchangerate/提交版本/derived data/国际竞争hhi指标_hs4.dta saved

. 
. 
. **#G.附录III 稳健性检验数据清洗      
. 
. /*------------------------------------------------------------------------------
>  * 附录III  表III 1 第（1）列数据清洗：计算行业平均的hhi指数
>  *------------------------------------------------------------------------------*/
. 
.   use "$W/赫芬达尔指数_hs4.dta", clear 

.   collapse (mean) hhi_hs4, by(hs4)

.   gen hhi_hs4_avg = hhi_hs4 / 10000

.   drop hhi_hs4

.   destring hs4 ,replace
hs4: all characters numeric; replaced as int

.   save "$W/赫芬达尔指数_hs4_平均.dta", replace
file /Users/yangyanjun/Desktop/exchangerate/提交版本/working data/赫芬达尔指数_hs4_平均.dta saved

. 
. 
. /*------------------------------------------------------------------------------
>  * 附录III  表III 1 第（2）列数据清洗：使用6位HS编码计算
>  *------------------------------------------------------------------------------*/
. 
.   use "$D/离散度指标构建数据.dta", clear 

.   bys firm year hs6 origin_id: egen totalvalue6rmb=sum(valuermb)       

.   label var totalvalue6rmb "6位hs编码人民币计价进口额"

.   bys firm year hs6 origin_id: egen totalquantity6 =sum(quantity)

.   label var totalquantity6  "根据6位hs码加总的货物数量"

. 
.   duplicates drop firm year hs6 origin_id,force  

Duplicates in terms of firm year hs6 origin_id

(3,704,889 observations deleted)

. 
.   drop  value valuermb quantity exp_imp      

.   rename totalvalue6rmb value

.   rename totalquantity6  quantity

. 
.   gen       price = value / quantity             
(901,942 missing values generated)

.   label var price "以人民币计价的商品单价"

.   gen       ln_price = log(price)                
(901,942 missing values generated)

.   label var ln_price "产品价格对数"

.   gen       ln_ppi   = log(ppi)
(316,257 missing values generated)

.   label var ln_ppi   "ppi对数"

.   gen       ln_e     = log(e)
(90,846 missing values generated)

.   label var ln_e     "汇率对数"

. 
.   drop if   price  ==.                       
(901,942 observations deleted)

.   drop if   ppi    ==.
(304,232 observations deleted)

.   drop if   e      ==.
(773 observations deleted)

. 
.   drop ppi e value quantity price 

.   egen double firm_origin_id_hs6 = group(firm origin_id hs6)  

.   xtset  firm_origin_id_hs6 year

Panel variable: firm_origin_id_hs6 (unbalanced)
 Time variable: year, 2000 to 2013, but with gaps
         Delta: 1 unit

.   gen d_lnprice= d.ln_price
(18,363,258 missing values generated)

.   gen d_lnppi  = d.ln_ppi
(18,363,258 missing values generated)

.   gen d_lne    = d.ln_e
(18,363,258 missing values generated)

.   label var d_lnprice "对数价格的差分值(相邻两期)"

.   label var d_lnppi   "对数ppi的差分值(相邻两期)"

.   label var d_lne     "对数汇率的差分值(相邻两期)"

. 
.   drop ln_price ln_ppi ln_e firm_origin_id_hs6  

.   drop if d_lnprice ==.   
(18,363,258 observations deleted)

. 
.   merge m:1 year hs6 using "$W/赫芬达尔指数_hs6.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                         7,845
        from master                         0  (_merge==1)
        from using                      7,845  (_merge==2)

    Matched                        12,722,362  (_merge==3)
    -----------------------------------------

.   rename _merge _merge1

.   merge m:1 year hs4 using "$W/赫芬达尔指数_hs4.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                         9,570
        from master                     7,845  (_merge==1)
        from using                      1,725  (_merge==2)

    Matched                        12,722,362  (_merge==3)
    -----------------------------------------

.   drop if firm ==.
(9,570 observations deleted)

.   drop _merge _merge1

. 
.   egen  double firm_origin_id_hs6 = group(firm origin_id hs6)

.   xtset firm_origin_id_hs6 year

Panel variable: firm_origin_id_hs6 (unbalanced)
 Time variable: year, 2001 to 2013, but with gaps
         Delta: 1 unit

.   destring hs_id hs6 hs4 hs2,replace    
hs_id: all characters numeric; replaced as long
hs6: all characters numeric; replaced as long
hs4: all characters numeric; replaced as int
hs2: all characters numeric; replaced as byte

.   replace hhi_hs4=hhi_hs4/10000       //将其变为原倍数
(12,722,362 real changes made)

.   replace hhi_hs6=hhi_hs6/10000
(12,722,362 real changes made)

. 
.   save "$W/final_data_hs6.dta", replace 
file /Users/yangyanjun/Desktop/exchangerate/提交版本/working data/final_data_hs6.dta saved

. 
. 
. /*------------------------------------------------------------------------------
>  * 附录III  表III 1 第（3）列数据清洗：使用8位HS编码计算
>  *------------------------------------------------------------------------------*/
. 
.   *------------------hhi指标构建-----------------*
. 
.   use "$D/离散度指标构建数据.dta", clear

.   bys firm year: egen double totalvaluermb=sum(valuermb)    

.   label var totalvaluermb "企业每年人民币计价进口额"

.   duplicates drop year hs_id firm,force 

Duplicates in terms of year hs_id firm

(12,349,494 observations deleted)

.   hhi5 totalvaluermb,   by (hs_id year) percentage prefix(hhi5)  

.   collapse (mean) hhi5_totalvaluermb, by(year hs_id)

.   rename hhi5_totalvaluermb    hhi_hs8

.   label var hhi_hs8 "基于hs8进口额计算"

.   rename hs_id hs8

.   save "$W/赫芬达尔指数_hs8.dta", replace
file /Users/yangyanjun/Desktop/exchangerate/提交版本/working data/赫芬达尔指数_hs8.dta saved

. 
.   *------------------构建回归数据-----------------*
. 
.   use "$D/离散度指标构建数据.dta", clear 

.   rename hs_id hs8

.   bys firm year hs8 origin_id: egen totalvalue8rmb=sum(valuermb)       

.   label var totalvalue8rmb "6位hs编码人民币计价进口额"

.   bys firm year hs8 origin_id: egen totalquantity8 =sum(quantity)

.   label var totalquantity8  "根据6位hs码加总的货物数量"

. 
.   duplicates drop firm year hs8 origin_id,force                     

Duplicates in terms of firm year hs8 origin_id

(2,178,957 observations deleted)

. 
.   drop  value valuermb quantity exp_imp                             

.   rename totalvalue8rmb value

.   rename totalquantity8  quantity

. 
.   gen       price = value / quantity             
(965,751 missing values generated)

.   label var price "以人民币计价的商品单价"

.   gen       ln_price = log(price)                
(965,751 missing values generated)

.   label var ln_price "产品价格对数"

.   gen       ln_ppi   = log(ppi)
(326,978 missing values generated)

.   label var ln_ppi   "ppi对数"

.   gen       ln_e     = log(e)
(94,793 missing values generated)

.   label var ln_e     "汇率对数"

. 
.   drop if   price  ==.                 
(965,751 observations deleted)

.   drop if   ppi    ==.
(314,474 observations deleted)

.   drop if   e      ==.
(782 observations deleted)

. 
.   drop ppi e value quantity price       

.   egen double firm_origin_id_hs8 = group(firm origin_id hs8)    

.   xtset  firm_origin_id_hs8 year        

Panel variable: firm_origin_id_hs8 (unbalanced)
 Time variable: year, 2000 to 2013, but with gaps
         Delta: 1 unit

.   gen d_lnprice= d.ln_price
(19,730,060 missing values generated)

.   gen d_lnppi  = d.ln_ppi
(19,730,060 missing values generated)

.   gen d_lne    = d.ln_e
(19,730,060 missing values generated)

.   label var d_lnprice "对数价格的差分值(相邻两期)"

.   label var d_lnppi   "对数ppi的差分值(相邻两期)"

.   label var d_lne     "对数汇率的差分值(相邻两期)"

. 
.   drop ln_price ln_ppi ln_e firm_origin_id_hs8          

.   drop if d_lnprice ==.                                
(19,730,060 observations deleted)

. 
.   merge m:1 year hs8 using "$W/赫芬达尔指数_hs8.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                        17,004
        from master                         0  (_merge==1)
        from using                     17,004  (_merge==2)

    Matched                        12,807,432  (_merge==3)
    -----------------------------------------

.   drop if firm ==.
(17,004 observations deleted)

.   drop _merge 

. 
.   egen  double firm_origin_id_hs8 = group(firm origin_id hs8)

.   xtset firm_origin_id_hs8 year

Panel variable: firm_origin_id_hs8 (unbalanced)
 Time variable: year, 2001 to 2013, but with gaps
         Delta: 1 unit

.   destring hs8,replace  
hs8: all characters numeric; replaced as long

.   replace hhi_hs8 = hhi_hs8 / 10000
(12,807,432 real changes made)

. 
.   save "$W/final_data_hs8.dta", replace
file /Users/yangyanjun/Desktop/exchangerate/提交版本/working data/final_data_hs8.dta saved

. 
. 
. /*------------------------------------------------------------------------------
>  * 附录III  表III 1 第（4）列数据清洗:行业-出口来源国层面构建进口竞争度指标
>  *------------------------------------------------------------------------------*/
. 
.   clear

.   use "$D/离散度指标构建数据.dta", clear       

.   bys firm year: egen double totalvaluermb=sum(valuermb)    

.   label var totalvaluermb "企业每年人民币计价进口额"

.   duplicates drop year hs4 firm,force 

Duplicates in terms of year hs4 firm

(20,123,152 observations deleted)

. 
.   hhi5 totalvaluermb,   by (hs4 year origin_id) percentage prefix(hhi5)      

.   collapse (mean) hhi5_totalvaluermb, by(year hs4 origin_id)    

.   rename hhi5_totalvaluermb    hhi_hs4_c

.   label var hhi_hs4_c "基于hs4进口额计算行业-出口来源国层面的hhi"

.   destring hs4,replace
hs4: all characters numeric; replaced as int

.   save "$W/hhi_hs4_出口来源国层面.dta", replace
file /Users/yangyanjun/Desktop/exchangerate/提交版本/working data/hhi_hs4_出口来源国层面.dta saved

. 
. 
. /*------------------------------------------------------------------------------
>  * 附录III  表III 1 第（5）列数据清洗:几何平均价格
>  *------------------------------------------------------------------------------*/              
. 
.   clear

.   use "$D/离散度指标构建数据.dta", clear

. 
.   drop if 国家 == "中国"         
(0 observations deleted)

.   drop if origin_id == .        
(0 observations deleted)

.   drop if quantity  ==0 
(1,061,838 observations deleted)

. 
.   bys firm year hs4 origin_id: gen double n=_n

.   bys firm year hs4 origin_id: gen double N=_N

.   by  firm year hs4 origin_id: gen double totalvalue4rmb_2=valuermb if n==1
(10,570,096 missing values generated)

.   by  firm year hs4 origin_id: replace totalvalue4rmb_2=totalvalue4rmb_2[_n-1]*valuermb if n~=1
(10570081 real changes made)

.   replace totalvalue4rmb_2=totalvalue4rmb_2^(1/n)
(10,570,081 real changes made)

.   gen double value2 = valuermb if N==1
(16,631,065 missing values generated)

.   replace    value2 = totalvalue4rmb_2 if N==n
(6,060,967 real changes made)

.   label var  value2 "4位hs编码人民币计价进口额(几何平均)"

.   
.   by  firm year hs4 origin_id: gen double  totalquantity_2=quantity if n==1
(10,570,096 missing values generated)

.   by  firm year hs4 origin_id: replace totalquantity_2=totalquantity_2[_n-1]*quantity if n~=1
(10570096 real changes made)

.   replace totalquantity_2=totalquantity_2^(1/n)
(10,377,907 real changes made)

.   gen double quantity2 = quantity if N==1
(16,631,065 missing values generated)

.   replace    quantity2 = totalquantity_2 if N==n
(6,060,969 real changes made)

.   label var  quantity2 "4位hs码加总的货物数量（几何平均）"

. 
.   drop if value2 ==.      //可删除加总下的重复值
(10,570,098 observations deleted)

.   drop n N totalvalue4rmb_2 totalquantity_2 value valuermb quantity exp_imp

. 
.   gen       price2    = value2 / quantity2             

.   label var price2    "以人民币计价的商品单价（几何平均）"

.   gen       ln_price2 = log(price2)                    

.   label var ln_price2 "产品价格对数（几何平均）"

.   gen       ln_ppi    = log(ppi)
(250,360 missing values generated)

.   label var ln_ppi    "ppi对数"

.   gen       ln_e      = log(e)
(61,931 missing values generated)

.   label var ln_e      "汇率对数"

. 
.   egen double firm_origin_id_hs4 = group(firm origin_id hs4)

.   xtset  firm_origin_id_hs4 year        

Panel variable: firm_origin_id_hs4 (unbalanced)
 Time variable: year, 2000 to 2013, but with gaps
         Delta: 1 unit

.   gen d_lnprice2= d.ln_price2
(13,484,825 missing values generated)

.   gen d_lnppi  = d.ln_ppi
(13,568,563 missing values generated)

.   gen d_lne    = d.ln_e
(13,502,202 missing values generated)

. 
.   label var d_lnprice2 "对数价格的差分值(相邻两期几何平均)"

.   label var d_lnppi    "对数ppi的差分值(相邻两期)"

.   label var d_lne      "对数汇率的差分值(相邻两期)" 

.   drop  ln_price2 ln_ppi ln_e firm_origin_id_hs4

. 
.   merge m:1 year hs6 using "$W/赫芬达尔指数_hs6.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                           307
        from master                         0  (_merge==1)
        from using                        307  (_merge==2)

    Matched                        24,365,520  (_merge==3)
    -----------------------------------------

.   rename _merge _merge1

.   merge m:1 year hs4 using "$W/赫芬达尔指数_hs4.dta"

    Result                      Number of obs
    -----------------------------------------
    Not matched                           314
        from master                       307  (_merge==1)
        from using                          7  (_merge==2)

    Matched                        24,365,520  (_merge==3)
    -----------------------------------------

.   drop if firm ==.
(314 observations deleted)

.   drop _merge _merge1

. 
.   egen  double firm_origin_id_hs4 = group(firm origin_id hs4)

.   xtset firm_origin_id_hs4 year

Panel variable: firm_origin_id_hs4 (unbalanced)
 Time variable: year, 2000 to 2013, but with gaps
         Delta: 1 unit

.   destring hs_id hs6 hs4 hs2,replace
hs_id: all characters numeric; replaced as long
hs6: all characters numeric; replaced as long
hs4: all characters numeric; replaced as int
hs2: all characters numeric; replaced as byte

.   replace hhi_hs4=hhi_hs4/10000
(24,365,520 real changes made)

.   replace hhi_hs6=hhi_hs6/10000
(24,365,520 real changes made)

. 
.   save "$W/final_data_几何平均价格.dta", replace
file /Users/yangyanjun/Desktop/exchangerate/提交版本/working data/final_data_几何平均价格.dta saved

. 
. 
.   display "数据清理模块完成"
数据清理模块完成

.   
. 
. 
. 
. 
. 
. 
. 
. 
. 
. 
. 
. 
end of do-file

.   log close
      name:  <unnamed>
       log:  /Users/yangyanjun/Desktop/exchangerate/提交版本/logs/data_clean_log.txt
  log type:  text
 closed on:  26 Jul 2025, 15:49:58
-----------------------------------------------------------------------------------------------------
